def input_fn():
     """Build the endlessly repeated, padded-batched dataset fed by `input_generator`.

     The pipeline itself is created with ``batch_size=None``; batching is
     applied afterwards through ``padded_batch`` with a batch size of 3.
     """
     pipeline = LmInputDataPipeline(MockVocab(), batch_size=None)
     raw_tokens = tf.data.Dataset.from_generator(input_generator,
                                                 output_types=tf.string)
     repeated_examples = pipeline.load_data(raw_tokens).repeat()
     return pipeline.padded_batch(repeated_examples, 3)
 def model_fn(features, labels, mode, params):
     """Run the autoregressor model function built on a fresh `MockVocab` pipeline.

     All four arguments are forwarded unchanged to the function returned by
     `get_autoregressor_model_fn`, and its result is returned as is.
     """
     pipeline = LmInputDataPipeline(MockVocab())
     autoregressor_fn = get_autoregressor_model_fn(
         vocab_size, pipeline.get_id_to_embedding_mapping())
     return autoregressor_fn(features, labels, mode, params)
def test_load_no_batching():
    """Check that unbatched `load_data` output matches hand-written examples.

    Two token sequences are pushed through an `LmInputDataPipeline` built on
    `MockVocab` with ``batch_size=None``; every produced (features, labels)
    pair is compared with the expected "inputs", "length" and "targets".
    """
    def input_generator():
        yield ["a", "b", "c"]
        yield ["c", "b"]

    # Expected embedding rows. The leading row comes before the token rows in
    # every example (presumably the start-of-sequence marker - TODO confirm).
    leading_row = [0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
    row_a = [0.0, 0.0, 0.0, 1.5, 2.5, 3.5]
    row_b = [0.0, 0.0, 0.0, 4.5, 5.5, 6.5]
    row_c = [0.0, 0.0, 0.0, 7.5, 8.5, 9.5]

    def expected_example(rows, length, targets):
        # Mirrors the (features, labels) structure yielded by the pipeline.
        features = {
            "inputs": np.array(rows, dtype=np.float32),
            "length": np.array(length, dtype=np.int32),
        }
        labels = {"targets": np.array(targets, dtype=np.int32)}
        return features, labels

    expected_output = [
        expected_example([leading_row, row_a, row_b, row_c], 4, [4, 5, 6, 2]),
        expected_example([leading_row, row_c, row_b], 3, [6, 5, 2]),
    ]

    token_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)
    pipeline = LmInputDataPipeline(MockVocab(), batch_size=None)
    iterator = pipeline.load_data(token_dataset).make_initializable_iterator()
    next_example = iterator.get_next()

    with tf.Session() as sess:
        # Lookup tables must be initialized before the iterator is consumed.
        sess.run(tf.tables_initializer())
        sess.run(iterator.initializer)
        for expected_features, expected_labels in expected_output:
            actual_features, actual_labels = sess.run(next_example)
            assert actual_features["inputs"] == approx(
                expected_features["inputs"])
            assert actual_features["length"] == approx(
                expected_features["length"])
            assert actual_labels["targets"] == approx(
                expected_labels["targets"])
 def create_input():
     """Return the repeated, shuffled, padded-batched TRAIN dataset read from `data_dir`."""
     batching_pipeline = LmInputDataPipeline(glove, 5)
     # A second pipeline (batch size None) is built only to query the
     # embedding vector size needed for reading the stored examples.
     size_probe = LmInputDataPipeline(glove, None)
     vector_size = size_probe._vocab_generalized.vector_size()
     examples = read_dataset_from_dir(data_dir, DatasetType.TRAIN,
                                      vector_size)
     endless_shuffled = examples.repeat().shuffle(1000, seed=0)
     return batching_pipeline.padded_batch(endless_shuffled, BATCH_SIZE)
 def create_input():
     """Return the padded-batched `subset` dataset, optionally truncated.

     When `take_first_n` is not None only that many leading examples are kept
     before batching.
     """
     batching_pipeline = LmInputDataPipeline(glove, 5)
     # A second pipeline (batch size None) is built only to query the
     # embedding vector size needed for reading the stored examples.
     size_probe = LmInputDataPipeline(glove, None)
     vector_size = size_probe._vocab_generalized.vector_size()
     examples = read_dataset_from_dir(data_dir, subset, vector_size)
     limited = (examples
                if take_first_n is None else examples.take(take_first_n))
     return batching_pipeline.padded_batch(limited, BATCH_SIZE)
def disambiguation_preprocessing(inputs):
    """Translate "^"-separated meaning alternatives into generalized ids.

    Args:
        inputs: sequence of sentences; each sentence is a sequence of
            strings, and each string holds one or more meanings separated
            by "^". It is iterated twice, so it must be re-iterable.

    Returns:
        A list with one entry per sentence; each entry is a list with one
        entry per word, holding the generalized ids of that word's meanings
        in their original order.
    """
    glove = Glove300()
    input_pipe = LmInputDataPipeline(glove)
    words_placeholder = tf.placeholder(dtype=tf.string)
    vocab_ids_op = glove.word_to_id_op()(words_placeholder)
    to_generalized_id = (
        input_pipe._vocab_generalized.vocab_id_to_generalized_id())
    generalized_ids_op = to_generalized_id(vocab_ids_op)

    # Deduplicate so every distinct meaning is looked up exactly once.
    unique_meanings = list({
        meaning
        for sentence in inputs
        for word in sentence
        for meaning in word.split("^")
    })

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        glove.after_session_created_hook_fn(sess)
        looked_up_ids = sess.run(generalized_ids_op,
                                 feed_dict={words_placeholder: unique_meanings})

    id_by_meaning = dict(zip(unique_meanings, looked_up_ids))

    return [[[id_by_meaning[meaning] for meaning in word.split("^")]
             for word in sentence]
            for sentence in inputs]
    def model_function(features, labels, mode, params):
        """Estimator model function wrapping the autoregressor model.

        Builds the autoregressor model on the CPU device, and in PREDICT mode
        additionally maps the predicted id paths back to words, stored under
        ``predictions["predicted_words"]``.

        Args:
            features: forwarded to the autoregressor model function.
            labels: forwarded to the autoregressor model function.
            mode: a `tf.estimator.ModeKeys` value.
            params: forwarded to the autoregressor model function.

        Returns:
            A `tf.estimator.EstimatorSpec` copying mode, loss, train_op,
            eval_metric_ops and predictions from the wrapped model and
            carrying the hooks assembled here as `training_hooks`.
        """
        input_pipe = LmInputDataPipeline(glove)
        vocab_size = glove.vocab_size()
        embedding_size = input_pipe._vocab_generalized.vector_size()

        def zero_embeddings(x):
            # Zeros shaped like `x` with one extra trailing embedding axis.
            # The shape has to be a single 1-D tensor: passing the tuple
            # (tf.shape(x), embedding_size) would try to pack a vector
            # together with a scalar and fail.
            zeros_shape = tf.concat([tf.shape(x), [embedding_size]], axis=0)
            return tf.zeros(zeros_shape, tf.float32)

        # The real id -> embedding mapping is only requested in PREDICT mode;
        # other modes get the all-zeros stand-in.
        if mode == tf.estimator.ModeKeys.PREDICT:
            id_to_embeding_fn = input_pipe.get_id_to_embedding_mapping()
        else:
            id_to_embeding_fn = zero_embeddings

        # The model is pinned to the CPU device here.
        with tf.device("/device:CPU:0"):
            concrete_model_fn = get_autoregressor_model_fn(
                vocab_size,
                id_to_embeding_fn,
                time_major_optimization=True,
                predict_as_pure_lm=False,
                mask_allowables=input_sentence,
                hparams=hparams)
            estimator_spec = concrete_model_fn(features, labels, mode, params)
        training_hooks = []

        predictions = estimator_spec.predictions
        if mode == tf.estimator.ModeKeys.PREDICT:
            # NOTE(review): the hook is registered under `training_hooks`;
            # EstimatorSpec also has a separate `prediction_hooks` argument -
            # confirm this hook actually runs during prediction.
            training_hooks.append(InitializeVocabularyHook(glove))

            predicted_ids = tf.cast(predictions["paths"], dtype=tf.int64)
            words_shape = tf.shape(predicted_ids)
            to_vocab_id = (
                input_pipe._vocab_generalized.generalized_id_to_vocab_id())
            to_word = glove.id_to_word_op()
            # Flatten for the id -> word lookup, then restore the path shape.
            predicted_ids = tf.reshape(predicted_ids, shape=[-1])
            predicted_words = to_word(to_vocab_id(predicted_ids))
            predicted_words = tf.reshape(predicted_words, shape=words_shape)
            # `predictions` is the same dict object as
            # `estimator_spec.predictions`, so the new key ends up in the
            # returned spec.
            predictions["predicted_words"] = predicted_words
        if hparams.profiler:
            training_hooks.append(
                tf.train.ProfilerHook(output_dir=model_dir,
                                      save_secs=30,
                                      show_memory=True))
            training_hooks.append(FullLogHook())
        estimator_spec_with_hooks = tf.estimator.EstimatorSpec(
            mode=estimator_spec.mode,
            loss=estimator_spec.loss,
            train_op=estimator_spec.train_op,
            eval_metric_ops=estimator_spec.eval_metric_ops,
            predictions=estimator_spec.predictions,
            training_hooks=training_hooks)
        return estimator_spec_with_hooks
 def model_function(features, labels, mode, params):
     """Wrap the autoregressor model so that the vocabulary hook is attached.

     The model is built under `tf.device(device_assignment_function)`; the
     returned `EstimatorSpec` copies mode, loss, train_op, eval_metric_ops
     and predictions from it and adds `InitializeVocabularyHook(glove)` as
     its only training hook.
     """
     pipeline = LmInputDataPipeline(glove)
     n_vocab = glove.vocab_size()
     # Created outside the device scope, exactly like the model inputs.
     embedding_lookup = pipeline.get_id_to_embedding_mapping()
     with tf.device(device_assignment_function):
         build_spec = get_autoregressor_model_fn(n_vocab, embedding_lookup)
         base_spec = build_spec(features, labels, mode, params)
     return tf.estimator.EstimatorSpec(
         mode=base_spec.mode,
         loss=base_spec.loss,
         train_op=base_spec.train_op,
         eval_metric_ops=base_spec.eval_metric_ops,
         predictions=base_spec.predictions,
         training_hooks=[InitializeVocabularyHook(glove)])
 def model_function(features, labels, mode, params):
     """Estimator model function wrapping the autoregressor model.

     Optionally places the model with `device_assignment_function`, writes the
     target words as a text summary and attaches profiling hooks, all
     controlled by `hparams`.

     Args:
         features: forwarded to the autoregressor model function.
         labels: forwarded to the autoregressor model function; its
             "targets" entry is read for the optional text summary.
         mode: a `tf.estimator.ModeKeys` value.
         params: forwarded to the autoregressor model function.

     Returns:
         A `tf.estimator.EstimatorSpec` copying mode, loss, train_op,
         eval_metric_ops and predictions from the wrapped model and carrying
         the hooks assembled here as `training_hooks`.
     """
     input_pipe = LmInputDataPipeline(glove)
     vocab_size = glove.vocab_size()
     embedding_size = input_pipe._vocab_generalized.vector_size()

     def zero_embeddings(x):
         # Zeros shaped like `x` with one extra trailing embedding axis.
         # The shape has to be a single 1-D tensor: passing the tuple
         # (tf.shape(x), embedding_size) would try to pack a vector together
         # with a scalar and fail.
         zeros_shape = tf.concat([tf.shape(x), [embedding_size]], axis=0)
         return tf.zeros(zeros_shape, tf.float32)

     # The real id -> embedding mapping is only requested in PREDICT mode;
     # other modes get the all-zeros stand-in.
     if mode == tf.estimator.ModeKeys.PREDICT:
         id_to_embeding_fn = input_pipe.get_id_to_embedding_mapping()
     else:
         id_to_embeding_fn = zero_embeddings

     # `without` is used as the context manager when size-based device
     # assignment is disabled.
     device_scope = (tf.device(device_assignment_function)
                     if hparams.size_based_device_assignment else without)
     with device_scope:
         concrete_model_fn = get_autoregressor_model_fn(
             vocab_size,
             id_to_embeding_fn,
             time_major_optimization=True,
             hparams=hparams)
         estimator_spec = concrete_model_fn(features, labels, mode, params)
     # Labels may be None (e.g. in PREDICT mode); skip the summary then
     # instead of failing on labels["targets"].
     if hparams.write_target_text_to_summary and labels is not None:
         words_shape = tf.shape(labels["targets"])
         to_vocab_id = (
             input_pipe._vocab_generalized.generalized_id_to_vocab_id())
         to_word = glove.id_to_word_op()
         # Flatten for the id -> word lookup, then restore the target shape.
         flat_targets = tf.reshape(labels["targets"], shape=[-1])
         flat_targets_words = to_word(to_vocab_id(flat_targets))
         targets_words = tf.reshape(flat_targets_words, shape=words_shape)
         tf.summary.text("targets_words", targets_words)
     training_hooks = []
     if mode == tf.estimator.ModeKeys.PREDICT:
         # NOTE(review): the hook is registered under `training_hooks`;
         # EstimatorSpec also has a separate `prediction_hooks` argument -
         # confirm this hook actually runs during prediction.
         training_hooks.append(InitializeVocabularyHook(glove))
     if hparams.profiler:
         training_hooks.append(
             tf.train.ProfilerHook(output_dir=model_dir,
                                   save_secs=30,
                                   show_memory=True))
         training_hooks.append(FullLogHook())
     estimator_spec_with_hooks = tf.estimator.EstimatorSpec(
         mode=estimator_spec.mode,
         loss=estimator_spec.loss,
         train_op=estimator_spec.train_op,
         eval_metric_ops=estimator_spec.eval_metric_ops,
         predictions=estimator_spec.predictions,
         training_hooks=training_hooks)
     return estimator_spec_with_hooks
 def data_gen():
     """Yield one (features, labels) pair seeded with the start-of-sequence id.

     "inputs" is a 1x1 int32 array holding the START_OF_SEQUENCE special unit
     id, "length" is ``len(input_sentence)`` and the labels are
     ``np.array([0])``.
     """
     generalized_vocab = LmInputDataPipeline(glove, None)._vocab_generalized
     start_id = generalized_vocab.get_special_unit_id(
         SpecialUnit.START_OF_SEQUENCE)
     features = {
         "inputs": np.array([[start_id]], dtype=np.int32),
         "length": len(input_sentence),
     }
     yield (features, np.array([0]))
 def input_fn():
     """Return the endlessly repeated dataset loaded by a batch-size-3 pipeline."""
     pipeline = LmInputDataPipeline(MockVocab(), batch_size=3)
     raw_tokens = tf.data.Dataset.from_generator(input_generator,
                                                 output_types=tf.string)
     loaded = pipeline.load_data(raw_tokens)
     return loaded.repeat()
 def create_input():
     """Load the repeated, shuffled TRAIN tokens of the simple-examples corpus.

     The tokens are passed through an `LmInputDataPipeline` created with a
     second argument of 8.
     """
     corpus = SimpleExamplesCorpus()
     train_tokens = corpus.get_tokens_dataset(DatasetType.TRAIN)
     endless_shuffled = train_tokens.repeat().shuffle(1000, seed=0)
     pipeline = LmInputDataPipeline(glove, 8)
     return pipeline.load_data(endless_shuffled)
 def create_input():
     """Load the TRAIN tokens of the simple-examples corpus.

     The tokens are passed once (no repeat, no shuffle) through an
     `LmInputDataPipeline` created with a second argument of None.
     """
     train_tokens = SimpleExamplesCorpus().get_tokens_dataset(
         DatasetType.TRAIN)
     pipeline = LmInputDataPipeline(glove, None)
     return pipeline.load_data(train_tokens)
def prepare_training_dataset(ouput_path):
    """Serialize language-model training examples to TFRecord files.

    The input corpus is transformed into language model training examples
    with embedding vectors as inputs and saved to disk, split into files of
    at most 2000 examples each. Examples longer than 40 are dropped.
    Expect a HUGE dataset in terms of occupied space.

    When the module-level `TEST_SERIALIZATION` flag is set, every written
    example is kept in memory and the files are read back afterwards to
    check that the stored data matches what was produced.

    Args:
        ouput_path: directory (anything accepted by `pathlib.Path`) that
            receives the ``train.<index>.tfrecords`` files.
    """
    if TEST_SERIALIZATION:
        test_examples = []
    ouput_path = Path(ouput_path)
    glove = Glove300()

    def create_input():
        simple_examples = SimpleExamplesCorpus()
        train_data = simple_examples.get_tokens_dataset(DatasetType.TRAIN)
        input_pipe = LmInputDataPipeline(glove, None)
        return input_pipe.load_data(train_data)

    dataset = create_input()

    def make_tf_record_example(features, labels) -> tf.train.Example:
        # "inputs" is flattened to one float list; the reader needs the
        # embedding size to restore its shape.
        feature_inputs = tf.train.Feature(float_list=tf.train.FloatList(
            value=features["inputs"].reshape(-1)))
        feature_length = tf.train.Feature(int64_list=tf.train.Int64List(
            value=[features["length"]]))
        feature_targets = tf.train.Feature(int64_list=tf.train.Int64List(
            value=labels["targets"]))
        feature_dict = {
            "inputs": feature_inputs,
            "length": feature_length,
            "targets": feature_targets
        }
        features = tf.train.Features(feature=feature_dict)
        example = tf.train.Example(features=features)
        return example

    def max_length_condition(max_length):
        def check_length(features, labels):
            return tf.less_equal(features["length"], max_length)

        return check_length

    dataset = dataset.filter(max_length_condition(40))

    it = dataset.make_initializable_iterator()
    next_example = it.get_next()

    EXAMPLES_PER_FILE = 2000

    # Every file opened for writing, in order, so the self-check below can
    # read back all examples rather than only the last file.
    written_filenames = []

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        glove.initialize_embeddings_in_graph(tf.get_default_graph(), sess)
        sess.run(it.initializer)
        for i in count(1):
            dataset_filename = str(ouput_path /
                                   "train.{:0=10}.tfrecords".format(i))
            written_filenames.append(dataset_filename)
            writer = tf.python_io.TFRecordWriter(dataset_filename)
            try:
                for _ in range(EXAMPLES_PER_FILE):
                    features, labels = sess.run(next_example)
                    if TEST_SERIALIZATION:
                        test_examples.append((features, labels))
                    example = make_tf_record_example(features, labels)
                    writer.write(example.SerializeToString())
            except tf.errors.OutOfRangeError:
                # Input exhausted: the current (possibly partial or empty)
                # file is the last one.
                break
            finally:
                # Runs on the `break` path too, so the last file is always
                # closed.
                writer.close()

    if TEST_SERIALIZATION:
        embedding_size = LmInputDataPipeline(
            glove, None)._vocab_generalized.vector_size()
        # Assumes read_dataset_from_files yields records in file order,
        # then in record order - TODO confirm.
        records_dataset = read_dataset_from_files(
            written_filenames, embedding_size=embedding_size)
        it = records_dataset.make_initializable_iterator()
        next_record = it.get_next()
        with tf.Session() as sess:
            sess.run(it.initializer)
            for expected_features, expected_labels in test_examples:
                actual_features, actual_labels = sess.run(next_record)
                assert (actual_features["inputs"] ==
                        expected_features["inputs"]).all()
                assert (actual_features["length"] ==
                        expected_features["length"]).all()
                assert actual_labels["targets"] == approx(
                    expected_labels["targets"])