def testNestedParallelInputterShareParameters(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_config = {
        "1_1_vocabulary": vocab_file,
        "1_2_vocabulary": vocab_file,
        "2_vocabulary": vocab_file,
    }
    source_inputters = [
        text_inputter.WordEmbedder(embedding_size=10),
        text_inputter.WordEmbedder(embedding_size=10),
    ]
    target_inputter = text_inputter.WordEmbedder(embedding_size=10)
    inputters = [
        inputter.ParallelInputter(source_inputters, share_parameters=True),
        target_inputter,
    ]
    parallel_inputter = inputter.ParallelInputter(inputters, share_parameters=True)
    parallel_inputter.initialize(data_config)
    parallel_inputter.build(None)
    self.assertEqual(source_inputters[0].embedding.ref(), target_inputter.embedding.ref())
    self.assertEqual(source_inputters[1].embedding.ref(), target_inputter.embedding.ref())
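# Several tests in this file rely on a `_makeTextFile` helper that is not part of
# this excerpt. A minimal sketch of what such a helper could look like, assuming it
# writes one element per line into the temporary test directory and returns the
# resulting path (the actual test utility may differ):
def _makeTextFile(self, name, lines):
    path = os.path.join(self.get_temp_dir(), name)
    with open(path, "w") as f:
        for line in lines:
            f.write("%s\n" % line)
    return path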
def testNestedInputtersWithFlatDataFiles(self):
    inputters = inputter.ParallelInputter(
        [
            record_inputter.SequenceRecordInputter(10),
            record_inputter.SequenceRecordInputter(10),
        ],
        reducer=reducer.SumReducer(),
    )
    inputters = inputter.ParallelInputter(
        [
            record_inputter.SequenceRecordInputter(10),
            inputters,
        ],
        reducer=reducer.ConcatReducer(),
    )
    self.assertListEqual(inputters._structure(), [None, [None, None]])
    empty_file = os.path.join(self.get_temp_dir(), "test.txt")
    with open(empty_file, "w"):
        pass
    # The nested structure flattens to 3 leaf inputters, so 3 flat data files are expected:
    # passing only 2 should raise, passing 3 should succeed.
    with self.assertRaises(ValueError):
        inputters.make_inference_dataset([empty_file, empty_file], batch_size=2)
    inputters.make_inference_dataset([empty_file, empty_file, empty_file], batch_size=2)
def testNestedParallelInputterShareParameters(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    metadata = {"vocabulary_file": vocab_file}
    source_inputters = [
        text_inputter.WordEmbedder("vocabulary_file", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file", embedding_size=10),
    ]
    target_inputter = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
    inputters = [
        inputter.ParallelInputter(source_inputters, share_parameters=True),
        target_inputter,
    ]
    parallel_inputter = inputter.ParallelInputter(inputters, share_parameters=True)
    parallel_inputter.initialize(metadata)
    parallel_inputter.build()
    self.assertEqual(source_inputters[0].embedding, target_inputter.embedding)
    self.assertEqual(source_inputters[1].embedding, target_inputter.embedding)
def testBatchAutotuneDatasetMultiSource(self):
    vocab_file = self._makeTextFile("vocab.txt", ["1", "2", "3", "4"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])
    source_inputter = inputter.ParallelInputter(
        [
            text_inputter.WordEmbedder(embedding_size=10),
            text_inputter.WordEmbedder(embedding_size=10),
        ]
    )
    target_inputter = text_inputter.WordEmbedder(embedding_size=10)
    target_inputter.set_decoder_mode(mark_start=True, mark_end=True)
    example_inputter = inputter.ExampleInputter(source_inputter, target_inputter)
    example_inputter.initialize(
        {
            "source_1_vocabulary": vocab_file,
            "source_2_vocabulary": vocab_file,
            "target_vocabulary": vocab_file,
        }
    )
    dataset = example_inputter.make_training_dataset(
        [data_file, data_file],
        data_file,
        batch_size=1024,
        batch_type="tokens",
        maximum_features_length=[100, 110],
        maximum_labels_length=120,
        batch_autotune_mode=True,
    )
    source, target = next(iter(dataset))
    # In batch autotune mode the batch has a static shape: sequences are padded to the
    # configured maximum lengths and the example count comes from the token budget
    # (1024 tokens // 120 maximum labels length = 8 examples).
    self.assertListEqual(source["inputter_0_ids"].shape.as_list(), [8, 100])
    self.assertListEqual(source["inputter_1_ids"].shape.as_list(), [8, 110])
    self.assertListEqual(target["ids"].shape.as_list(), [8, 120])
    self.assertListEqual(target["ids_out"].shape.as_list(), [8, 120])
def testParallelInputter(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])
    data_files = [data_file, data_file]
    parallel_inputter = inputter.ParallelInputter(
        [
            text_inputter.WordEmbedder(embedding_size=10),
            text_inputter.WordEmbedder(embedding_size=5),
        ]
    )
    self.assertEqual(parallel_inputter.num_outputs, 2)
    features, transformed = self._makeDataset(
        parallel_inputter,
        data_files,
        data_config={"1_vocabulary": vocab_file, "2_vocabulary": vocab_file},
        shapes={
            "inputter_0_ids": [None, None],
            "inputter_0_length": [None],
            "inputter_1_ids": [None, None],
            "inputter_1_length": [None],
        },
    )
    self.assertEqual(2, len(parallel_inputter.get_length(features)))
    self.assertEqual(2, len(transformed))
    self.assertAllEqual([1, 3, 10], transformed[0].shape)
    self.assertAllEqual([1, 3, 5], transformed[1].shape)
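# `_makeDataset` is another test helper this excerpt does not include. A minimal
# sketch for the eager-mode call style used above (keyword names and checks are
# assumptions, not the actual utility): initialize the inputter from `data_config`,
# build an inference dataset over `data_files`, optionally compare the dataset
# element shapes against `shapes`, and return the first features batch together
# with the inputter's output on that batch. The older tests below that pass
# `metadata=` and run inside a TF1 session would need a graph-mode variant.
def _makeDataset(self, inputter, data_files, data_config=None, shapes=None):
    if data_config is not None:
        inputter.initialize(data_config)
    dataset = inputter.make_inference_dataset(data_files, batch_size=1)
    if shapes is not None:
        for name, shape in shapes.items():
            self.assertListEqual(dataset.element_spec[name].shape.as_list(), shape)
    features = next(iter(dataset))
    transformed = inputter(features)
    return features, transformed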
def testParallelInputter(self):
    # Write the test vocabulary and data files to the temporary directory.
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")
    with open(vocab_file, "w") as vocab:
        vocab.write("the\n" "world\n" "hello\n" "toto\n")
    with open(data_file, "w") as data:
        data.write("hello world !\n")
    parallel_inputter = inputter.ParallelInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5),
    ])
    data, transformed = _first_element(
        parallel_inputter,
        [data_file, data_file],
        {"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file},
    )
    self.assertEqual(2, len(parallel_inputter.get_length(data)))
    input_receiver = parallel_inputter.get_serving_input_receiver()
    self.assertIn("inputter_0_ids", input_receiver.features)
    self.assertIn("inputter_1_ids", input_receiver.features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        data, transformed = sess.run([data, transformed])
        self.assertIn("inputter_0_ids", data)
        self.assertIn("inputter_1_ids", data)
        self.assertEqual(2, len(transformed))
        self.assertAllEqual([1, 3, 10], transformed[0].shape)
        self.assertAllEqual([1, 3, 5], transformed[1].shape)
def testParallelInputter(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])
    data_files = [data_file, data_file]
    parallel_inputter = inputter.ParallelInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5),
    ])
    self.assertEqual(parallel_inputter.num_outputs, 2)
    features, transformed = self._makeDataset(
        parallel_inputter,
        data_files,
        metadata={"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file},
        shapes={
            "inputter_0_ids": [None, None],
            "inputter_0_length": [None],
            "inputter_1_ids": [None, None],
            "inputter_1_length": [None],
        },
    )
    self.assertEqual(2, len(parallel_inputter.get_length(features)))
    self.assertNotIn("inputter_0_raw", features)
    self.assertNotIn("inputter_1_raw", features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        self.assertEqual(2, len(transformed))
        self.assertAllEqual([1, 3, 10], transformed[0].shape)
        self.assertAllEqual([1, 3, 5], transformed[1].shape)
def testParallelInputterShareParameters(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_config = {"1_vocabulary": vocab_file, "2_vocabulary": vocab_file}
    inputters = [
        text_inputter.WordEmbedder(embedding_size=10),
        text_inputter.WordEmbedder(embedding_size=10),
    ]
    parallel_inputter = inputter.ParallelInputter(inputters, share_parameters=True)
    parallel_inputter.initialize(data_config)
    parallel_inputter.build(None)
    self.assertEqual(
        inputters[0].embedding.experimental_ref(), inputters[1].embedding.experimental_ref()
    )
def testParallelInputter(self):
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")
    with open(vocab_file, "w") as vocab:
        vocab.write("the\n" "world\n" "hello\n" "toto\n")
    with open(data_file, "w") as data:
        data.write("hello world !\n")
    data_files = [data_file, data_file]
    parallel_inputter = inputter.ParallelInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5),
    ])
    features, transformed = self._makeDataset(
        parallel_inputter,
        data_files,
        metadata={"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file},
        shapes={
            "inputter_0_ids": [None, None],
            "inputter_0_length": [None],
            "inputter_1_ids": [None, None],
            "inputter_1_length": [None],
        },
    )
    self.assertEqual(2, len(parallel_inputter.get_length(features)))
    self.assertNotIn("inputter_0_raw", features)
    self.assertNotIn("inputter_0_tokens", features)
    self.assertNotIn("inputter_1_raw", features)
    self.assertNotIn("inputter_1_tokens", features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        self.assertEqual(2, len(transformed))
        self.assertAllEqual([1, 3, 10], transformed[0].shape)
        self.assertAllEqual([1, 3, 5], transformed[1].shape)
def testParallelInputterSplitFeatures(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])
    source_embedder = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    target_embedder = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    target_embedder.is_target = True
    parallel_inputter = inputter.ParallelInputter(
        [source_embedder, target_embedder], combine_features=False
    )
    self.assertEqual(parallel_inputter.num_outputs, 2)
    features, transformed = self._makeDataset(
        parallel_inputter,
        [data_file, data_file],
        metadata={"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file},
    )
    self.assertIsInstance(features, tuple)
    self.assertEqual(len(features), 2)
    self.assertEqual(len(transformed), 2)
    features, labels = features
    for field in ("ids", "length", "tokens"):
        self.assertIn(field, features)
    for field in ("ids", "ids_out", "length", "tokens"):
        self.assertIn(field, labels)