def testWordEmbedderWithNoise(self):
  """A target WordEmbedder with non in-place noise should expose the clean
  features and their "noisy_"-prefixed counterparts side by side."""
  vocabulary_path = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
  dataset_path = self._makeTextFile("data.txt", ["hello world !"])
  word_noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
  embedder = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
  embedder.is_target = True
  embedder.set_noise(word_noiser, in_place=False)
  # Every original feature is expected to have a noisy sibling.
  shapes = {}
  for prefix in ("", "noisy_"):
    shapes[prefix + "tokens"] = [None, None]
    shapes[prefix + "ids"] = [None, None]
    shapes[prefix + "ids_out"] = [None, None]
    shapes[prefix + "length"] = [None]
  features, transformed = self._makeDataset(
      embedder,
      dataset_path,
      metadata={"vocabulary_file": vocabulary_path},
      shapes=shapes)
  # WordOmission(1) removes exactly one token from the noisy copy.
  self.assertEqual(features["noisy_length"][0], features["length"][0] - 1)
def initialize(self, metadata, params=None):
  """Initializes the model and, when contrastive learning is enabled in
  ``params``, attaches a word-omission noiser to the labels inputter."""
  super(SequenceToSequence, self).initialize(metadata, params=params)
  if not params or not params.get("contrastive_learning"):
    return
  subword_token = params.get("decoding_subword_token", "■")
  # Use the simplest and most effective CL_one from the paper:
  # https://www.aclweb.org/anthology/P19-1623
  word_noiser = noise.WordNoiser(
      noises=[noise.WordOmission(1)],
      subword_token=subword_token,
      is_spacer=subword_token == "▁")
  self.labels_inputter.set_noise(word_noiser, in_place=False)
def testWordNoising(self):
  """Noising a padded batch should leave the tensor shape unchanged."""
  words = tf.constant(
      [["a■", "b", "c■", "d", "■e"],
       ["a", "b", "c", "", ""]])
  lengths = tf.constant([5, 3])
  word_noiser = noise.WordNoiser()
  for module in (noise.WordDropout(0.1),
                 noise.WordReplacement(0.1),
                 noise.WordPermutation(3)):
    word_noiser.add(module)
  noisy_words, noisy_lengths = word_noiser(words, sequence_length=lengths)
  words, noisy_words = self.evaluate([words, noisy_words])
  self.assertAllEqual(noisy_words.shape, words.shape)
def testWordEmbedderWithInPlaceNoise(self, probability):
  """In-place noise replaces the features directly: with probability 0 the
  sequence is untouched, otherwise one token is omitted."""
  vocabulary_path = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
  dataset_path = self._makeTextFile("data.txt", ["hello world !"])
  word_noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
  embedder = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
  embedder.set_noise(word_noiser, probability=probability)
  features, transformed = self._makeDataset(
      embedder,
      dataset_path,
      metadata={"vocabulary_file": vocabulary_path},
      shapes={
          "tokens": [None, None],
          "ids": [None, None],
          "length": [None]
      })
  # The input has 3 tokens; WordOmission(1) drops one when noise is applied.
  expected_length = 3 if probability == 0 else 2
  self.assertEqual(features["length"][0], expected_length)
def _add_noise(tokens, lengths, params, subword_token):
  """Builds a ``WordNoiser`` from a user configuration and applies it.

  Args:
    tokens: The tokens to noise.
    lengths: The length of each token sequence.
    params: A list of single-entry dicts mapping a noise type name to its
      constructor argument(s).
    subword_token: The token used to mark subword units.

  Returns:
    The noised ``(tokens, lengths)`` pair, with the original shape kept.

  Raises:
    ValueError: if ``params`` is not a list or a noise type is unknown.
  """
  if not isinstance(params, list):
    raise ValueError("Expected a list of noise modules")
  # Map configuration names to their noise module class.
  noise_classes = {
      "dropout": noise.WordDropout,
      "replacement": noise.WordReplacement,
      "permutation": noise.WordPermutation,
  }
  noises = []
  for module in params:
    noise_type, args = six.next(six.iteritems(module))
    if not isinstance(args, list):
      args = [args]
    noise_type = noise_type.lower()
    noise_class = noise_classes.get(noise_type)
    if noise_class is None:
      raise ValueError("Invalid noise type: %s" % noise_type)
    noises.append(noise_class(*args))
  noiser = noise.WordNoiser(
      noises=noises,
      subword_token=subword_token,
      is_spacer=subword_token == "▁")
  return noiser(tokens, lengths, keep_shape=True)