def test_wrong_alg(self):
    """An unsupported form name ('ABCD') must be rejected.

    The expected error depends on the execution mode: when
    ``tf.executing_eagerly()`` is true the test expects
    ``tf.errors.InvalidArgumentError``; otherwise it expects ``ValueError``.
    """
    if tf.executing_eagerly():
        expected_error = tf.errors.InvalidArgumentError
        expected_message = 'is not in the list of allowed values'
    else:
        expected_error = ValueError
        expected_message = 'string \'ABCD\' not in'

    # assertRaisesRegex replaces the assertRaisesRegexp alias, which was
    # deprecated in Python 3.2 and removed from unittest in Python 3.12.
    with self.assertRaisesRegex(expected_error, expected_message):
        self.evaluate(normalize_unicode(u'', 'ABCD'))
def test_ragged(self):
    """Ragged input, densified with '' padding, matches the dense expectation."""
    ragged_words = tf.ragged.constant([['X', 'Y'], ['Z']])
    dense_expected = tf.constant([['X', 'Y'], ['Z', '']])

    normalized = normalize_unicode(ragged_words, 'NFD')
    dense_actual = normalized.to_tensor(default_value='')

    # Evaluate both tensors in a single call, as a pair.
    want, got = self.evaluate([dense_expected, dense_actual])
    self.assertAllEqual(want, got)
def test_inference_shape(self):
    """The static shape of the result for a 2x3 nested list is [2, 3]."""
    rows = [
        ['1', '2', '3'],
        ['4', '5', '6'],
    ]
    static_shape = normalize_unicode(rows, 'NFD').shape.as_list()
    self.assertAllEqual([2, 3], static_shape)
def test_skip(self):
    """A value listed in ``skip`` is left untouched while the rest is NFKC-normalized."""
    want_tensor = tf.convert_to_tensor(
        [['X', u'\u1E9B\u0323', u'\u0451']], dtype=tf.string)
    got_tensor = normalize_unicode(
        [['X', u'\u1E9B\u0323', u'\u0435\u0308']],
        'NFKC',
        skip=[u'\u1E9B\u0323'],
    )

    want, got = self.evaluate([want_tensor, got_tensor])
    self.assertAllEqual(want, got)
def test_actual_shape(self):
    """The evaluated ``tf.shape`` of the result for a 2x3 nested list is [2, 3]."""
    rows = [
        ['1', '2', '3'],
        ['4', '5', '6'],
    ]
    dynamic_shape = tf.shape(normalize_unicode(rows, 'NFD'))
    shape_value = self.evaluate(dynamic_shape)
    self.assertAllEqual([2, 3], shape_value.tolist())
def ngram_features(input_words, minn, maxn):
    """Preprocess words and split them into character n-grams.

    The pipeline, in order: NFKC normalization, removal of accentuation
    marks, ``lower_case``, ``zero_digits``, wrapping with ``<`` and ``>``,
    and finally ``char_ngrams`` with ``itself='ALONE'``.

    Args:
        input_words: Words to process; passed straight to ``normalize_unicode``.
        minn: Forwarded to ``char_ngrams`` (presumably the smallest n-gram
            size -- confirm against ``char_ngrams``).
        maxn: Forwarded to ``char_ngrams`` (presumably the largest n-gram
            size -- confirm against ``char_ngrams``).

    Returns:
        Whatever ``char_ngrams`` returns for the preprocessed words.
    """
    # Accentuation marks that are replaced with the empty string.
    accent_marks = [
        u'\u0060', u' \u0301', u'\u02CA', u'\u02CB', u'\u0300', u'\u0301',
    ]

    words = normalize_unicode(input_words, 'NFKC')
    words = replace_string(words, accent_marks, [''] * len(accent_marks))
    words = lower_case(words)
    words = zero_digits(words)
    words = wrap_with(words, '<', '>')

    return char_ngrams(words, minn, maxn, itself='ALONE')
def case_features(input_words):
    """Compute five int32 case-indicator tensors for the given words.

    Words are NFKC-normalized first. A word "has case" when its lower-cased
    and upper-cased forms differ.

    Args:
        input_words: Words to classify; passed to ``normalize_unicode``.

    Returns:
        A 5-tuple of ``tf.int32`` tensors, in this order:
        no_case, is_lower, is_upper, is_title, is_mixed. ``is_mixed`` is set
        where none of the other four flags is set.
    """
    words = normalize_unicode(input_words, 'NFKC')

    lowered = lower_case(words)
    uppered = upper_case(words)
    titled = title_case(words)

    has_case = tf.not_equal(lowered, uppered)
    no_case = tf.logical_not(has_case)

    is_lower = tf.logical_and(has_case, tf.equal(words, lowered))
    is_upper = tf.logical_and(has_case, tf.equal(words, uppered))
    is_title = tf.logical_and(has_case, tf.equal(words, titled))

    # Mixed case: the word matches none of the categories above.
    plain_or_lower = tf.logical_or(no_case, is_lower)
    upper_or_title = tf.logical_or(is_upper, is_title)
    is_mixed = tf.logical_not(tf.logical_or(plain_or_lower, upper_or_title))

    flags = (no_case, is_lower, is_upper, is_title, is_mixed)
    return tuple(tf.cast(flag, tf.int32) for flag in flags)
def test_n_f_k_c(self):
    """NFKC turns U+1E9B U+0323 into the single code point U+1E69."""
    want_tensor = tf.convert_to_tensor(u'\u1E69', dtype=tf.string)
    got_tensor = normalize_unicode(u'\u1E9B\u0323', 'NFKC')

    want, got = self.evaluate([want_tensor, got_tensor])
    self.assertAllEqual(want, got)
def test_n_f_k_d(self):
    """NFKD turns U+2460 (circled digit one) into U+0031 (digit one)."""
    want_tensor = tf.convert_to_tensor(u'\u0031', dtype=tf.string)
    got_tensor = normalize_unicode(u'\u2460', 'NFKD')

    want, got = self.evaluate([want_tensor, got_tensor])
    self.assertAllEqual(want, got)
def test_n_f_c(self):
    """NFC composes U+0041 U+030A into the single code point U+00C5."""
    want_tensor = tf.convert_to_tensor(u'\u00C5', dtype=tf.string)
    got_tensor = normalize_unicode(u'\u0041\u030A', 'NFC')

    want, got = self.evaluate([want_tensor, got_tensor])
    self.assertAllEqual(want, got)
def test_2d(self):
    """A doubly nested single-element input evaluates to [[b'X']]."""
    normalized = self.evaluate(normalize_unicode([['X']], 'NFD'))
    self.assertAllEqual([[b'X']], normalized)
def test_0d(self):
    """A plain string input evaluates to the bytes value b'X'."""
    normalized = self.evaluate(normalize_unicode('X', 'NFD'))
    self.assertAllEqual(b'X', normalized)