def test_wrong_alg(self):
    """An unsupported normalization form ('ABCD') must be rejected.

    The expected error depends on the execution mode: when TensorFlow is
    executing eagerly a ``tf.errors.InvalidArgumentError`` is expected,
    otherwise a ``ValueError`` is expected.
    """
    if tf.executing_eagerly():
        expected_error = tf.errors.InvalidArgumentError
        expected_message = 'is not in the list of allowed values'
    else:
        expected_error = ValueError
        expected_message = 'string \'ABCD\' not in'

    # ``assertRaisesRegexp`` is a deprecated alias that was removed from
    # ``unittest`` in Python 3.12; ``assertRaisesRegex`` is the supported name.
    with self.assertRaisesRegex(expected_error, expected_message):
        self.evaluate(normalize_unicode(u'', 'ABCD'))
    def test_ragged(self):
        """Ragged input: the NFD result, padded with '', equals the dense expectation."""
        ragged_words = tf.ragged.constant([['X', 'Y'], ['Z']])
        wanted = tf.constant([['X', 'Y'], ['Z', '']])
        normalized = normalize_unicode(ragged_words, 'NFD')
        # Pad the short row with empty strings so it can be compared densely.
        dense = normalized.to_tensor(default_value='')

        self.assertAllEqual(*self.evaluate([wanted, dense]))
    def test_inference_shape(self):
        """The static shape of the result for a 2x3 nested list is [2, 3]."""
        rows = [['1', '2', '3'], ['4', '5', '6']]
        static_shape = normalize_unicode(rows, 'NFD').shape.as_list()

        self.assertAllEqual([2, 3], static_shape)
    def test_skip(self):
        """A value listed in ``skip`` comes back unchanged under NFKC."""
        skipped = u'\u1E9B\u0323'
        wanted = tf.convert_to_tensor(
            [['X', skipped, u'\u0451']], dtype=tf.string)
        actual = normalize_unicode(
            [['X', skipped, u'\u0435\u0308']], 'NFKC', skip=[skipped])

        self.assertAllEqual(*self.evaluate([wanted, actual]))
    def test_actual_shape(self):
        """The evaluated ``tf.shape`` of the result for a 2x3 input is [2, 3]."""
        rows = [['1', '2', '3'], ['4', '5', '6']]
        shape_tensor = tf.shape(normalize_unicode(rows, 'NFD'))

        runtime_shape = self.evaluate(shape_tensor)
        self.assertAllEqual([2, 3], runtime_shape.tolist())
예제 #6
0
def ngram_features(input_words, minn, maxn):
    """Build character n-grams from words after a fixed chain of cleanups.

    The words go, in order, through ``normalize_unicode`` (form 'NFKC'),
    ``replace_string`` (six accent-like marks replaced by empty strings),
    ``lower_case``, ``zero_digits`` and ``wrap_with`` (with '<' and '>');
    the result is then handed to ``char_ngrams``.

    Args:
        input_words: Words to featurize; passed to ``normalize_unicode``.
        minn: Forwarded to ``char_ngrams`` (presumably the minimum n-gram
            size -- confirm against ``char_ngrams``).
        maxn: Forwarded to ``char_ngrams`` (presumably the maximum n-gram
            size -- confirm against ``char_ngrams``).

    Returns:
        The value produced by ``char_ngrams`` with ``itself='ALONE'``.
    """
    accent_marks = [
        u'\u0060', u' \u0301', u'\u02CA', u'\u02CB', u'\u0300', u'\u0301',
    ]
    # One empty replacement per accent mark, i.e. the marks are removed.
    blanks = [''] * len(accent_marks)

    prepared = normalize_unicode(input_words, 'NFKC')
    prepared = replace_string(prepared, accent_marks, blanks)  # accentuation
    prepared = wrap_with(zero_digits(lower_case(prepared)), '<', '>')

    return char_ngrams(prepared, minn, maxn, itself='ALONE')
예제 #7
0
def case_features(input_words):
    """Compute five int32 letter-case indicators for NFKC-normalized words.

    A word "has case" when its lower-cased and upper-cased variants differ.
    The indicators are:

    * no_case  -- the word does not have case;
    * is_lower -- has case and equals its ``lower_case`` variant;
    * is_upper -- has case and equals its ``upper_case`` variant;
    * is_title -- has case and equals its ``title_case`` variant;
    * is_mixed -- none of the four indicators above holds.

    Args:
        input_words: Words to inspect; passed to ``normalize_unicode``.

    Returns:
        A tuple ``(no_case, is_lower, is_upper, is_title, is_mixed)`` whose
        items are the boolean indicators cast to ``tf.int32``.
    """
    normalized = normalize_unicode(input_words, 'NFKC')
    lowered = lower_case(normalized)
    uppered = upper_case(normalized)
    titled = title_case(normalized)

    has_case = tf.not_equal(lowered, uppered)
    no_case = tf.logical_not(has_case)

    def _cased_and_equal_to(variant):
        # True where the word has case and already matches the given variant.
        return tf.logical_and(has_case, tf.equal(normalized, variant))

    is_lower = _cased_and_equal_to(lowered)
    is_upper = _cased_and_equal_to(uppered)
    is_title = _cased_and_equal_to(titled)

    caseless_or_lower = tf.logical_or(no_case, is_lower)
    upper_or_title = tf.logical_or(is_upper, is_title)
    is_mixed = tf.logical_not(
        tf.logical_or(caseless_or_lower, upper_or_title))

    indicators = (no_case, is_lower, is_upper, is_title, is_mixed)
    return tuple(tf.cast(flag, tf.int32) for flag in indicators)
    def test_n_f_k_c(self):
        """NFKC turns U+1E9B U+0323 into the single code point U+1E69."""
        wanted = tf.convert_to_tensor(u'\u1E69', dtype=tf.string)
        actual = normalize_unicode(u'\u1E9B\u0323', 'NFKC')

        self.assertAllEqual(*self.evaluate([wanted, actual]))
    def test_n_f_k_d(self):
        """NFKD turns U+2460 into U+0031 (the digit '1')."""
        wanted = tf.convert_to_tensor(u'\u0031', dtype=tf.string)
        actual = normalize_unicode(u'\u2460', 'NFKD')

        self.assertAllEqual(*self.evaluate([wanted, actual]))
예제 #10
0
    def test_n_f_c(self):
        """NFC composes U+0041 U+030A into the single code point U+00C5."""
        wanted = tf.convert_to_tensor(u'\u00C5', dtype=tf.string)
        actual = normalize_unicode(u'\u0041\u030A', 'NFC')

        self.assertAllEqual(*self.evaluate([wanted, actual]))
예제 #11
0
    def test_2d(self):
        """A nested (2-D) list input yields a nested result of byte strings."""
        evaluated = self.evaluate(normalize_unicode([['X']], 'NFD'))

        self.assertAllEqual([[b'X']], evaluated)
예제 #12
0
    def test_0d(self):
        """A scalar string input yields a scalar byte-string result."""
        evaluated = self.evaluate(normalize_unicode('X', 'NFD'))

        self.assertAllEqual(b'X', evaluated)