示例#1
0
 def test_should_normalize(self):
     """Checks which strings jautils.should_normalize() flags.

     Each case pairs an input with the value should_normalize() is
     expected to return for it.
     """
     # NOTE(review): the last-but-one three inputs were previously byte-equal
     # to earlier cases (u'abc' and u' ABC ' were asserted both False and
     # True, and the katakana case appeared twice), so the test could never
     # pass.  They are restored here as half-width katakana and full-width
     # Latin, which is presumably what was intended -- confirm against
     # jautils.  Escapes are used so that tools which fold width variants
     # cannot silently collapse them into the ASCII cases again.
     cases = [
         (u'abc', False),
         (u' ABC ', False),
         (u'ABC 012', False),
         (u'漢字', False),
         (u'ひらがな', True),
         (u'カタカナ', True),
         # Half-width katakana "KA TA KA NA".
         (u'\uff76\uff80\uff76\uff85', True),
         # Full-width Latin "abc".
         (u'\uff41\uff42\uff43', True),
         # Full-width Latin "ABC" surrounded by ideographic spaces.
         (u'\u3000\uff21\uff22\uff23\u3000', True),
         (u'ひらがな カタカナ', True),
     ]
     for text, expected in cases:
         # repr(text) identifies the failing input in the assertion message.
         assert jautils.should_normalize(text) == expected, repr(text)
示例#2
0
 def test_should_normalize(self):
     """Checks which strings jautils.should_normalize() flags.

     Each case pairs an input with the value should_normalize() is
     expected to return for it.
     """
     # NOTE(review): the last-but-one three inputs were previously byte-equal
     # to earlier cases (u'abc' and u' ABC ' were asserted both False and
     # True, and the katakana case appeared twice), so the test could never
     # pass.  They are restored here as half-width katakana and full-width
     # Latin, which is presumably what was intended -- confirm against
     # jautils.  Escapes are used so that tools which fold width variants
     # cannot silently collapse them into the ASCII cases again.
     cases = [
         (u'abc', False),
         (u' ABC ', False),
         (u'ABC 012', False),
         (u'漢字', False),
         (u'ひらがな', True),
         (u'カタカナ', True),
         # Half-width katakana "KA TA KA NA".
         (u'\uff76\uff80\uff76\uff85', True),
         # Full-width Latin "abc".
         (u'\uff41\uff42\uff43', True),
         # Full-width Latin "ABC" surrounded by ideographic spaces.
         (u'\u3000\uff21\uff22\uff23\u3000', True),
         (u'ひらがな カタカナ', True),
     ]
     for text, expected in cases:
         # repr(text) identifies the failing input in the assertion message.
         assert jautils.should_normalize(text) == expected, repr(text)
示例#3
0
def romanize_search_query(word):
    """
    Collects the romanized forms of a search query word.
    Args:
        word: the query word to romanize; a falsy value yields no results.
    Returns:
        A list of romanizations, in this order:
        - the results of romanize_japanese_word(word, for_index=False),
          only when has_kanji(word) is true;
        - the jautils romaji reading, only when
          jautils.should_normalize(word) is true;
        - the stripped unidecode() transliteration, always.
    """
    if not word:
        return []

    # Kanji words start from the Japanese romanizations; anything else
    # starts from an empty list.
    candidates = (
        romanize_japanese_word(word, for_index=False)
        if has_kanji(word) else [])

    if jautils.should_normalize(word):
        candidates.append(
            jautils.hiragana_to_romaji(jautils.normalize(word)))

    # The unidecode transliteration is always the last entry.
    candidates.append(unidecode(word).strip())
    return candidates
def romanize_search_query(word):
    """
    Returns every romanization of a search query word that this module
    can produce.
    Args:
        word: the query word to romanize; a falsy value yields no results.
    Returns:
        [romanized_word, ... ]
        The list holds the romanize_japanese_word() results when
        has_kanji(word) is true, then the jautils romaji reading when
        jautils.should_normalize(word) is true, and always ends with the
        stripped unidecode() transliteration.
    """
    if not word:
        return []

    if has_kanji(word):
        romanizations = romanize_japanese_word(word, for_index=False)
    else:
        romanizations = []

    if jautils.should_normalize(word):
        kana = jautils.normalize(word)
        romanizations.append(jautils.hiragana_to_romaji(kana))

    # unidecode handles every input, so its result is appended unconditionally.
    romanizations.append(unidecode(word).strip())
    return romanizations
示例#5
0
def romanize_search_query(word):
    """
    Collects the romanized forms of a search query word, including a
    Chinese-name romanization when it adds something new.
    Args:
        word: the query word to romanize; a falsy value yields no results.
    Returns:
        A list of romanizations, in this order:
        - the results of romanize_japanese_word(word, for_index=False),
          only when has_kanji(word) is true;
        - the jautils romaji reading, only when
          jautils.should_normalize(word) is true;
        - the first result of romanize_chinese_name(word), only when it
          is non-empty and differs from the unidecode transliteration;
        - the stripped unidecode() transliteration, always.
    """
    if not word:
        return []

    results = []
    if has_kanji(word):
        results = romanize_japanese_word(word, for_index=False)

    if jautils.should_normalize(word):
        results.append(jautils.hiragana_to_romaji(jautils.normalize(word)))

    unidecoded = unidecode(word).strip()
    chinese_candidates = romanize_chinese_name(word)
    if chinese_candidates:
        # Only the first Chinese romanization is considered, and it is kept
        # only when it is not a duplicate of the unidecode result.
        chinese = chinese_candidates[0]
        if chinese and chinese != unidecoded:
            results.append(chinese)
    results.append(unidecoded)

    return results
示例#6
0
def romanize_word_by_unidecode(word):
    """
    Romanizes a single word, using jautils for Japanese kana and
    unidecode for everything else.
    Args:
        word: the word to romanize; a falsy value yields [''].
    Returns:
        A one-element list: the jautils romaji reading when
        jautils.should_normalize(word) is true, otherwise the stripped
        unidecode() transliteration (which, per the original note,
        reads kanji the Chinese way).
    """
    if not word:
        return ['']

    if not jautils.should_normalize(word):
        return [unidecode(word).strip()]

    # Normalize first, then convert the normalized text to romaji.
    return [jautils.hiragana_to_romaji(jautils.normalize(word))]
def romanize_word_by_unidecode(word):
    """
    Returns the single romanization of a word as a one-element list.
    Args:
        word: the word to romanize; a falsy value yields [''].
    Returns:
        [romanized_word]
        The element is the jautils romaji reading when
        jautils.should_normalize(word) is true; otherwise it is the
        stripped unidecode() transliteration (which, per the original
        note, reads kanji the Chinese way).
    """
    if not word:
        return ['']

    if jautils.should_normalize(word):
        kana = jautils.normalize(word)
        romanized = jautils.hiragana_to_romaji(kana)
    else:
        romanized = unidecode(word).strip()
    return [romanized]
示例#8
0
    def __init__(self, query, unicode_word=False):
        self.query = query
        if unicode_word:
            query = unicode(query or '')
        # Do we need a Japanese specific logic to normalize the query?
        if jautils.should_normalize(query):
            self.normalized = jautils.normalize(query)
        else:
            self.normalized = normalize(query)

        # Split out each CJK ideograph as its own word.
        # The main CJK ideograph range is from U+4E00 to U+9FFF.
        # CJK Extension A is from U+3400 to U+4DFF.
        cjk_separated = re.sub(ur'([\u3400-\u9fff])', r' \1 ', self.normalized)

        # Separate the query into words.
        self.words = cjk_separated.split()

        # query_words is redundant now but I'm leaving it since I don't want to
        # change the signature of TextQuery yet
        # TODO(ryok): get rid of this field?
        self.query_words = self.words
示例#9
0
    def __init__(self, query):
        self.query = query

        query = unicode(query or '')
        # Do we need a Japanese specific logic to normalize the query?
        if jautils.should_normalize(query):
            self.normalized = jautils.normalize(query)
        else:
            self.normalized = normalize(query)

        # Split out each CJK ideograph as its own word.
        # The main CJK ideograph range is from U+4E00 to U+9FFF.
        # CJK Extension A is from U+3400 to U+4DFF.
        cjk_separated = re.sub(ur'([\u3400-\u9fff])', r' \1 ', self.normalized)

        # Separate the query into words.
        self.words = cjk_separated.split()

        # query_words is redundant now but I'm leaving it since I don't want to
        # change the signature of TextQuery yet
        # TODO(ryok): get rid of this field?
        self.query_words = self.words
示例#10
0
    """
    This method romanizes all languages by unidecode.
    If word is hiragana or katakana, it is romanized by jautils.
    Args:
        word: should be script varianted
    Returns:
        script varianted word
    """
    if not word:
        return word

    if re.match(ur'([\u3400-\u9fff])', word):
        word = romanize_japanese_name_by_name_dict(word)
        word = romanize_japanese_location(word)

    if jautils.should_normalize(word):
        hiragana_word = jautils.normalize(word)
        return jautils.hiragana_to_romaji(hiragana_word)
    romanized_word = unidecode(word)
    return romanized_word.strip()


def romanize_text(query_txt):
    """
    Applies romanization to each word in query_txt.
    This method uses unidecode and jautils for script variant.
    Args:
        query_txt: Search query
    Returns:
        script varianted query_txt (except kanji)
    """