def romanize_search_query(word):
    """
    This method romanizes all languages for a search query.
    If word is hiragana or katakana, it is romanized by jautils.
    Args:
        word: should be script varianted
    Returns:
        [romanized_word, ...] (if word can be romanized by unidecode and
        jp_dictionary, returns multiple romanizations.)
    """
    if not word:
        return []
    romanized_words = []
    if has_kanji(word):
        romanized_words = romanize_japanese_word(word, for_index=False)
    if jautils.should_normalize(word):
        hiragana_word = jautils.normalize(word)
        romanized_words.append(jautils.hiragana_to_romaji(hiragana_word))
    # If the word is a Chinese name and romanize_chinese_name produces a
    # result different from unidecode, append the Chinese romanization to
    # romanized_words along with the unidecode result.
    unidecode_romanize_word = unidecode(word).strip()
    chinese_romanize_list = romanize_chinese_name(word)
    chinese_romanize_word = (
        chinese_romanize_list[0] if chinese_romanize_list else '')
    if (chinese_romanize_word and
            chinese_romanize_word != unidecode_romanize_word):
        romanized_words.append(chinese_romanize_word)
    romanized_words.append(unidecode_romanize_word)
    return romanized_words
def romanize_search_query(word):
    """
    This method romanizes all languages for a search query.
    If word is hiragana or katakana, it is romanized by jautils.
    Args:
        word: should be script varianted
    Returns:
        [romanized_word, ...] (if word can be romanized by unidecode and
        jp_dictionary, returns multiple romanizations.)
    """
    if not word:
        return []
    romanized_words = []
    if has_kanji(word):
        romanized_words = romanize_japanese_word(word, for_index=False)
    if jautils.should_normalize(word):
        hiragana_word = jautils.normalize(word)
        romanized_words.append(jautils.hiragana_to_romaji(hiragana_word))
    romanized_word = unidecode(word)
    romanized_words.append(romanized_word.strip())
    return romanized_words
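# Hypothetical usage sketch (not part of this module): a caller might
# de-duplicate the romanizations returned by romanize_search_query above,
# preserving order, before turning them into search tokens. The helper name
# unique_romanizations is illustrative only.
def unique_romanizations(word):
    seen = set()
    results = []
    for romanized in romanize_search_query(word):
        if romanized and romanized not in seen:
            seen.add(romanized)
            results.append(romanized)
    return results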
def romanize_word_by_unidecode(word):
    """
    This method romanizes all languages by unidecode.
    If word is hiragana or katakana, it is romanized by jautils.
    Kanji is romanized in the Chinese way.
    Args:
        word: should be script varianted
    Returns:
        an array of the romanized word by unidecode: [romanized_word]
    """
    if not word:
        return ['']
    if jautils.should_normalize(word):
        hiragana_word = jautils.normalize(word)
        return [jautils.hiragana_to_romaji(hiragana_word)]
    romanized_word = unidecode(word)
    return [romanized_word.strip()]
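# Minimal illustration of the unidecode fallback used above; `unidecode` is
# the third-party package of the same name, and the sample inputs are
# illustrative. Kanji falls through to unidecode, which transliterates it the
# Chinese way (roughly pinyin), while accented Latin text is flattened to ASCII.
from unidecode import unidecode

print unidecode(u'漢字').strip()   # something like 'Han Zi'
print unidecode(u'José').strip()  # 'Jose'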
def __init__(self, query, unicode_word=False):
    self.query = query
    if unicode_word:
        query = unicode(query or '')
    # Do we need Japanese-specific logic to normalize the query?
    if jautils.should_normalize(query):
        self.normalized = jautils.normalize(query)
    else:
        self.normalized = normalize(query)
    # Split out each CJK ideograph as its own word.
    # The main CJK ideograph range is from U+4E00 to U+9FFF.
    # CJK Extension A is from U+3400 to U+4DFF.
    cjk_separated = re.sub(ur'([\u3400-\u9fff])', r' \1 ', self.normalized)
    # Separate the query into words.
    self.words = cjk_separated.split()
    # query_words is redundant now, but I'm leaving it since I don't want to
    # change the signature of TextQuery yet.
    # TODO(ryok): get rid of this field?
    self.query_words = self.words
def __init__(self, query):
    self.query = query
    query = unicode(query or '')
    # Do we need Japanese-specific logic to normalize the query?
    if jautils.should_normalize(query):
        self.normalized = jautils.normalize(query)
    else:
        self.normalized = normalize(query)
    # Split out each CJK ideograph as its own word.
    # The main CJK ideograph range is from U+4E00 to U+9FFF.
    # CJK Extension A is from U+3400 to U+4DFF.
    cjk_separated = re.sub(ur'([\u3400-\u9fff])', r' \1 ', self.normalized)
    # Separate the query into words.
    self.words = cjk_separated.split()
    # query_words is redundant now, but I'm leaving it since I don't want to
    # change the signature of TextQuery yet.
    # TODO(ryok): get rid of this field?
    self.query_words = self.words
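# Self-contained sketch of the CJK-splitting step used in __init__ above,
# assuming only the standard `re` module; the sample string and variable
# names are illustrative.
import re

sample = u'田中 taro太郎'
cjk_separated = re.sub(ur'([\u3400-\u9fff])', r' \1 ', sample)
# Each ideograph becomes its own token while Latin runs stay intact:
# [u'田', u'中', u'taro', u'太', u'郎']
print cjk_separated.split()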
def test_normalize(self):
    assert jautils.normalize(u'abc') == u'ABC'
    assert jautils.normalize(u' ABC ') == u'ABC'
    assert jautils.normalize(u'ABC 012') == u'ABC'
    assert jautils.normalize(u'漢字') == u'漢字'
    assert jautils.normalize(u'ひらがな') == u'ひらがな'
    assert jautils.normalize(u'カタカナ') == u'かたかな'
    assert jautils.normalize(u'カタカナ') == u'かたかな'
    assert jautils.normalize(u'abc') == u'ABC'
    assert jautils.normalize(u' ABC ') == u'ABC'
    assert jautils.normalize(u'ひらがな カタカナ') == u'ひらがな かたかな'
    assert jautils.normalize(u'キミヱ') == u'きみえ'
    assert jautils.normalize(u"(abc) O'Hearn") == u'ABC OHEARN'
def romanize_word(word):
    """
    This method romanizes all languages by unidecode.
    If word is hiragana or katakana, it is romanized by jautils.
    Args:
        word: should be script varianted
    Returns:
        script varianted word
    """
    if not word:
        return word
    if re.match(ur'([\u3400-\u9fff])', word):
        word = romanize_japanese_name_by_name_dict(word)
        word = romanize_japanese_location(word)
    if jautils.should_normalize(word):
        hiragana_word = jautils.normalize(word)
        return jautils.hiragana_to_romaji(hiragana_word)
    romanized_word = unidecode(word)
    return romanized_word.strip()


def romanize_text(query_txt):
    """
    Applies romanization to each word in query_txt.
    This method uses unidecode and jautils for script variants.
    Args:
        query_txt: Search query
    Returns:
        script varianted query_txt (except kanji)
    """
    query_words = query_txt.split(' ')