def reading_form(self) -> str:
        """Return the katakana reading of this token.

        Prefers Sudachi's per-morpheme readings.  A katakana surface that
        Sudachi could not read is returned as-is.  When JMdict recognizes
        neither Sudachi reading but does recognize fugashi's dictionary-form
        reading, fugashi's surface reading is returned instead.
        """
        sudachi_reading = "".join(m.reading_form() for m in self.morphemes)

        surface = self.surface()
        # Katakana surface with no Sudachi reading: the surface already is a
        # reading.  NOTE(review): re.match anchors only at the start —
        # presumably the whole surface should be katakana; confirm fullmatch
        # was not intended.
        if re.match(rf"{kata_re}+", surface) and not sudachi_reading:
            return surface

        # Sudachi reading of the dictionary (lemma) form.
        sudachi_dict_reading = "".join(m.reading_form()
                                       for m in parse(self.dictionary_form()))
        # fugashi lemma-form readings for surface and dictionary forms; any
        # token missing an lForm invalidates the whole joined reading below.
        surface_lForms = [
            m.feature.lForm for m in fugashi_parse(self.surface())
        ]
        dict_lForms = [
            m.feature.lForm for m in fugashi_parse(self.dictionary_form())
        ]
        fugashi_reading = "".join(surface_lForms) if all(
            surface_lForms) else ""
        fugashi_dict_reading = "".join(dict_lForms) if all(dict_lForms) else ""
        # JMdict lookups are keyed on hiragana, so fold before querying.
        sudachi_lookup = jmdict_lookup(
            jaconv.kata2hira(sudachi_reading)).entries
        sudachi_dict_lookup = jmdict_lookup(
            jaconv.kata2hira(sudachi_dict_reading)).entries
        fugashi_dict_lookup = (fugashi_dict_reading and jmdict_lookup(
            jaconv.kata2hira(fugashi_dict_reading)).entries)

        # Trust fugashi only when JMdict knows its reading but not Sudachi's.
        if not (sudachi_lookup or sudachi_dict_lookup) and fugashi_dict_lookup:
            return fugashi_reading

        return sudachi_reading
Пример #2
0
def split_into_words(text):
    """Tokenize *text* with MeCab and return [surface:reading, pos] pairs.

    Each entry is ``[field3_head + ':' + hiragana(field1), field4_head]``
    for tokens whose feature line has more than three tab-separated fields,
    and ``[surface + ':' + surface, '']`` otherwise (e.g. the EOS line).
    Each parsed pair is also printed, preserving the original behavior.

    Args:
        text: Input string; upper-cased before parsing.

    Returns:
        list[list[str]]: The parsed pairs.
    """
    import MeCab

    text = text.upper()
    tagger = MeCab.Tagger()

    # The original also built a word list via parseToNode that was never
    # used; that dead computation has been removed.
    lines = tagger.parse(text).splitlines()

    data = []
    for entry in lines:
        fields = entry.split('\t')
        if len(fields) > 3:
            pair = [
                fields[3].split('-')[0] + ':' + jaconv.kata2hira(fields[1]),
                fields[4].split('-')[0],
            ]
            data.append(pair)
            print(pair)
        else:
            data.append([fields[0] + ':' + fields[0], ''])

    return data
Пример #3
0
    def extract(self, file_name):
        """Build vocab dictionaries from *file_name* and one-hot its lines.

        Each line is parsed with self.mecab; the last comma-separated field
        of every token (its reading) is folded to hiragana and used as the
        vocabulary unit.  Sequences are padded to equal length with the ' '
        (space) entry and converted via self.make_onehot.

        Returns:
            (word2id_dic, id2word_dic, text_onehot_lists)
        """
        with open(file_name) as sentences: # build word2id / id2word dictionaries
            for line in sentences:
                ids = []  # NOTE(review): unused in this first pass
                for i in self.mecab.parse(line).splitlines():
                    word = jaconv.kata2hira(i.split(',')[-1])
                    if (word not in self.word2id_dic.keys()) and (word != '*'):
                        self.word2id_dic[word] = len(self.word2id_dic)
                        self.id2word_dic[len(self.id2word_dic)] = word
            # Reserve a dedicated id for the padding token ' '.
            self.word2id_dic[' '] = len(self.word2id_dic)
            self.id2word_dic[len(self.id2word_dic)] = ' '

        with open(file_name) as sentences: # assign ids to each word and turn
            for line in sentences:         # the id sequences into one-hots
                ids = []
                for i in self.mecab.parse(line).splitlines():
                    word = jaconv.kata2hira(i.split(',')[-1])
                    if word != '*':
                        ids.append(self.word2id_dic[word])
                self.text_onehot_lists.append(ids)

            # Pad every sequence to the longest one using the ' ' id.
            self.text_max_len = max([len(i) for i in self.text_onehot_lists])
            for id, id_list in enumerate(self.text_onehot_lists):
                pad_list = [self.word2id_dic[' '] for i in range(self.text_max_len-len(id_list))]
                self.text_onehot_lists[id] += pad_list
            self.text_onehot_lists = [self.make_onehot(np.array(i)).tolist() for i in self.text_onehot_lists]

        return self.word2id_dic, self.id2word_dic, self.text_onehot_lists
Пример #4
0
 def find(cosmetic):
     """Append *cosmetic* to the enclosing `result` list when it matches.

     Relies on the closed-over `item`, `mode`, `text`, `result` and `self`
     from the surrounding scope.
     """
     # Skip cosmetics excluded by the type filter, or ones without a name.
     if (item
             and cosmetic['type']['backendValue'] not in item.split(',')
             or cosmetic['name'] is None):
         return
     if mode == 'name':
         name = cosmetic['name']
         if self.case_insensitive:
             # Case-fold and squash katakana to hiragana for loose matching.
             name = jaconv.kata2hira(cosmetic['name'].casefold())
         if self.convert_kanji:
             name = self.bot.converter.do(name)
         if text in name:
             result.append(cosmetic)
     elif mode == 'id':
         if text in (cosmetic['id'].casefold()):
             result.append(cosmetic)
     elif mode == 'set':
         # Only cosmetics that belong to a set are searchable in this mode.
         if cosmetic.get('set') is None:
             return
         name = cosmetic['name']
         if self.case_insensitive:
             name = jaconv.kata2hira(name.casefold())
         if self.convert_kanji:
             name = self.bot.converter.do(name)
         if text in name:
             result.append(cosmetic)
Пример #5
0
def get_song_meta(song):
    """Build the client-facing metadata dict for *song*.

    Collects localized metadata, a "search" list of normalized strings
    (including a romanized, dash-stripped variant of the kana reading so
    ASCII queries can hit kana titles), and a "sort" key.  In latin mode a
    sort key starting with a non-ASCII character falls back to a
    romanization of the kana title.  Relies on the request global for
    locale ("lc") and latin preferences.
    """
    d = {}
    search = set()
    for k, v in song.meta.items():
        if request.latin:
            d[k] = v[(request.lc, "l")]
        else:
            d[k] = v[request.lc]
    for k in ("title", "artist", "seenon", "album"):
        if k in song.meta:
            v = song.meta[k]
            search.add(normalize(v[request.lc]))
            search.add(normalize(v["k"]))
            search.add(normalize(v["l"]))
            # Romanized kana reading with prolonged-sound marks dropped.
            search.add(
                normalize(jaconv.kana2alphabet(jaconv.kata2hira(
                    v["k"]))).replace("ー", ""))
    for k in ("genre", ):
        if k in song.meta:
            v = song.meta[k]
            search.add(normalize(v[request.lc]))
    d["search"] = list(search)
    if request.latin:
        d["sort"] = song.meta["title"][(request.lc, "l")]
        # Guard the empty-title case: ord("") raised TypeError before.
        if d["sort"] and ord(d["sort"][0]) > 0x100:
            # Try again with kana-to-romaji, might help manufacture some
            # sensible sort order
            d["sort"] = jaconv.kana2alphabet(
                jaconv.kata2hira(song.meta["title"][(request.lc, "l", "k")]))
    else:
        d["sort"] = song.meta["title"][(request.lc, "k")]
    return d
Пример #6
0
def run(input, output):
    """Convert tab-separated city-code data into a UTF-8 JSON table.

    Expects 5-column rows: code, prefecture (kanji), city (kanji),
    prefecture (kana), city (kana).  Rows whose first column is not all
    digits are skipped.  Writes the encoded JSON to *output* and echoes a
    processed-count message.
    """
    result = []
    for line in input.read().split('\n'):
        cols = line.split('\t')
        # Keep only well-formed data rows (5 columns, numeric code).
        if len(cols) != 5 or not cols[0].isdigit():
            continue
        code, pref, city, pref_kana, city_kana = cols
        result.append({
            'code': unicodedata.normalize('NFKC', code).strip(),
            'pref': unicodedata.normalize('NFKC', pref).strip(),
            'city': unicodedata.normalize('NFKC', city).strip(),
            # Kana columns: half-width to full-width, plus hiragana copies.
            'pref_k': jaconv.h2z(pref_kana).strip(),
            'city_k': jaconv.h2z(city_kana).strip(),
            'pref_h': jaconv.kata2hira(jaconv.h2z(pref_kana)).strip(),
            'city_h': jaconv.kata2hira(jaconv.h2z(city_kana)).strip()
        })

    payload = {
        'title': 'jp_citycode',
        'version': DATA_VERSION,
        'table': result
    }
    output.write(json.dumps(payload, ensure_ascii=False).encode("utf-8"))
    click.echo('%d件処理しました' % len(result))
Пример #7
0
def qu(path_1):
    """Re-shape ChaSen output under *path_1* into word/reading text files.

    For every sub-directory of *path_1*, reads its chasen.txt, pairs each
    surface with the hiragana of its reading (or of the surface itself for
    unknown words), and writes one entry per line to <dir>_chasen_1.txt
    inside the same sub-directory.
    """
    path = path_1
    for mulu in os.listdir(path):  # each iteration handles one folder, e.g. C064L, C064R

        jushiqi = 0  # NOTE(review): never used
        file_dir = os.path.join(path, mulu)
        # Paths of the ChaSen output file and of the cleaned-up file.

        file_dir_2 = os.path.join(file_dir, 'keka')
        # Path related to the original .out files; only meant to derive
        # names.  NOTE(review): file_dir_2 and feature_1 are never used.

        feature = 'chasen.txt'
        feature_1 = 'chasen.ref'
        feature_2 = mulu + '_' + 'chasen_1.txt'

        files_dir = os.path.join(file_dir, feature)

        save_dir = os.path.join(file_dir, feature_2)

        with open(files_dir, 'r', encoding='utf-8') as csvfile:

            reader = csv.reader(csvfile)

            column = [row for row in reader]

            column_2 = []

            # Split the first CSV field of each row on whitespace.
            for xiang in column:
                column_2.append(xiang[0].split())

            column_1 = []
            banyun = []

            for xiang in column_2:

                if len(xiang) == 1:
                    # A lone token (e.g. a sentence marker) passes through.
                    banyun.append(xiang[0])
                    column_1.append(banyun)
                    banyun = []

                else:
                    # Skip punctuation; pair surface with hiragana reading.
                    if xiang[0] != '、' and xiang[0] != '。':
                        if xiang[1] == '未知語':
                            # Unknown word: fall back to the surface itself.
                            banyun.append(xiang[0] + ' ' +
                                          jaconv.kata2hira(xiang[0]))
                        else:
                            banyun.append(xiang[0] + ' ' +
                                          jaconv.kata2hira(xiang[1]))
                        column_1.append(banyun)
                        banyun = []

            print(column_1)

            with open(save_dir, 'w',
                      encoding='utf-8') as f:  # write the cleaned entries line by line into a new txt file
                for xieru in column_1:
                    f.writelines(xieru[0] + '\n')
def search_inside_sentence(data, ono_lis_st, ono_counter, option, n):
    """Scan the file *data* line by line and record onomatopoeia matches.

    Args:
        data: Path of the text file to scan.
        ono_lis_st: Regex (alternation of onomatopoeia) given to re.findall.
        ono_counter: Accumulator — a Counter-like mapping for the counting
            options, or a mapping of lists for the "line_info*" options.
        option: One of "mid", "find_ono_hira", "find_ono_kata",
            "line_info", "line_info_n", "line_info_k", "line_info_n_k".
            Controls katakana-to-hiragana folding and whether matches are
            counted or the matching lines collected.
        n: When true, strip the trailing newline from each line first.

    Returns:
        The (mutated) *ono_counter*.

    Raises:
        Exception: On the first line read when *option* is unknown
            (preserved from the original: an empty file never raises).
    """
    # Options that fold katakana to hiragana before matching.
    hira_options = {"mid", "line_info_n", "line_info", "find_ono_hira"}
    # Options that count matches; the remaining known options collect lines.
    count_options = {"mid", "find_ono_hira", "find_ono_kata"}
    collect_options = {"line_info_n_k", "line_info_n",
                       "line_info_k", "line_info"}

    to_hira = option in hira_options
    counting = option in count_options
    known = counting or option in collect_options

    with open(data, "r") as f:
        for line in tqdm(f):
            if n:
                line = line.rstrip("\n")
            if not known:
                # Unknown option: fail on the first line, as before.
                raise Exception
            if to_hira:
                line = jaconv.kata2hira(line)
            for match in re.findall(ono_lis_st, line):
                if counting:
                    # e.g. ono_counter = {"pachipachi": 1, ...}
                    ono_counter[match] += 1
                else:
                    ono_counter[match].append(line)
    return ono_counter
def unify_text(texts):
    """Unify kana variants across *texts* in place.

    The first spelling of a word that appears is taken as canonical; later
    occurrences written in the other kana script are rewritten to the form
    already seen.  The (mutated) list of token lists is returned.
    """
    seen = {}
    for tokens in texts:
        for idx, token in enumerate(tokens):
            as_hira = jaconv.kata2hira(token)
            as_kata = jaconv.hira2kata(token)
            if as_hira in seen:
                tokens[idx] = as_hira
            elif as_kata in seen:
                tokens[idx] = as_kata
            else:
                # First sighting: register this spelling as canonical.
                seen[token] = True

    return texts
Пример #10
0
    def romaji_word(self, word):
        """Word is a fugashi node, return a string"""

        # Hard-coded exception list wins over everything else.
        if word.surface in self.exceptions:
            return self.exceptions[word.surface]

        # Digits and pure-ASCII tokens pass through unchanged.
        if word.surface.isdigit():
            return word.surface

        if is_ascii(word.surface):
            return word.surface

        # deal with unks first
        if word.is_unk:
            # at this point it is presumably an unk
            # Check character type using the values defined in char.def.
            # This is constant across unidic versions so far but not guaranteed.
            if word.char_type == 6 or word.char_type == 7:  # hiragana/katakana
                kana = jaconv.kata2hira(word.surface)
                return self.map_kana(kana)

            # At this point this is an unknown word and not kana. Could be
            # unknown kanji, could be hangul, cyrillic, something else.
            # By default ensure ascii by replacing with ?, but allow pass-through.
            if self.ensure_ascii:
                out = '?' * len(word.surface)
                return out
            else:
                return word.surface

        if word.feature.pos1 == '補助記号':
            # If it's punctuation we don't recognize, just discard it
            return self.table.get(word.surface, '')
        elif (self.use_wa and word.feature.pos1 == '助詞'
              and word.feature.pron == 'ワ'):
            # Topic particle は is romanized "wa", not "ha".
            return 'wa'
        elif (not self.use_he and word.feature.pos1 == '助詞'
              and word.feature.pron == 'エ'):
            # Direction particle へ -> "e".
            return 'e'
        elif (not self.use_wo and word.feature.pos1 == '助詞'
              and word.feature.pron == 'オ'):
            # Object particle を -> "o".
            return 'o'
        elif (self.use_foreign_spelling and has_foreign_lemma(word)):
            # this is a foreign word with known spelling
            return word.feature.lemma.split('-')[-1]
        elif word.feature.kana:
            # for known words
            kana = jaconv.kata2hira(word.feature.kana)
            return self.map_kana(kana)
        else:
            # unclear when we would actually get here
            return word.surface
Пример #11
0
def delete_both(res):
    """Concatenate hiragana readings of a MeCab node chain.

    Skips empty-surface nodes, case-marking particles (格助詞) and the word
    私.  Uses the 8th feature field (the reading) when present, otherwise
    the surface itself.  Note: the decode/encode round-trip implies this
    was written for Python 2 byte strings.
    """
    pieces = []
    node = res
    while node:
        if node.surface:
            feats = node.feature.split(",")
            if feats[1] != "格助詞" and node.surface != "私":
                # Prefer the reading field; fall back to the surface.
                source = feats[7] if len(feats) > 7 else node.surface
                pieces.append(
                    jaconv.kata2hira(source.decode('utf-8')).encode('utf-8'))
        node = node.next
    return "".join(pieces)
Пример #12
0
def reverse_hirakana(string):
    """Swap the kana script of *string*.

    Hiragana input becomes katakana, katakana input becomes hiragana, and
    anything else is returned unchanged.
    """
    import jaconv
    if is_hiragana(string):
        return jaconv.hira2kata(string)
    if is_katakana(string):
        return jaconv.kata2hira(string)
    return string
Пример #13
0
    def search_playlist(self, mode: str, text: str) -> List[dict]:
        """Search playlists by name or id.

        Main playlists are searched first; sub playlists are consulted only
        when no main playlist matched.  Returns the matching playlist dicts.
        """
        if self.case_insensitive:
            # Fold case and katakana so matching is script/case agnostic.
            text = jaconv.kata2hira(text.casefold())
        if self.convert_kanji:
            text = self.bot.converter.do(text)

        result = []

        def find(playlist):
            # Append playlist to result when it matches mode/text.
            if mode == 'name':
                if self.case_insensitive:
                    name = jaconv.kata2hira(playlist['name'].casefold())
                else:
                    name = playlist['name']
                if self.convert_kanji:
                    name = self.bot.converter.do(name)
                if text in name:
                    result.append(playlist)
            elif mode == 'id':
                if text in playlist['id'].casefold():
                    result.append(playlist)

        for playlist in self.main_playlists.values():
            find(playlist)
        if len(result) == 0:
            # Fall back to sub playlists only when nothing matched above.
            for playlist in self.sub_playlists.values():
                find(playlist)

        return result
Пример #14
0
def search_members(q_info):
    """Find Member rows (and keyword-linked members) matching a query.

    The query is normalized step by step (NFKC, katakana to hiragana,
    lowercase, latin to kana via otapick) and then used as an anchored,
    case-insensitive regex prefix against the member name columns and
    against MemberKeyword entries.

    Args:
        q_info: Mapping with the raw query text under the 'text' key.

    Returns:
        A Member queryset when anything matched, otherwise None.
    """
    import re

    _text = q_info['text']

    # 全角 ⇒ 半角 & normalize. ex) 'kAげヤmay' => 'kAげヤmay'
    cleaned_text = jaconv.normalize(_text, 'NFKC')

    # Katakana => hiragana. ex) 'kAげヤmay' => 'kAげやmay'
    cleaned_text = jaconv.kata2hira(cleaned_text)

    # Upper => lower. ex) 'kAげやmay' => 'kaげやmay'
    cleaned_text = cleaned_text.lower()

    # Latin => hiragana.  The converted (or partially converted) text is
    # used whether or not the conversion succeeded, so the old
    # is_success branch was dead and has been collapsed.
    cleaned_text = otapick.alphabet2kana(cleaned_text)['text']

    # Escape regex metacharacters.  re.escape covers everything the old
    # hand-written tuple did (\ * + . ? { } ( ) [ ] ^ $ - |); the tuple's
    # extra '/' is not special in a regex, so matching is unchanged.
    cleaned_text = re.escape(cleaned_text)

    matched_members = Member.objects.filter(
        Q(full_kana__iregex=r'^%s' % cleaned_text)
        | Q(first_kana__iregex=r'^%s' % cleaned_text)
        | Q(full_kanji__iregex=r'^%s' % cleaned_text)
        | Q(first_kanji__iregex=r'^%s' % cleaned_text)
        | Q(full_eng__iregex=r'^%s' % cleaned_text)
        | Q(first_eng__iregex=r'^%s' % cleaned_text))

    matched_member_keywords = MemberKeyword.objects.filter(
        keyword__iregex=r'^%s' % cleaned_text)

    # Merge in members reached via keyword matches, deduplicated.
    if matched_member_keywords.count() > 0:
        matched_member_pk_list = [
            matched_member.pk for matched_member in matched_members
        ]
        matched_keyword_member_pk_list = [
            matched_member_keyword.member.pk
            for matched_member_keyword in matched_member_keywords
        ]
        member_pk_list = list(
            set(matched_member_pk_list + matched_keyword_member_pk_list))
        members = Member.objects.filter(pk__in=member_pk_list)
    else:
        members = matched_members

    # Preserve the original contract: None when nothing matched.
    return members if members.exists() else None
Пример #15
0
def test_jaconv():
    """Time and log each jaconv conversion over the shared test cases."""
    logging.info("=========================================")
    logging.info("=               jaconv                  =")
    logging.info("=========================================")
    test_cases = get_test_cases()
    for tc in test_cases:
        title = tc['title']
        body = tc['body']

        logging.info("ひらがな(全角) to カタカナ(全角) for %s" % title)
        calc_time(jaconv.hira2kata, body)
        # BUG FIX: this heading previously logged hira2hkata's result
        # (full-width hiragana -> HALF-width katakana) by copy-paste.
        logging.debug("result: %s" % jaconv.hira2kata(body))

        logging.info("カタカナ(全角) to ひらがな(全角) for %s" % title)
        calc_time(jaconv.kata2hira, body)
        logging.debug("result: %s" % jaconv.kata2hira(body))

        logging.info("ひらがな(全角) to カタカナ(半角) for %s" % title)
        calc_time(jaconv.hira2hkata, body)
        logging.debug("result: %s" % jaconv.hira2hkata(body))

        logging.info("半角 to 全角 for %s" % title)
        calc_time(jaconv.h2z, body)
        logging.debug("result: %s" % jaconv.h2z(body))

        logging.info("全角 to 半角 for %s" % title)
        calc_time(jaconv.z2h, body)
        logging.debug("result: %s" % jaconv.z2h(body))
Пример #16
0
def make(circle, lost):
    """Fetch brand (furigana, name) rows from ErogameScape as a dictionary.

    Args:
        circle: Include doujin circles when True; restrict to corporations
            when False.
        lost: Include defunct brands when True.

    Returns:
        str: Tab-separated "furigana<TAB>name<TAB>固有名詞" lines with the
        furigana folded to hiragana, suitable for a user dictionary.
    """
    # Build the SQL from the two boolean filters.  Only fixed string
    # fragments are concatenated, so there is no injection surface here.
    if circle and lost:
        basetext = "SELECT brandfurigana,brandname FROM brandlist "
    else:
        basetext = "SELECT brandfurigana,brandname FROM brandlist WHERE "
        if circle == False:
            basetext += "kind = 'CORPORATION' "
        if lost == False:
            if circle == False:
                basetext += "AND "
            basetext += "lost = 'FALSE' "
    basetext += "ORDER BY brandfurigana"
    print(basetext)
    url = "http://erogamescape.dyndns.org/~ap2/ero/toukei_kaiseki/sql_for_erogamer_form.php"
    s = requests.session()
    payload = {'sql': basetext}
    r = s.post(url, data=payload)
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    text = ""
    for i, row in enumerate(table.find_all('tr')):
        # Skip the header row.
        if i == 0:
            continue
        data = row.find_all('td')
        text += "{furigana}\t{name}\t固有名詞\n".format(furigana=jaconv.kata2hira(
            data[0].string),
                                                    name=data[1].string)
    return text
Пример #17
0
def text2hiragana(text):
    text = unicodedata.normalize("NFKC", text)
    text = normalize_neologd(text)
    text = text.replace(' ', ' ')

    parsed = mecab.parse(text).split('\n')
    parsed = [p.split('\t') for p in parsed]
    way_of_readings = [_special_char_convert(p[1], p[3], idx==0)
                       for idx, p in enumerate(parsed) if len(p) >= 2]
    way_of_reading = "".join(way_of_readings)

    way_of_reading = _num2word(way_of_reading)
    way_of_reading = kakasi_converter.do(way_of_reading)
    way_of_reading = jaconv.kata2hira(way_of_reading)
    way_of_reading = way_of_reading.replace(' ', ' ')
    way_of_reading = re.sub(r'[^ぁ-ゔ。、!?ー\.\!\?,\s]', '', way_of_reading)
    way_of_reading = re.sub(r'\s{2,}', ' ', way_of_reading)
    way_of_reading = re.sub(r'^\s', '', way_of_reading)

    if len(way_of_reading) == 0:
        way_of_reading = '.'
    if way_of_reading[-1] != '.':
        way_of_reading = way_of_reading + '.'
    if way_of_reading[0] != ' ':
        way_of_reading = ' ' + way_of_reading

    return way_of_reading
Пример #18
0
    def _get_date_range_from_jp_era(self, jp_era: str) -> dict:
        """Look up era data by a Japanese era name.

        Args:
            jp_era (str): Era name in kanji, hiragana, katakana or English.

        Returns:
            dict: The matching entry of self.data_dic, or None when no
            entry matches (callers must handle the None case despite the
            dict annotation).
        """
        l_type = self._check_language(jp_era)

        data = None
        for key, value in self.data_dic.items():
            reading = value["reading"]

            reading_jp = reading["jp"]
            reading_en = reading["en"]

            if l_type == "kanji":
                # Kanji era names are the dictionary keys themselves.
                if jp_era == key:
                    data = value
            elif l_type == "katakana" or l_type == "hiragana":
                # Fold katakana input to hiragana before comparing.
                jp_era = jaconv.kata2hira(jp_era)
                if jp_era == reading_jp:
                    data = value
                    break
            elif l_type == "english":
                if jp_era == reading_en:
                    data = value
                    break
            else:
                # Unknown language type: give up immediately.
                break
        return data
Пример #19
0
    def __validate_with_janome(self, kanji, yomi):
        """Cross-check *yomi* against janome's reading of *kanji*.

        Returns an error-message string when janome's reading suggests the
        kanji contains extra characters, or None when the pair looks valid
        (or cannot be validated).
        """
        # Specially exclude names that janome badly over-reads, e.g.
        # しくらちよまる /志倉千代丸/ read as 「こころざしくらちよまる」:
        # きしなみかお /岸波香桜/ -> *きしなみかお*りさくら
        # くらちれお /倉知玲鳳/ -> *くらちれお*おとり
        for c in ['志', '香', '鳳']:
            if c in kanji:
                return None

        # Janome reading, falling back to base_form for unknown ('*') tokens.
        janome_yomi = jaconv.kata2hira(''.join(
            [n.reading if str(n.reading) != '*' else n.base_form for n in self.tokenizer.tokenize(kanji)]))
        normalized_janome_yomi = normalize_hiragana(janome_yomi)
        normalized_yomi = normalize_hiragana(yomi)

        self.logger.debug(f"yomi={yomi} normalized_yomi={normalized_yomi}, janome_yomi={janome_yomi},"
                          f" normalized_janome_yomi={normalized_janome_yomi}")
        if normalized_yomi in normalized_janome_yomi:
            extra = len(re.sub(normalized_yomi, '', normalized_janome_yomi, 1))

            # The threshold is arbitrary — it rescues cases like 愛植男 =
            # あいうえお, which janome reads as あいうえおとこ.
            # NOTE(review): the original comment said "2" but the code
            # tolerates up to 3 extra characters.
            if extra > 3:
                return f"kanji may contain extra chars(janome): janome_yomi={janome_yomi}"
            else:
                return None

        return None
Пример #20
0
    def romaji_word(self, word):
        """Word is a fugashi node, return a string"""

        # Hard-coded exception list wins over everything else.
        if word.surface in self.exceptions:
            return self.exceptions[word.surface]

        # Digits and pure-ASCII tokens pass through unchanged.
        if word.surface.isdigit():
            return word.surface

        if isascii(word.surface):
            return word.surface

        if word.feature.pos1 == '補助記号':
            # Punctuation: map through the symbol table.
            # NOTE(review): raises KeyError for symbols missing from
            # self.table — a sibling implementation uses .get(..., '').
            return self.table[word.surface]
        elif (self.use_wa and word.feature.pos1 == '助詞'
              and word.feature.pron == 'ワ'):
            # Topic particle は is romanized "wa", not "ha".
            return 'wa'
        elif (not self.use_he and word.feature.pos1 == '助詞'
              and word.feature.pron == 'エ'):
            # Direction particle へ -> "e".
            return 'e'
        elif (not self.use_wo and word.feature.pos1 == '助詞'
              and word.feature.pron == 'オ'):
            # Object particle を -> "o".
            return 'o'
        elif (self.use_foreign_spelling and has_foreign_lemma(word)):
            # this is a foreign word with known spelling
            return word.feature.lemma.split('-')[-1]
        elif word.feature.kana:
            # for known words
            kana = jaconv.kata2hira(word.feature.kana)
            return self.map_kana(kana)
        else:
            # Fallback; unclear when this is actually reached.
            return word.surface
Пример #21
0
def split_furigana(text):
    """Tokenize *text* with MeCab and attach furigana to kanji tokens.

    Returns a list of tuples: (surface, hiragana) pairs for tokens that
    contain kanji (split per okurigana via split_okurigana), and 1-tuples
    (surface,) for everything else.

    MeCab has a problem if used inside a generator (use yield instead of
    return).  The error message is:
    ```
    SystemError: <built-in function delete_Tagger> returned a result with an error set
    ```
    It seems like MeCab has a bug in releasing resources.
    """
    mecab = MeCab.Tagger("-Ochasen")
    mecab.parse('')  # must parse the empty string once first (MeCab quirk)
    node = mecab.parseToNode(text)
    ret = []

    while node is not None:
        origin = node.surface  # the original token text
        if not origin:
            # BOS/EOS and similar nodes have no surface; skip them.
            node = node.next
            continue

        # origin is guaranteed non-empty here, so the original's redundant
        # `origin != ""` checks have been removed.
        if any(is_kanji(_) for _ in origin):
            # Token contains kanji: take the katakana reading, fold it to
            # hiragana, and split it across the okurigana.
            kana = node.feature.split(",")[7]
            hiragana = jaconv.kata2hira(kana)
            for pair in split_okurigana(origin, hiragana):
                ret += [pair]
        else:
            # No kanji: no furigana needed, emit the surface as-is.
            ret += [(origin, )]
        node = node.next
    return ret
Пример #22
0
def get_song_meta(song):
    """Build the client-facing metadata dict for *song*.

    Returns localized metadata plus a "search" list of normalized strings
    (including a romanized, dash-stripped kana variant) and a "sort" key.
    Relies on the request global for locale ("lc") and latin preferences.
    """
    d = {}
    search = set()
    for k, v in song.meta.items():
        if request.latin:
            d[k] = v[(request.lc, "l")]
        else:
            d[k] = v[request.lc]
    for k in ("title", "artist", "seenon", "album"):
        if k in song.meta:
            v = song.meta[k]
            search.add(normalize(v[request.lc]))
            search.add(normalize(v["k"]))
            search.add(normalize(v["l"]))
            # Romanized kana reading with prolonged-sound marks removed, so
            # ASCII queries can match kana titles.
            search.add(
                normalize(jaconv.kana2alphabet(jaconv.kata2hira(
                    v["k"]))).replace("ー", ""))
    for k in ("genre", ):
        if k in song.meta:
            v = song.meta[k]
            search.add(normalize(v[request.lc]))
    d["search"] = list(search)
    if request.latin:
        d["sort"] = song.meta["title"][(request.lc, "l")]
    else:
        d["sort"] = song.meta["title"][(request.lc, "k")]
    return d
Пример #23
0
def write_out_textures(row, filename):
    """Count texture-word frequencies and append them to a CSV file.

    Args:
        row: Iterable of (word, count) pairs.
        filename: CSV file appended to; a "word,count" header is written
            on every call (pre-existing behavior of the 'a' open mode).

    Katakana-only words are merged into their hiragana spelling when both
    forms appear; entries whose count drops to zero are removed.
    """
    dic = {value[0]: value[1] for value in row}

    re_katakana = re.compile(r'[\u30A1-\u30F4]+')

    # Merge katakana entries into their hiragana counterparts.  Iterate a
    # snapshot so mutating the dict values mid-loop is clearly safe.
    for key, value in list(dic.items()):
        if re_katakana.fullmatch(key):
            kana_to_hira = jaconv.kata2hira(key)
            if kana_to_hira in dic:
                dic[kana_to_hira] += value
                dic[key] = 0

    # Drop the zeroed-out katakana duplicates.
    dic = {k: v for k, v in dic.items() if v > 0}

    with codecs.open(filename, 'a', 'utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(['word', 'count'])
        for texture, count in dic.items():
            writer.writerow([texture, count])
Пример #24
0
def regularize_text(text):
    """Normalize digits/punctuation to half-width and fold to a reading.

    Args:
        text (Unicode): String that may contain Japanese.

    Returns:
        Unicode: The regularized string.
    """
    # Replacement table, applied both before and after the reading pass
    # (the MeCab/kakasi step can reintroduce full-width marks).
    pairs = [
        ["0", "0"],
        ["1", "1"],
        ["2", "2"],
        ["3", "3"],
        ["4", "4"],
        ["5", "5"],
        ["6", "6"],
        ["7", "7"],
        ["8", "8"],
        ["9", "9"],
        [",", ","],
        ["、", ","],
        [".", "."],
        ["。", "."],
        [":", ":"],
        # ["'", " "],
        # ['"', " "],
        ["払金", ""],
        ["試着", ""],
        # ["金", ""],
        ["々", ""],
        ["”", " "],
        ["“", " "],
        ["(", " "],
        [")", " "],
        ["『", "「"],
        ["』", "」"],
        # NOTE(review): maps opening 「 to closing 」 — confirm intentional.
        ["「", "」"],
        ["」", "」"],
        ["/", "/"],
        ["!", "!"],
        ["?", "?"],
        ["●", "まる"],
        ["~", ""],
        ["〜", ""],
        ["…", ""],
        ["…", ""],
        ["《", ""],
        ["》", ""],
    ]
    for p in pairs:
        text = text.replace(p[0], p[1])
    # "-Oyomi" makes MeCab emit the (katakana) reading of the input.
    mecab = MeCab.Tagger("-Oyomi")
    text = mecab.parse(text).rstrip()
    text = kata2hira(text)
    text = mojimoji.zen_to_han(text)
    for p in pairs:
        text = text.replace(p[0], p[1])
    return text
Пример #25
0
 def type_by_user(self, user_word):
     """Accept the user's shiritori word if it chains from the computer's word."""
     # Fold katakana so the comparison is hiragana-only.
     user_word = jaconv.kata2hira(user_word)
     # Effective last character of the computer's previous word.
     last_chr = self.correct(self.com_word)
     if last_chr == user_word[0] and self.judge_last_char(user_word):
         self.user_word = user_word
         self.used_words.append(user_word)
     else:
         # Invalid move: clear the user's word.
         self.user_word = ""
Пример #26
0
def basic_preprocess(text):
    """Normalize Japanese text for matching.

    Digits/latin go to half-width, kana to full-width, katakana to
    hiragana, and the result is lowercased.
    """
    # Digits and ASCII to half-width; kana left alone in this pass.
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    # Half-width kana to full-width.
    text = jaconv.h2z(text, kana=True, digit=False, ascii=False)
    # Katakana to hiragana, then lowercase.
    return jaconv.kata2hira(text).lower()
Пример #27
0
def get_hiragana(word: str) -> str:
    """Convert the katakana in a string to hiragana.

    Args:
        word (str): String to convert.

    Returns:
        str: The string with katakana folded to hiragana.
    """
    return jaconv.kata2hira(word)
Пример #28
0
def _get_normalized_value(target_value, rule):
    """Apply the normalizations enabled in *rule* to *target_value*.

    Each rule flag toggles one transform; they are applied in a fixed
    order: full/half-width, katakana-to-hiragana, english-to-kun reading,
    then lowercasing.
    """
    pipeline = (
        ('match_zen_han', _normalize2zen),
        ('match_kata_hira', jaconv.kata2hira),
        ('match_eng_jpn', _normalize2kun),
        ('case_insensitive', lambda s: s.lower()),
    )
    value = target_value
    for flag, transform in pipeline:
        if rule.get(flag, False):
            value = transform(value)
    return value
Пример #29
0
    def search_style(self, id: str, text: str) -> List[dict]:
        """Return the styles of cosmetic *id* whose name contains *text*.

        Both the query and each candidate name are case-folded /
        kana-folded and optionally kanji-converted before comparing.
        """
        if self.case_insensitive:
            text = jaconv.kata2hira(text.casefold())
        if self.convert_kanji:
            text = self.bot.converter.do(text)

        matches = []
        for style in self.get_style(id):
            candidate = style['name']
            if self.case_insensitive:
                candidate = jaconv.kata2hira(candidate.casefold())
            if self.convert_kanji:
                candidate = self.bot.converter.do(candidate)
            if text in candidate:
                matches.append(style)

        return matches
Пример #30
0
def convert_file(src, dst):
    """Convert a tab-separated word list into a UTF-16 user dictionary.

    Each input line is ``word[\\t reading ...]``.  Lines without readings
    get one derived by folding the word's katakana to hiragana.  Output
    lines are ``reading\\tword\\t固有名詞`` with CRLF endings.
    """
    with open(src, encoding="utf-8") as srcf, \
         open(dst, "w", encoding="utf-16", newline="\r\n") as dstf:
        for raw in srcf:
            fields = raw.rstrip().split("\t")
            word = fields[0]
            if len(fields) > 1:
                readings = fields[1:]
            else:
                # No explicit reading: derive it from the word itself.
                readings = [jaconv.kata2hira(word)]
            for reading in readings:
                dstf.write(f"{reading}\t{word}\t固有名詞\n")
Пример #31
0
def normalize_word(word):
    """
    For identify same word
    (katakana folded to hiragana, result lowercased)
    """
    return jaconv.kata2hira(word).lower()
Пример #32
0
def test_kata2hira():
    # Basic conversion: katakana folds to hiragana, kanji untouched.
    assert_equal(jaconv.kata2hira('巴マミ'), '巴まみ')
    # Characters listed in `ignore` are left as-is.
    assert_equal(jaconv.kata2hira('マミサン', ignore='ン'), 'まみさン')
    # Full sweep over the katakana/hiragana tables.
    _compare(jaconv.kata2hira, FULL_KANA, HIRAGANA)
Пример #33
0
 def reading_hira(self):
     """Return this entry's reading with katakana folded to hiragana."""
     hira_reading = kata2hira(self.reading)
     return hira_reading