def preprocessing(text: str) -> str:
    """Normalise *text* before further processing.

    The steps run in this order:

    1. full-width to half-width conversion with ``mojimoji.zen_to_han``
       (called with ``kana=False``),
    2. every run of digits is rewritten with ``kanjize.int2kanji``,
    3. every run of ASCII letters made up only of ``w``/``W`` is removed,
    4. the characters matched by the two symbol patterns are removed,
    5. every character found in ``emoji.UNICODE_EMOJI['en']`` is removed.

    Returns the cleaned string.
    """

    def _digits_to_kanji(match):
        # number -> kanji
        return kanjize.int2kanji(int(match.group(0)))

    def _drop_w_only_run(match):
        # A letter run that is nothing but "w" is dropped; any other
        # letter run is kept unchanged.
        letters = match.group(0)
        if re.match(r'^w+$', letters, re.IGNORECASE):
            return ''
        return letters

    # full-width -> half-width
    half_width = mojimoji.zen_to_han(text, kana=False)
    with_kanji_numbers = re.sub(r'\d+', _digits_to_kanji, half_width)
    without_w_runs = re.sub(
        r'[a-z]+', _drop_w_only_run, with_kanji_numbers, flags=re.IGNORECASE)

    # remove symbolic characters
    cleaned = re.sub(
        r'[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]', '', without_w_runs)
    cleaned = re.sub(r'[!-/:-@[-`{、。”’・ ]', '', cleaned)

    # remove emoji
    kept = [ch for ch in cleaned if ch not in emoji.UNICODE_EMOJI['en']]
    return ''.join(kept)
def to_katakana(sentence, rm_ltu=False):
    """Convert *sentence* into a string of kana readings.

    Args:
        sentence: The sentence to convert.
        rm_ltu: When true, every katakana small tsu ('ッ') is removed from
            the result before small kana are normalised.

    Returns:
        The concatenated readings of the tokens produced by ``t.tokenize``
        (``t`` is a tokenizer defined elsewhere in this file), with small
        kana replaced by their full-size katakana and 'ヂ' replaced by 'ジ'.
    """
    # Digits -> kanji numerals.  Every digit run is converted as a whole,
    # wherever it occurs.  The previous loop used re.match(), which only
    # looks at the start of the string, and str.replace(), which also
    # rewrote the same digits inside longer numbers.
    sentence = re.sub(r'\d+', lambda m: int2kanji(int(m.group())), sentence)

    # Morphological analysis
    pieces = []
    for token in t.tokenize(sentence):
        reading = token.reading
        if reading == '*':
            # Token with unknown reading: keep the surface form only when
            # it starts with a kana character.
            if re.match('[ぁ-んァ-ンー]', token.surface) is not None:
                pieces.append(token.surface)
        elif re.match('[ァ-ン]', reading) is not None:
            # Token with a known reading that starts with katakana.
            pieces.append(reading)
    katakana = ''.join(pieces)

    if rm_ltu:
        katakana = katakana.replace('ッ', '')

    # Small kana (hiragana and katakana) -> full-size katakana, ヂ -> ジ.
    # No target character is also a source character, so one translate()
    # pass gives the same result as the chained replace() calls.
    small = 'ぁぃぅぇぉっゃゅょゎァィゥェォッャュョヮヂ'
    large = 'アイウエオツヤユヨワアイウエオツヤユヨワジ'
    return katakana.translate(str.maketrans(small, large))
def Ditital2Hiragana(sentence):
    """Replace every run of digits in *sentence* with its hiragana reading.

    Each digit run is converted to an integer, turned into kanji numerals
    with ``int2kanji`` and then into hiragana with ``kanji2hiragana``.

    Args:
        sentence: The text to rewrite.

    Returns:
        The rewritten text.  A sentence without digits is returned
        unchanged.
    """

    def _to_hiragana(match):
        kanji = int2kanji(int(match.group()))
        # NOTE(review): `self` is not a parameter of this function, so this
        # lookup only works if a module-level `self` exists or the function
        # is really meant to be a method -- confirm against the caller.
        return self.kanji2hiragana(kanji)

    # The pattern used to be written with yen signs instead of backslashes
    # and therefore never matched a digit.  Substituting the matched text
    # itself (instead of str(int(...))) also keeps leading zeros and other
    # numbers that merely contain the same digits intact.
    return re.sub(r'\d+', _to_hiragana, sentence)
def parse_xlsx(number, data):
    """Return the spreadsheet rows whose number column mentions *number*.

    The sheet is loaded with ``pd.read_excel(data, header=HEADER_ROW)``.
    A row is selected when the ``Header.number`` column contains any of
    three spellings of the number: its decimal digits, the ``int2kanji``
    form, or the digit-by-digit kanji form (e.g. "12" -> "一二").

    Args:
        number: The number to look for; must be convertible with ``int()``
            and its ``str()`` form must consist of decimal digits.
        data: Anything ``pd.read_excel`` accepts as its first argument.

    Returns:
        The filtered ``DataFrame``.
    """
    frame = pd.read_excel(data, header=HEADER_ROW)

    spoken = int2kanji(int(number))
    positional = "".join(
        ['〇一二三四五六七八九'[int(digit)] for digit in str(number)])

    # The three spellings become one regular-expression alternation.
    pattern = "|".join((str(number), spoken, positional))

    matching = frame[Header.number].str.contains(pattern, na=False)
    return frame[matching]
def English_to_Kana(str, fname):
    """Build a ``"<input>,<reading>"`` line for *str*, or return None.

    Alphabetic words, numbers and symbols in the stripped input are
    replaced step by step, and the result is passed through ``conv.do``.

    Args:
        str: The input text (note: the name shadows the builtin ``str``
            inside this function).
        fname: Selects the word-handling branch; the first branch is used
            only when it equals ``"EngDict_"`` and the text contains at
            least one alphabetic word.

    Returns:
        ``str + "," + conv.do(read)`` when the working text ``read`` is
        non-empty after all replacements, otherwise ``None``.

    ``alphabet``, ``Number``, ``Symbols``, ``conv`` and ``int2kanji`` are
    defined elsewhere; ``alphabet``/``Number``/``Symbols`` are presumably
    dict-like lookup tables -- TODO confirm.
    """
    read = str.strip()
    english = re.compile('[a-zA-Z]+')
    words = english.findall(read)
    if((len(words) >= 1) & (fname == "EngDict_")):
        for w in words:
            if(len(w) == 1):
                # Single letter: replace it with the lookup for its
                # uppercase form.
                furigana = alphabet.get(w.upper())
                read = read.replace(w, furigana)
            else:
                # Count the uppercase ASCII letters in the word.
                count = 0
                for i in range(len(w)):
                    if (w[i] >= "A") & (w[i] <= "Z"):
                        count+=1
                # At least half uppercase: spell the word letter by letter.
                if(count >= len(w)/2):
                    for i in range(len(w)):
                        furigana = alphabet.get(w[i].upper())
                        # `|` is a non-short-circuit OR: the replace runs
                        # when the letter is still in `read` or the lookup
                        # succeeded.
                        if((w[i] in read) | (furigana != None)):
                            read = read.replace(w[i], furigana)
                # Disabled alternative for the remaining words:
                # else:
                #     e2k = etok.EnglishToKana()
                #     furigana = e2k.convert(w)
                #     if(furigana != "ERROR 辞書にありません"):
                #         read = read.replace(w, furigana)
    # Dictionary creation for the English-translation shiritori mode
    else:
        if(len(words) >= 1):
            for w in words:
                if(len(w) == 1):
                    furigana = alphabet.get(w.upper())
                    read = read.replace(w, furigana)
                else:
                    # Same uppercase count as above, written as a
                    # conditional increment.
                    count = 0
                    for i in range(len(w)):
                        count+=1 if (w[i] >= "A") & (w[i] <= "Z") else 0
                    if(count >= len(w)/2):
                        for i in range(len(w)):
                            furigana = alphabet.get(w[i].upper())
                            if((w[i] in read) | (furigana != None)):
                                read = read.replace(w[i], furigana)
                    else:
                        # Fewer than half uppercase: blank the working
                        # text (an empty `read` at the end returns None).
                        read = ""
    # Numbers: an integer part with an optional "." and fraction digits.
    numbers = re.findall(r'[0-9]+\.?[0-9]*', read)
    if(len(numbers) >= 1):
        for n in numbers:
            if("." in n):
                # Decimal: handled character by character; "." and "0" go
                # through the Number table, other digits through int2kanji.
                for i in range(len(n)):
                    if((n[i] == ".") | (n[i] == "0")):
                        furigana = Number.get(n[i])
                    else:
                        furigana = int2kanji(int(n[i]))
                    # Replaces every occurrence of this character in `read`.
                    read = read.replace(n[i], furigana)
            else:
                # Integer: converted as a whole.
                furigana = int2kanji(int(n))
                read = read.replace(n, furigana)
    # Replace each key of Symbols with its value (the membership test is
    # always true for keys being iterated).
    for s in Symbols.keys():
        if(s in Symbols):
            read = read.replace(s, Symbols[s])
    if(read != ""):
        read = conv.do(read)
        # Disabled check on the first/last character being hiragana:
        # if((read[0] >= "あ") & (read[0] <= "ん") | (read[-1] >= "あ") & (read[-1] <= "ん")):
        # conv.do is applied a second time to the already converted text.
        return str+","+conv.do(read)
        # else:
        #     return None
    else:
        return None
def n575_detector(text):
    """Collect 17-mora stretches of *text* that qualify as haiku candidates.

    The text is tokenised with MeCab using the ``mecab-ipadic-neologd``
    dictionary located through ``mecab-config --dicdir``.  A sliding window
    over the tokens keeps a running total of ``yomi_to_mora`` values
    (``yomi_to_mora`` is defined elsewhere; presumably it returns the mora
    count of a reading -- TODO confirm).  Whenever the total is exactly 17,
    the window starts with a noun that is not "非自立", and the current
    token is a verb, adjective, adjectival noun, noun or auxiliary verb,
    the window text is printed and recorded.

    Args:
        text: The text to scan.  Spaces and newlines are removed first.

    Returns:
        A list with the text of every recorded window, in order of
        discovery.
    """

    def _drop_prefix(s, prefix):
        # str.lstrip() strips a *set of characters*, not a prefix, so it
        # could eat more than the departing token; slice instead.
        return s[len(prefix):] if s.startswith(prefix) else s

    cmd = 'echo `mecab-config --dicdir`"/mecab-ipadic-neologd"'
    path = (subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             shell=True).communicate()[0]).decode('utf-8')
    m = MeCab.Tagger("-d {0}".format(path))

    text = text.replace(' ', '')
    # The literal used to contain a yen sign instead of a backslash, so
    # newlines were never removed.
    text = text.replace('\n', '')

    # Each entry is [surface, feature0, feature1, ...].
    node = m.parseToNode(text)
    all_list = []
    while node:
        features = node.feature.split(",")
        if features[0] != "BOS/EOS":
            surface_parts = node.surface.split()
            try:
                int(surface_parts[0])
            except ValueError:
                all_list.append(surface_parts + features)
            else:
                # MeCab cannot read digits, so convert the number to kanji
                # numerals first and use the features of that parse.
                number_parsed = m.parse(
                    int2kanji(int(surface_parts[0]))).split()
                tmp = number_parsed[1].split(",")
                tmp.insert(0, surface_parts[0])
                all_list.append(tmp)
        node = node.next

    now_mora = 0
    haiku = ""
    first_index = 0
    haiku_list = []
    for i in all_list:
        if i[1] == "記号":
            # Symbols are kept in the text but do not count as mora.
            haiku += i[0]
            continue
        if i[7] == "*" and len(i) != 10:
            # No reading available: the token is left out entirely.
            continue
        now_mora += yomi_to_mora(i[9])
        haiku += i[0]

        # Shrink the window from the left until it fits in 17 mora.
        while now_mora > 17:
            first = all_list[first_index]
            # Always advance; the old code hit `continue` without moving
            # on for reading-less tokens and looped forever.
            first_index += 1
            if first[1] == "記号":
                haiku = _drop_prefix(haiku, first[0])
            elif first[7] == "*" and len(first) != 10:
                # Never added to `haiku`, so there is nothing to remove.
                continue
            else:
                now_mora -= yomi_to_mora(first[9])
                haiku = _drop_prefix(haiku, first[0])

        if now_mora == 17:
            first = all_list[first_index]
            end = i
            if first[1] == "名詞" and first[2] != "非自立":
                if end[1] in ("動詞", "形容詞", "形容動詞", "名詞", "助動詞"):
                    print("n575: " + haiku)
                    haiku_list.append(haiku)
    return haiku_list
def testint2kanji(self):
    """Check ``int2kanji`` for the default style and for ``style="mixed"``."""
    one_gogasha = 10000000000000000000000000000000000000000000000000000
    all_nines = 999999999999999999999999999999999999999999999999999999999999999999999999

    # Default style; these checks carry the failure message "all".
    with_message = (
        (1, "一"),
        (10, "十"),
        (11, "十一"),
        (111, "百十一"),
        (211, "二百十一"),
        (121, "百二十一"),
        (11, "十一"),
        (111, "百十一"),
    )
    for number, expected in with_message:
        self.assertEqual(int2kanji(number), expected, "all")

    # Default style, no failure message.
    plain = (
        (211, "二百十一"),
        (121, "百二十一"),
        (1000, "千"),
        (1001, "千一"),
        (2025, "二千二十五"),
        (58076099, "五千八百七万六千九十九"),
        # added 1.0.0
        (one_gogasha, "一恒河沙"),
        (all_nines,
         ("九千九百九十九無量大数九千九百九十九不可思議九千九百九十九那由多九千九百九十九阿僧祇九千九百九十九恒河沙九千九百九十九極"
          "九千九百九十九載九千九百九十九正九千九百九十九澗九千九百九十九溝九千九百九十九穣九千九百九十九𥝱九千九百九十九垓"
          "九千九百九十九京九千九百九十九兆九千九百九十九億九千九百九十九万九千九百九十九")),
    )
    for number, expected in plain:
        self.assertEqual(int2kanji(number), expected)

    # Mixed style: (expected, number, extra keyword arguments).
    mixed = (
        ("1", 1, {}),
        ("10", 10, {}),
        ("11", 11, {}),
        ("121", 121, {}),
        ("1千", 1000, {}),
        ("1001", 1001, {}),
        ("2025", 2025, {}),
        ("5807万6099", 58076099, {}),
        ("223兆4235億4256万6千", 223423542566000, {}),
        ("5千京", 50000000000000000000, {}),
        ("5000京", 50000000000000000000, {"kanji_thousand": False}),
        ("39京4385兆4895万", 394385000048950000, {}),
        ("223兆4千億4256万6千", 223400042566000, {}),
        ("223兆4000億4256万6000", 223400042566000,
         {"kanji_thousand": False}),
        # added 1.0.0
        ("1恒河沙", one_gogasha, {"kanji_thousand": False}),
        ("9999無量大数9999不可思議9999那由多9999阿僧祇9999恒河沙9999極9999載9999正9999澗9999溝9999穣9999𥝱9999垓9999京9999兆9999億9999万9999",
         all_nines, {}),
    )
    for expected, number, options in mixed:
        self.assertEqual(expected, int2kanji(number, style="mixed", **options))
def __init__(self, title=None, number=None, sentence=None, children=None):
    """Set up the node, generating a title from *number* when none is given.

    Args:
        title: Title passed to the parent initialiser.  When it is falsy
            and *number* is truthy, '第{}項' filled with the ``int2kanji``
            form of *number* is used instead.
        number: Stored on ``self.number``; a falsy value is stored as 1.
        sentence: Stored on ``self.sentence``; a falsy value is stored
            as ''.
        children: Passed through to the parent initialiser.
    """
    needs_generated_title = (not title) and number
    if needs_generated_title:
        kanji_number = int2kanji(int(number))
        title = '第{}項'.format(kanji_number)

    super().__init__(title, children)

    self.number = number or 1
    self.sentence = sentence or ''