def convert_JapaneseYear_to_CommonEra(self, warekiYear):
    """Convert a Japanese-era (wareki) date into a ``YYYYMMDD`` string.

    The first ``<era><year>年<month>月<day>日`` occurrence found in
    *warekiYear* is used.  ``元`` is accepted as year 1, and digits may be
    ASCII or full-width.

    Args:
        warekiYear: Text containing a date such as ``令和2年4月1日``.

    Returns:
        The Western year followed by the zero-padded month and day.

    Raises:
        AttributeError: If no era date is found (``search`` returns None).
        ValueError: If the year field cannot be parsed as an integer.
    """
    # Era year N corresponds to Western year (base + N).
    era_base = {
        '明治': 1867,
        '大正': 1911,
        '昭和': 1925,
        '平成': 1988,
        '令和': 2018,
    }
    # The character class previously listed the ASCII range twice, so
    # full-width digits never matched even though every captured number is
    # passed through z2h(digit=True).  Spell the full-width range explicitly.
    digits = '0-9\uff10-\uff19'
    pattern = re.compile(
        '^(|.+)(明治|大正|昭和|平成|令和)(|\u3000| )([元' + digits + ']+)年'
        '(|\u3000| )([' + digits + ']+)月'
        '(|\u3000| )([' + digits + ']+)日(|.+)$',
        re.MULTILINE)
    matches = pattern.search(warekiYear)
    era_name = matches.group(2)
    year = matches.group(4)
    month = jaconv.z2h(matches.group(6), digit=True).zfill(2)
    day = jaconv.z2h(matches.group(8), digit=True).zfill(2)

    if year == '元':
        year = 1
    else:
        if sys.version_info < (3, 0):
            # Python 2: the captured group is a byte string.
            year = year.decode('utf-8')
        year = int(jaconv.z2h(year, digit=True))

    return str(year + era_base[era_name]) + month + day
def class_info(tr):
    """Parse one timetable row into a dict, or return None if it is unusable.

    Cells are read by position: 0 = year, 2 = class name and key (taken from
    the link's ``onclick`` attribute), 5 = term, 6 = weekday and period,
    7 = ``building-room``.

    Args:
        tr: Row element exposing ``find_elements_by_tag_name``
            (presumably a Selenium WebElement -- confirm against the caller).

    Returns:
        A dict with the keys parsed so far, or None as soon as any cell
        fails to parse.
    """
    d = dict()
    yobi = ['月', '火', '水', '木', '金', '土', '日']
    season = ['春', '夏', '秋', '冬']
    for i, td in enumerate(tr.find_elements_by_tag_name('td')):
        try:
            if i == 0:
                d['year'] = int(td.text)
            elif i == 2:
                d['name'] = td.text
                onclick = td.find_element_by_tag_name('a').get_attribute(
                    'onclick')
                d['key'] = onclick.split("'")[3]
            elif i == 5:
                if td.text == '通年':
                    d['term'] = '0123'
                else:
                    d['term'] = str(season.index(td.text[0]))
                if td.text[1:] == '学期':
                    # A "<season>学期" entry also covers the following term.
                    d['term'] += f'{int(d["term"]) + 1}'
            elif i == 6:
                d['weekday'] = int(yobi.index(td.text[0]))
                d['period'] = int(
                    jaconv.z2h(td.text[1], digit=True, ascii=True))
            elif i == 7:
                info = td.text.split('-')
                room_ok = info[1][:3].isdecimal() or info[1][0] == 'B'
                if not (info[0].isdecimal() and room_ok):
                    return None
                d['building'] = int(
                    jaconv.z2h(info[0], digit=True, ascii=True))
                d['room'] = jaconv.z2h(info[1][:3], digit=True, ascii=True)
        except Exception:
            # Any malformed cell makes the whole row unusable.  This was a
            # bare ``except:``, which also swallowed KeyboardInterrupt and
            # SystemExit.
            return None
    return d
def text_ins_reg(ins):
    """Normalize character widths and punctuation in Japanese text.

    Steps, in order: full-width digits to half-width; full-width thousands
    separators to ASCII commas; remaining full-width commas/periods to
    ``、``/``。``; full-width ASCII to half-width while keeping full-width
    spaces; half-width katakana to full-width; ASCII brackets and the time
    colon to full-width; list numbering kept as ``N`` plus a full-width
    period; leading whitespace removed from every line.

    Args:
        ins: Text to normalize.

    Returns:
        The normalized text.
    """
    # Full-width characters are written as escapes so that width-folding
    # tools cannot silently turn them into their ASCII counterparts.
    fw_comma, fw_period, fw_colon, fw_space = (
        '\uff0c', '\uff0e', '\uff1a', '\u3000')
    fw_lparen, fw_rparen = '\uff08', '\uff09'
    fw_lbracket, fw_rbracket = '\uff3b', '\uff3d'

    # Full-width digits -> half-width.
    ins = jaconv.z2h(ins, kana=False, ascii=False, digit=True)
    # Drop a single space or plus sign at the very start / end of the text.
    ins = re.sub(r'^[ +]|[ +]$', '', ins)
    # A full-width comma between digits is a thousands separator -> ASCII.
    # The replacement used to be the non-raw string '\1,\2', i.e. the
    # control characters \x01 and \x02 (the pattern has no capture groups).
    ins = re.sub(r'(?<=\d)' + fw_comma + r'(?=\d+)', ',', ins)
    # Unify punctuation (ordinary prose style); the scientific style would
    # map the other way round.
    ins = ins.replace(fw_comma, '、')
    ins = ins.replace(fw_period, '。')
    # Full-width ASCII -> half-width, protecting full-width spaces with a
    # placeholder so that they survive the conversion.
    ins = ins.replace(fw_space, '〓')
    ins = jaconv.z2h(ins, kana=False, ascii=True, digit=False)
    ins = ins.replace('〓', fw_space)
    # Half-width katakana -> full-width.
    ins = jaconv.h2z(ins)
    # ASCII () and [] -> full-width.
    ins = re.sub(r'\((.+?)\)', fw_lparen + r'\1' + fw_rparen, ins)
    ins = re.sub(r'\[(.+?)\]', fw_lbracket + r'\1' + fw_rbracket, ins)
    # Colon in clock times -> full-width.
    ins = re.sub(r'(\d{1,2}):(\d{2})', r'\1' + fw_colon + r'\2', ins)
    # List numbering at the start of a line keeps a full-width period,
    # whether it was turned into a kuten above or is still an ASCII period
    # followed by whitespace.
    ins = re.sub(r'^(\d{1,3})。', r'\1' + fw_period, ins,
                 flags=re.MULTILINE)
    ins = re.sub(r'^(\d{1,3})\.\s', r'\1' + fw_period, ins,
                 flags=re.MULTILINE)
    # Known limitation noted by the original author: trailing whitespace is
    # not removed; only leading whitespace of each line is.
    ins = re.sub(r'^\s+', '', ins, flags=re.MULTILINE)
    return ins
def get_title():
    """Rebuild the global ``dates`` list from the global ``titles`` list.

    For each title, the text between ``年`` and ``月`` is taken as the month
    and the text between ``月`` and ``日`` as the day; both are converted to
    half-width and stored as ``"<month>/<day>"``.  ``dates`` is cleared
    first.  Returns nothing.
    """
    global titles, dates
    dates.clear()
    for entry in titles:
        after_year = entry.split('年')[1]
        pieces = after_year.split('月')
        raw_month = pieces[0]
        raw_day = pieces[1].split('日')[0]
        half_month = jaconv.z2h(raw_month, digit=True, ascii=True)
        half_day = jaconv.z2h(raw_day, digit=True, ascii=True)
        dates.append('/'.join((half_month, half_day)))
def kanja_scraping():
    """Scrape patient records from the Yamanashi prefecture COVID-19 page.

    The siblings following the heading located by ``soup.find(get_title)``
    are grouped into text sections delimited by ``h4`` tags (stopping at the
    next ``h2``).  Each section after the first that starts with
    ``県内N例目`` yields one record per ``居住地`` field found.

    Returns:
        A list of record dicts, in reverse page order.
    """
    response = requests.get(
        "https://www.pref.yamanashi.jp/koucho/coronavirus/info_coronavirus_prevention.html"
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    heading = soup.find(get_title)

    def to_halfwidth(chunk):
        # Trailing whitespace is dropped before the width conversion.
        return jaconv.z2h(chunk.rstrip(), kana=False, digit=True, ascii=True)

    # Walk the following sibling tags, cutting a section at every h4.
    sections = []
    buffer = ""
    for sibling in heading.find_next_siblings():
        if sibling.name == "h4":
            sections.append(to_halfwidth(buffer))
            buffer = ""
        elif sibling.name == "h2":
            sections.append(to_halfwidth(buffer))
            break
        buffer += sibling.get_text(strip=True) + "\n"

    patients = []
    for section in sections[1:]:
        case_no = re.match(r"県内\d{1,3}例目", section)
        if not case_no:
            continue
        record = {"No": case_no.group(0)}
        fields = re.finditer(
            r"(発生判明日|年代|性別|居住地):(.+)$", section, re.MULTILINE)
        for field in fields:
            record[field.group(1)] = field.group(2)
            if field.group(1) != "居住地":
                continue
            # A residence line completes one patient; snapshot the fields
            # gathered so far.
            snapshot = copy.deepcopy(record)
            snapshot["リリース日"] = wareki2date(
                snapshot["発生判明日"]).isoformat()
            del snapshot["発生判明日"]
            snapshot["退院"] = None
            patients.append(snapshot)
    return patients[::-1]
def handleM4A(path):
    """Derive M4A tags from the file's location and set them on its tags.

    The path is read from the right as
    ``.../CATEGORY/GENRE/ARTIST/ALBUM/SONG``.  Song, album, artist and genre
    are width-normalized (ASCII and digits half-width, katakana full-width).
    When the category directory is ``__02_Compilations__`` the artist is
    replaced by ``__Compilations__``.

    Args:
        path: Path to the ``.m4a`` file; backslashes are accepted.
    """
    def split_tail(text):
        # Same slicing as repeated rfind("/") calls: (head, last component).
        cut = text.rfind("/")
        return text[:cut], text[cut + 1:]

    def normalize(name):
        narrowed = jaconv.z2h(name, kana=False, digit=True, ascii=True)
        return jaconv.h2z(narrowed, kana=True, digit=False, ascii=False)

    remaining = path.replace("\\", "/")
    remaining, song = split_tail(remaining)
    song = normalize(song)
    remaining, album = split_tail(remaining)
    album = normalize(album)
    remaining, artist = split_tail(remaining)
    artist = normalize(artist)
    remaining, genre = split_tail(remaining)
    genre = normalize(genre)

    # The category directory is compared as-is, without normalization.
    # "__01_Favorites__" needs no special handling.
    remaining, category = split_tail(remaining)
    if category == "__02_Compilations__":
        artist = "__Compilations__"

    mp4 = MP4(path)
    MyLogger.info(path)

    mp4.tags[TRACK_TITLE] = song
    mp4.tags[ALBUM] = album
    mp4.tags[ALBUM_ARTIST] = artist
    mp4.tags[ALBUM_SORT_ORDER] = conv.do(album)
    mp4.tags[ARTIST] = artist
    mp4.tags[ARTIST_SORT_ORDER] = conv.do(artist)
    mp4.tags[GENRE] = genre

    logged = (
        ("mp4.tags[TRACK_TITLE]", TRACK_TITLE),
        ("mp4.tags[ALBUM]", ALBUM),
        ("mp4.tags[ALBUM_ARTIST]", ALBUM_ARTIST),
        ("mp4.tags[ALBUM_SORT_ORDER]", ALBUM_SORT_ORDER),
        ("mp4.tags[ARTIST]", ARTIST),
        ("mp4.tags[ARTIST_SORT_ORDER]", ARTIST_SORT_ORDER),
        ("mp4.tags[GENRE]", GENRE),
    )
    for label, key in logged:
        MyLogger.info(label, str(mp4.tags[key]))
def _clean_csv_cell(cell):
    """Normalize widths and punctuation of one cell string (see csv_reg)."""
    # Full-width characters are written as escapes so that width-folding
    # tools cannot silently turn them into their ASCII counterparts.
    fw_comma, fw_period, fw_colon, fw_space = (
        '\uff0c', '\uff0e', '\uff1a', '\u3000')
    fw_lparen, fw_rparen = '\uff08', '\uff09'
    fw_lbracket, fw_rbracket = '\uff3b', '\uff3d'

    # Full-width digits -> half-width.
    cell = jaconv.z2h(cell, kana=False, ascii=False, digit=True)
    # A full-width comma between digits is a thousands separator -> ASCII.
    # The replacement used to be the non-raw string '\1,\2', i.e. the
    # control characters \x01 and \x02 (the pattern has no capture groups).
    cell = re.sub(r'(?<=\d)' + fw_comma + r'(?=\d+)', ',', cell)
    # Unify punctuation (ordinary prose style).
    cell = cell.replace(fw_comma, '、')
    cell = cell.replace(fw_period, '。')
    # Full-width ASCII -> half-width.  Full-width spaces and tildes are
    # swapped for placeholders first so they survive the conversion; every
    # tilde variant ends up as a wave dash.
    cell = cell.replace(fw_space, '■')
    cell = re.sub('[〜~\uff5e]', '〓から〓', cell)
    cell = jaconv.z2h(cell, kana=False, ascii=True, digit=False)
    cell = cell.replace('■', fw_space)
    cell = cell.replace('〓から〓', '〜')
    # Half-width katakana -> full-width.
    cell = jaconv.h2z(cell)
    # ASCII () and [] -> full-width.
    cell = re.sub(r'\((.+?)\)', fw_lparen + r'\1' + fw_rparen, cell)
    cell = re.sub(r'\[(.+?)\]', fw_lbracket + r'\1' + fw_rbracket, cell)
    # Colon in clock times -> full-width.
    cell = re.sub(r'(\d{1,2}):(\d{2})', r'\1' + fw_colon + r'\2', cell)
    # Remove one leading ASCII space.
    cell = re.sub(r'^ ', '', cell)
    # List numbering at the start of a line keeps a full-width period.
    cell = re.sub(r'^(\d{1,3})。', r'\1' + fw_period, cell,
                  flags=re.MULTILINE)
    cell = re.sub(r'^(\d{1,3})\.\s', r'\1' + fw_period, cell,
                  flags=re.MULTILINE)
    # In-cell line breaks become a marker; surrounding whitespace is dropped.
    return cell.replace('\n', '▽').strip()


def csv_reg(df_in):
    """Return a DataFrame whose cells are width/punctuation-normalized.

    Missing cells (NaN / None) are left untouched; every other cell is
    converted to ``str`` and cleaned.

    Args:
        df_in: Source DataFrame.

    Returns:
        A new DataFrame with the same columns as *df_in*.
    """
    rows = []
    # NOTE(review): rows come from ``df_in.values`` and are updated in
    # place, so *df_in* itself may be modified when ``values`` is a view.
    for row in df_in.values:
        for i, cell in enumerate(row):
            if not isinstance(cell, str):
                # pd.isna also accepts None and other non-numeric scalars,
                # for which np.isnan raised TypeError.
                if pd.isna(cell):
                    continue
                cell = str(cell)
            row[i] = _clean_csv_cell(cell)
        rows.append(row)
    # Re-attach the original header.
    return pd.DataFrame(rows, columns=df_in.columns)
def get_patients_last_update(self) -> str:
    """Return the sheet's last-update time as an ISO 8601 string.

    Row 3 of ``self.patients_sheet`` is scanned from column 16 rightwards
    for the first non-empty cell, which holds text of the form
    ``M/D H時現在``.  The column is searched rather than fixed because the
    cell moves as columns are added.  The year is hard-coded to 2020
    (TODO: handle the change of year).
    """
    column = 16
    while True:
        raw_value = self.patients_sheet.cell(row=3, column=column).value
        if raw_value:
            break
        column += 1
    # Digits may be a mix of full- and half-width; unify to half-width.
    stamp = jaconv.z2h(str(raw_value), digit=True, ascii=True)

    extra_days = 0
    # strptime cannot parse hour 24, so treat it as 0:00 of the next day.
    if stamp[-5:] == "24時現在":
        # Shrink the tail window until it holds exactly "<day> 24時現在";
        # the day may be one or two digits ("12/31" vs "1/1").
        window = 8
        while True:
            parts = stamp[-window:].split()
            if len(parts) == 2 and not parts[0].startswith("/"):
                day_text = parts[0]
                break
            window -= 1
        stamp = stamp[:-window] + day_text + " 0時現在"
        extra_days = 1

    parsed = datetime.strptime("2020/" + stamp, "%Y/%m/%d %H時現在")
    last_update = parsed + timedelta(days=extra_days)
    return last_update.replace(tzinfo=jst).isoformat()
def zen2han(input):
    """Convert text to half-width form (kana, digits and ASCII).

    Kana that have no half-width counterpart are first replaced by
    control-character placeholders (U+0010 to U+0014).  Hiragana is then
    converted to katakana and everything is narrowed to half-width.  Finally
    the stand-alone full-width sound marks are replaced by their half-width
    forms.

    Args:
        input: Text to convert (the name shadows the builtin and is kept for
            caller compatibility).

    Returns:
        The converted string; the placeholders are left in place.
    """
    # Characters with no half-width katakana equivalent.
    placeholders = {
        'ゐ': '\u0010', 'ヰ': '\u0010',
        'ゑ': '\u0011', 'ヱ': '\u0011',
        'ヵ': '\u0012',
        'ヶ': '\u0013',
        'ゎ': '\u0014', 'ヮ': '\u0014',
    }
    output = "".join(placeholders.get(ch, ch) for ch in input)

    # Hiragana -> katakana, then full-width -> half-width.
    output = jaconv.z2h(jaconv.hira2kata(output),
                        kana=True, digit=True, ascii=True)

    # Sound marks, spelled as escapes so the half-width forms cannot be
    # confused with (or folded into) the combining marks U+3099 / U+309A.
    output = output.replace('\u309b', '\uff9e')  # dakuten -> half-width
    output = output.replace('\u309c', '\uff9f')  # handakuten -> half-width
    return output
def check_amount(recipesoup, key_number, check_ingredient):
    """Pair ingredient names with amounts and judge the target's amount.

    Amounts are read from the ``div.ingredient_quantity.amount`` tags of
    *recipesoup* and zipped with the module-level ``ingredient_name_list``.
    The amount of the ingredient whose name fully matches
    ``ingredient_dictlist[key_number - 1]["name"]`` is judged.

    Args:
        recipesoup: Parsed recipe page.
        key_number: 1-based index into ``ingredient_dictlist``.
        check_ingredient: When 0, the amount is treated as empty.

    Returns:
        ``(ingredient_dict, judge_amount)``.  ``judge_amount`` is 1 when the
        amount contains a digit and none of ``~``, ``(`` or ``)``; otherwise
        0.
    """
    time.sleep(1)
    amount_tags = recipesoup.find_all(
        'div', class_="ingredient_quantity amount")
    ingredient_amount_list = [tag.string for tag in amount_tags]

    # Build the name -> amount dict and pick out the target's amount.
    ingredient_dict = {}
    # Start empty: this local used to be unbound (UnboundLocalError) when no
    # ingredient name matched.  An empty amount is judged as 0 below.
    amount_text = ""
    for name, amount in zip(ingredient_name_list, ingredient_amount_list):
        ingredient_dict[name] = amount
        if re.fullmatch(ingredient_dictlist[key_number - 1]["name"], name):
            # ``.string`` is None for a tag with several children.
            amount_text = jaconv.z2h(amount or "", digit=True, ascii=True)

    if check_ingredient == 0:
        amount_text = ""

    # Reject amounts without a digit, or containing "~" or parentheses.
    has_digit = re.search(r"\d", amount_text) is not None
    has_forbidden = re.search(r"~|\(|\)", amount_text) is not None
    judge_amount = 1 if has_digit and not has_forbidden else 0

    return ingredient_dict, judge_amount
def getReceipt(sim_pred, resultMap, i):
    """Extract a receipt number from recognized text into *resultMap*.

    Text containing ``年``, text accepted by ``priceUtils.checkMnyStr``, and
    text without a hyphen (after normalization) are ignored.  Otherwise the
    digits of the last character before the first hyphen and of the first
    four characters after the last hyphen are joined with ``ー`` and stored
    under ``resultMap['6_receiptNO']``.

    Args:
        sim_pred: Recognized text.
        resultMap: Dict receiving the result.
        i: Unused.
    """
    def digits_of(fragment):
        # Apply the project's character fixes, then keep only the digits.
        fixed = numberUtils.numberReplacement(fragment)
        return ''.join(re.findall(r'\d+', fixed))

    print('input receipt {}'.format(sim_pred))
    if '年' in sim_pred:
        return
    if priceUtils.checkMnyStr(sim_pred):
        return

    sim_pred = sim_pred.replace('ー', '-').replace('。', '.')
    sim_pred = jaconv.z2h(sim_pred, digit=True, ascii=True)
    if '-' not in sim_pred:
        return
    if sim_pred.startswith('-'):
        # Nothing before the hyphen: assume a leading 1.
        sim_pred = str(1) + sim_pred

    segments = sim_pred.split('-')
    head = digits_of(segments[0][-1]) or '1'
    tail = digits_of(segments[-1][:4]) or '1234'

    resultMap['6_receiptNO'] = head + 'ー' + tail
    print('output receiptNO {}'.format(resultMap['6_receiptNO']))
def re_cellstr(str):
    """Clean one cell string for CSV output.

    Strips surrounding whitespace, marks in-cell line breaks with ``▽``,
    widens half-width katakana, narrows full-width ASCII while keeping wave
    dashes and full-width brackets, collapses whitespace runs to one space
    and replaces commas with ``/`` so that no extra column is created.

    Args:
        str: Cell text (the name shadows the builtin and is kept for caller
            compatibility).

    Returns:
        The cleaned text.
    """
    # Full-width brackets are written as escapes.  With ASCII characters in
    # their place the two protection patterns are broken: the first matches
    # every character and the second has a single group, so ``\2`` raises
    # re.error.
    fw_lparen, fw_rparen = '\uff08', '\uff09'
    fw_lbracket, fw_rbracket = '\uff3b', '\uff3d'

    # Remove surrounding whitespace.
    str = str.strip()
    # Mark in-cell line breaks.
    str = str.replace('\n', '▽')
    # Half-width katakana -> full-width.
    str = jaconv.h2z(str)

    # Protect characters that must survive the ASCII narrowing below
    # (spaces are narrowed too).
    # Tilde variants / wave dash:
    str = re.sub('[〜~\uff5e]', '〓から〓', str)
    # Full-width parentheses and square brackets:
    str = re.sub('(' + fw_lparen + ')(.+?)(' + fw_rparen + ')',
                 r'〓Rカッコ〓\2〓Rカッコ〓', str)
    str = re.sub('(' + fw_lbracket + ')(.+?)(' + fw_rbracket + ')',
                 r'〓Bカッコ〓\2〓Bカッコ〓', str)

    str = jaconv.z2h(str, kana=False, ascii=True, digit=False)

    # Restore the protected characters.
    str = str.replace('〓から〓', '〜')
    str = re.sub('(〓Rカッコ〓)(.+?)(〓Rカッコ〓)',
                 fw_lparen + r'\2' + fw_rparen, str)
    # NOTE(review): square brackets are restored as parentheses, as in the
    # original replacement template -- confirm this is intended.
    str = re.sub('(〓Bカッコ〓)(.+?)(〓Bカッコ〓)',
                 fw_lparen + r'\2' + fw_rparen, str)

    # Collapse runs of whitespace into a single space.
    str = re.sub(r"\s+", " ", str)
    # A comma would add a column to the right.
    str = re.sub(",", "/", str)
    return str
def eval_force_romaji_to_kana_v2(self, text, kana_ref, nbest=20):
    """Evaluate *text* after forcing romaji to katakana.

    The text is narrowed to half-width (digits and ASCII), NFKC-normalized
    and passed through ``jaconv.alphabet2kata``.

    Returns:
        ``12345`` when the romaji conversion changes nothing; otherwise the
        result of ``self.eval_normal`` on the converted text.
    """
    narrowed = jaconv.z2h(text, digit=True, ascii=True, kana=False)
    normalized = jaconv.normalize(narrowed, "NFKC")
    as_kana = jaconv.alphabet2kata(normalized)  # romaji -> kana where possible
    if as_kana != normalized:
        return self.eval_normal(as_kana, kana_ref, nbest)
    return 12345
def replaceName(string):
    """Normalize a name using the module-level replacement tables.

    Digits are narrowed to half-width and hiragana is converted to katakana.
    Then every ``(old, new)`` entry of ``trans_string_table`` is applied,
    followed by ``trans_table`` (via ``str.translate``) and finally removal
    of everything matching ``replace_string``.
    """
    name = z2h(string, digit=True, kana=False)
    name = hira2kata(name)
    for entry in trans_string_table:
        name = name.replace(entry[0], entry[1])
    translated = name.translate(trans_table)
    return re.sub(replace_string, "", translated)
def getCategoryAfter(tmpResult, resultMap, i):
    """Append a category price parsed from recognized text to *resultMap*.

    Text containing any of the marker strings below, or rejected by
    ``priceUtils.checkMnyStr``, is ignored.  Otherwise the amount extracted
    by ``numberUtils.getMny`` is reduced to its digits, negated when the
    text contains ``-``, and appended to ``resultMap['suffix_catPrice']``
    when it is non-zero.

    Args:
        tmpResult: Recognized text.
        resultMap: Dict holding the ``suffix_catPrice`` list.
        i: Index used only in the log output.
    """
    tmpResult = jaconv.z2h(tmpResult, digit=True, ascii=True)

    # Markers of lines that are not category prices.  The ASCII minus is
    # deliberately not in this list; 'ー' is the long-vowel mark.
    markers = ('責', 'No', '点', '×', 'ー', ':', 'NO')
    if any(marker in tmpResult for marker in markers):
        return
    if not priceUtils.checkMnyStr(tmpResult):
        return

    print('input_{} category {}'.format(i, tmpResult))
    tmpResult = numberUtils.getMny(tmpResult)
    if tmpResult == '':
        return
    tmpResult = numberUtils.numberReplacement(tmpResult)

    digits = ''.join(re.findall(r'\d+', tmpResult))
    price = int(digits) if digits != '' else 0
    if '-' in tmpResult:
        price = -price

    if price != 0:
        resultMap['suffix_catPrice'].append(price)
        print('output iCatPrice------------- {}'.format(price))
def test_jaconv():
    """Time each jaconv conversion on every test case and log its result.

    Test cases come from ``get_test_cases()`` (dicts with ``title`` and
    ``body``); timing is done by ``calc_time``.
    """
    logging.info("=========================================")
    logging.info("= jaconv =")
    logging.info("=========================================")

    # (log label, conversion).  The result logged for each case comes from
    # the same function that is timed; the first case used to log
    # hira2hkata while timing hira2kata.
    conversions = (
        ("ひらがな(全角) to カタカナ(全角) for %s", jaconv.hira2kata),
        ("カタカナ(全角) to ひらがな(全角) for %s", jaconv.kata2hira),
        ("ひらがな(全角) to カタカナ(半角) for %s", jaconv.hira2hkata),
        ("半角 to 全角 for %s", jaconv.h2z),
        ("全角 to 半角 for %s", jaconv.z2h),
    )

    for tc in get_test_cases():
        title = tc['title']
        body = tc['body']
        for label, func in conversions:
            logging.info(label, title)
            calc_time(func, body)
            logging.debug("result: %s", func(body))
def convert(self, sent):
    """Replace a trailing alphabetic word via ``self.med_dic`` and widen.

    The sentence is narrowed to half-width first.  A run of letters and
    whitespace anchored at the end of the text is looked up in
    ``self.med_dic`` (exact form first, then lower-case); an empty or
    missing entry leaves the word unchanged.  The result is converted to
    full-width (kana, ASCII and digits).
    """
    sent = jaconv.z2h(sent, kana=False, ascii=True, digit=True)
    pieces = []
    cursor = 0
    for found in re.finditer(r'([a-zA-Z][a-zA-Z\s]*)$', sent):
        start, end = found.span()
        word = found.group(1)
        word = re.sub(r'^\s', r'', word)
        word = re.sub(r'\s$', r'', word)

        # Copy the untouched text that precedes the match.
        pieces.append(sent[cursor:start])

        if word in self.med_dic:
            replacement = self.med_dic[word]
        elif word.lower() in self.med_dic:
            replacement = self.med_dic[word.lower()]
        else:
            replacement = word
        if replacement == '':
            replacement = word

        pieces.append(replacement)
        cursor = end

    # Copy whatever follows the last match.
    pieces.append(sent[cursor:])
    return jaconv.h2z("".join(pieces), kana=True, ascii=True, digit=True)
def test_z2h():
    """Check ``jaconv.z2h`` defaults, ``ignore`` and every flag combination."""
    # NOTE(review): this file defines test_z2h twice; the later definition
    # replaces the earlier one at import time.

    # z2h narrows katakana by default, so the expected values are half-width
    # katakana.  They are written as escapes so they cannot be folded back
    # to full-width by text normalization.
    assert_equal(jaconv.z2h('ティロフィナーレ'),
                 '\uff83\uff68\uff9b\uff8c\uff68\uff85\uff70\uff9a')
    # Characters listed in ``ignore`` stay full-width.
    assert_equal(jaconv.z2h('ティロフィナーレ', ignore='ィ'),
                 '\uff83ィ\uff9b\uff8cィ\uff85\uff70\uff9a')

    _compare(partial(jaconv.z2h, kana=True), FULL_KANA, HALF_KANA)
    _compare(partial(jaconv.z2h, ascii=True), FULL_ASCII, HALF_ASCII)
    _compare(partial(jaconv.z2h, digit=True), FULL_DIGIT, HALF_DIGIT)

    # For every flag combination, only the enabled classes start out
    # full-width, so the output must always be entirely half-width.
    expected = _concat(HALF_KANA, HALF_ASCII, HALF_DIGIT)
    for use_ascii in (True, False):
        for use_digit in (True, False):
            for use_kana in (True, False):
                source = _concat(
                    FULL_KANA if use_kana else HALF_KANA,
                    FULL_ASCII if use_ascii else HALF_ASCII,
                    FULL_DIGIT if use_digit else HALF_DIGIT)
                assert_equal(
                    jaconv.z2h(source, ascii=use_ascii, digit=use_digit,
                               kana=use_kana),
                    expected)
def test_z2h():
    """Check ``jaconv.z2h`` defaults, ``ignore`` and every flag combination."""
    # NOTE(review): this file defines test_z2h twice; the later definition
    # replaces the earlier one at import time.

    # z2h narrows katakana by default, so the expected values are half-width
    # katakana.  They are written as escapes so they cannot be folded back
    # to full-width by text normalization.
    assert_equal(jaconv.z2h('ティロフィナーレ'),
                 '\uff83\uff68\uff9b\uff8c\uff68\uff85\uff70\uff9a')
    # Characters listed in ``ignore`` stay full-width.
    assert_equal(jaconv.z2h('ティロフィナーレ', ignore='ィ'),
                 '\uff83ィ\uff9b\uff8cィ\uff85\uff70\uff9a')

    _compare(partial(jaconv.z2h, kana=True), FULL_KANA, HALF_KANA)
    _compare(partial(jaconv.z2h, ascii=True), FULL_ASCII, HALF_ASCII)
    _compare(partial(jaconv.z2h, digit=True), FULL_DIGIT, HALF_DIGIT)

    # For every flag combination, only the enabled classes start out
    # full-width, so the output must always be entirely half-width.
    expected = _concat(HALF_KANA, HALF_ASCII, HALF_DIGIT)
    for use_ascii in (True, False):
        for use_digit in (True, False):
            for use_kana in (True, False):
                source = _concat(
                    FULL_KANA if use_kana else HALF_KANA,
                    FULL_ASCII if use_ascii else HALF_ASCII,
                    FULL_DIGIT if use_digit else HALF_DIGIT)
                assert_equal(
                    jaconv.z2h(source, ascii=use_ascii, digit=use_digit,
                               kana=use_kana),
                    expected)
def _extract_lemmatized_word(
    word_info, use_jaconv=False, pos_list=None, exclude_numbers=False,
    exclude_symbols=False
):
    """Return the lemma of one tokenizer output line, or '' if filtered out.

    *word_info* has the form ``surface<TAB>f0,f1,...``.  Field 0 is compared
    with *pos_list* and ``記号``, field 1 with ``数`` and field 6 is used as
    the lemma.  The surface form is returned when there is no feature part
    or the lemma is missing or ``*``.

    Args:
        word_info: One line of tokenizer output.
        use_jaconv: Narrow digits/ASCII to half-width and run
            ``jaconv.normalize`` on the result.
        pos_list: Allowed values for field 0; None or ``[]`` disables the
            filter.
        exclude_numbers: Drop tokens whose field 1 is ``数``.
        exclude_symbols: Drop tokens whose field 0 is ``記号``.

    Returns:
        The lemma or surface form, or ``''`` when the token is filtered out.
    """
    # None replaces the former mutable default ``[]``; both mean "no filter".
    if pos_list is None:
        pos_list = []

    fields = word_info.split('\t')
    word = fields[0]
    if len(fields) > 1:
        details = fields[1].split(',')
        if pos_list != [] and details[0] not in pos_list:
            return ''
        # Guard the index: a feature string with a single field used to
        # raise IndexError here.
        if exclude_numbers and len(details) > 1 and details[1] == '数':
            return ''
        if exclude_symbols and details[0] == '記号':
            return ''
        if len(details) > 6 and details[6] != '*':
            word = details[6]

    if use_jaconv:
        word = jaconv.z2h(word, digit=True, ascii=True)
        word = jaconv.normalize(word)
    return word
def _split_geocode(geo):
    """Split ``"<digits><name>"`` into ``(code, name)``.

    Surrounding whitespace is ignored.  The leading digit run (possibly
    empty) is converted to half-width; the rest of the first line is the
    name.
    """
    text = geo.strip()
    found = re.match(r"(\d*)(.*)", text)
    if found is not None:
        digits, name = found.group(1), found.group(2)
        return jaconv.z2h(digits, digit=True), name
    # Code not found.
    return "", text
def normalize_line(line):
    """Normalize one line of text.

    Runs ``jaconv.normalize``, narrows digits and ASCII to half-width,
    strips the ends, replaces every ``dash_re`` match with ``-`` and every
    ``ws_re`` match with a single space.
    """
    text = jaconv.normalize(line)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    text = text.strip()
    text = dash_re.sub('-', text)
    text = ws_re.sub(' ', text)
    return text
def normalize_txt(text):
    """Return *text* stripped, width-normalized and lower-cased.

    Digits and ASCII become half-width; katakana becomes full-width.
    """
    trimmed = text.strip()
    narrowed = jaconv.z2h(trimmed, kana=False, digit=True, ascii=True)
    widened = jaconv.h2z(narrowed, kana=True, digit=False, ascii=False)
    return widened.lower()
def my_parser(s):
    """Parse a month/day string into a ``pd.Timestamp`` in ``dt_now``'s year.

    The text is narrowed to half-width and must contain exactly two runs of
    one or two digits (month, then day); otherwise unpacking raises
    ValueError.
    """
    narrowed = jaconv.z2h(s.strip(), kana=False, digit=True, ascii=True)
    year = dt_now.year
    numbers = [int(token) for token in re.findall(r"(\d{1,2})", narrowed)]
    month, day = numbers
    return pd.Timestamp(year=year, month=month, day=day)
def normalize_char_width(string: str) -> str:
    """Bring character widths in *string* to one standard.

    Katakana is made full-width; Latin letters and digits are made
    half-width.
    """
    widened_kana = jaconv.h2z(string, kana=True, ascii=False, digit=False)
    return jaconv.z2h(widened_kana, kana=False, ascii=True, digit=True)
def clean_text(self, content):
    """Narrow *content* to half-width and mask entities with placeholders.

    The instance's compiled patterns 1 to 6 are applied in order, replacing
    their matches with ``<URL>``, ``<EMAIL>``, ``<TEL>``, ``<DATE>``,
    ``<DATE>`` and ``<PRICE>`` respectively.
    """
    content = jaconv.z2h(content, kana=False, digit=True, ascii=True)
    substitutions = (
        (self.content_repatter1, "<URL>"),
        (self.content_repatter2, "<EMAIL>"),
        (self.content_repatter3, "<TEL>"),
        (self.content_repatter4, "<DATE>"),
        (self.content_repatter5, "<DATE>"),
        (self.content_repatter6, "<PRICE>"),
    )
    for pattern, placeholder in substitutions:
        content = pattern.sub(placeholder, content)
    return content
def _normalize_age(a):
    """Normalize an age label.

    Whitespace and ``歳`` are removed, ``以上`` becomes ``+`` and digits are
    narrowed to half-width.  A ``<low>-<high>`` range is returned with both
    numbers zero-padded to two digits; anything else is returned as
    cleaned.
    """
    label = re.sub(r"\s+", "", a)
    label = label.replace("歳", "").replace("以上", "+")
    label = jaconv.z2h(label, digit=True)
    age_range = re.match(r"^(\d+)\-(\d+)$", label)
    if age_range:
        low = int(age_range.group(1))
        high = int(age_range.group(2))
        return "%02d-%02d" % (low, high)
    return label
def _normalize_ja_text(text: str) -> str:
    """Narrow full-width letters and digits in *text* to half-width.

    Kana is left unchanged.

    Args:
        text: Text to be transformed.

    Returns:
        The transformed text.
    """
    narrowed = jaconv.z2h(text, digit=True, ascii=True, kana=False)
    return narrowed
def basic_preprocess(text):
    """Apply the basic text normalization pipeline.

    In order: digits and Latin characters to half-width, kana to
    full-width, katakana to hiragana, then lower-casing.
    """
    narrowed = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    widened = jaconv.h2z(narrowed, kana=True, digit=False, ascii=False)
    hiragana = jaconv.kata2hira(widened)
    return hiragana.lower()
def check_persons(recipesoup):
    """Return the number of servings stated on a recipe page.

    The servings tag is the ``span.servings_for.yield`` inside
    ``div.content``; it is only considered when it equals one of the
    container's direct children.

    Args:
        recipesoup: Parsed recipe page.

    Returns:
        The servings number as a digit string, or the int ``0`` when the
        container, the tag or a number is missing (0 means "reject").
    """
    container = recipesoup.find('div', class_="content")
    persons = 0
    if container is None:
        # No content block at all: nothing to read.
        return persons

    servings_tag = container.find('span', class_="servings_for yield")
    for child in container:
        if child.string is None:
            # A tag with several children has no single string; passing
            # None to z2h would raise.
            continue
        text = jaconv.z2h(child.string, digit=True, ascii=True)
        # Take the whole number: ``\d`` alone reduced "10" to "1".
        number = re.search(r"\d+", text)
        if child == servings_tag and number:
            persons = number.group()
    return persons
def change_str(input_string):
    """Width-normalize *input_string* and remove all spaces.

    Kana is widened to full-width; digits, symbols and letters are narrowed
    to half-width.  Surrounding whitespace (including line breaks) is
    stripped and spaces inside the text are removed.
    """
    widened = jaconv.h2z(input_string)
    narrowed = jaconv.z2h(widened, kana=False, digit=True, ascii=True)
    return narrowed.strip().replace(" ", "")
def encode_katakana(text):
    """Encode *text* as bytes using ``TXT_ENC_KATAKANA_MAP``.

    When jaconv is available, each character is first converted to
    half-width katakana.  Characters (or converted pieces) that are not in
    the map are dropped.

    Args:
        text: Text to encode.

    Returns:
        The concatenated encoded bytes.
    """
    encoded = []
    for char in text:
        if jaconv:
            # Try to convert Japanese text to half-width katakana.
            char = jaconv.z2h(jaconv.hira2kata(char))
        if char in TXT_ENC_KATAKANA_MAP:
            encoded.append(TXT_ENC_KATAKANA_MAP[char])
        else:
            # The conversion can expand one character into several (base
            # kana plus a sound mark).  Such a multi-character string is
            # never a key of the map, so encode its pieces one by one.
            # TODO: pieces that are not in the map are still discarded
            # silently; consider raising when encoding is impossible.
            encoded.extend(
                TXT_ENC_KATAKANA_MAP[piece]
                for piece in char
                if piece in TXT_ENC_KATAKANA_MAP
            )
    return b"".join(encoded)
def run(self, inp, outp):
    """Render a YAML profile file as a PDF of Markdown tables.

    Every top-level key except ``主要業績`` becomes a ``##`` section with a
    table of its sub-keys.  The ``ja`` entry under ``名前`` is also used as
    the document title.  The Markdown is converted to HTML with pandoc and
    to PDF with wkhtmltopdf; the intermediate ``mytable.md`` and
    ``mytable.html`` are written to the current directory and removed at
    the end.

    Args:
        inp: Path of the YAML input file.
        outp: Path of the PDF to create.
    """
    # Files are opened with ``with`` so the handles are closed
    # deterministically; they used to be left to the garbage collector.
    with open(inp) as stream:
        obj = ruamel.yaml.load(stream, ruamel.yaml.RoundTripLoader)

    output = ''
    for key in obj:
        if key == u'主要業績':
            continue
        output += u'##' + key + '\n\n'
        table = []
        for subkey in obj[key]:
            val = obj[key][subkey]
            if type(subkey) == type(u'unicode'):
                subkey = jaconv.z2h(subkey)
            if key == u'名前' and subkey == 'ja':
                # The Japanese name becomes the document title.
                output = '# %s\n\n--------------\n\n%s' % (val, output)
            if not val or (type(val) == type('string') and not val.strip()):
                # Empty value: placeholder, blanked out after the HTML step.
                table.append(['**%s**' % subkey, 'XXXXX'])
            elif type(val) == type(u'unicode') or type(val) == type('string'):
                # Plain string value.
                table.append(['**%s**' % subkey, val])
            else:
                # List value: first item next to the key, the rest below it.
                table.append(['**%s**' % subkey, val[0]])
                for i in range(1, len(val), 1):
                    table.append(['', val[i]])
        output += tabulate(table) + '\n\n'

    with codecs.open('mytable.md', 'w+', 'utf-8') as md_file:
        md_file.write(output)
    call(["pandoc", "-f", "markdown", "-t", "html5", "-s", "--css",
          "page.css", "-o", "mytable.html", "mytable.md"])

    with open('mytable.html') as html_file:
        txt = html_file.read()
    # Drop inline styles so that only the stylesheet applies.
    txt = re.sub('style="[^"]+"', '', txt)
    txt = txt.replace('XXXXX', ' ')
    with open('mytable.html', 'w+') as html_file:
        html_file.write(txt)

    call(["wkhtmltopdf", "mytable.html", outp])
    os.remove('mytable.html')
    os.remove('mytable.md')
def normalize(t):
    """Return *t* NFKC-normalized, with digits and ASCII in half-width."""
    composed = unicodedata.normalize('NFKC', t)
    return jaconv.z2h(composed, kana=False, digit=True, ascii=True)