Python z2h 예제들, jaconv.z2h Python 예제들

예제 #1

0

파일 보기

    def convert_JapaneseYear_to_CommonEra(self, warekiYear):
        """Convert a Japanese-era (wareki) date string to a ``YYYYMMDD`` string.

        The text must contain ``<era><year>年<month>月<day>日`` where the era
        is one of 明治/大正/昭和/平成/令和, the year is ``元`` (first year) or
        half-/full-width digits, and month and day are half-/full-width
        digits. Anything before or after the date is ignored.

        Raises AttributeError when no date is found, because the failed
        search returns ``None``.
        """
        # Value added to the era year to obtain the Gregorian year.
        era_offsets = {
            '明治': 1867,
            '大正': 1911,
            '昭和': 1925,
            '平成': 1988,
            '令和': 2018,
        }

        wareki_re = re.compile(
            '^(|.+)(明治|大正|昭和|平成|令和)(|\u3000| )([元0-9０-９]+)年(|\u3000| )([0-9０-９]+)月(|\u3000| )([0-9０-９]+)日(|.+)$',
            re.MULTILINE)
        found = wareki_re.search(warekiYear)

        era_name, raw_year, raw_month, raw_day = found.group(2, 4, 6, 8)

        # Month and day become two-digit, half-width strings.
        month = jaconv.z2h(raw_month, digit=True).zfill(2)
        day = jaconv.z2h(raw_day, digit=True).zfill(2)

        if raw_year == '元':
            common_year = 1
        else:
            # Python 2 byte strings must be decoded before width conversion.
            if sys.version_info < (3, 0):
                raw_year = raw_year.decode('utf-8')
            common_year = int(jaconv.z2h(raw_year, digit=True))

        common_year += era_offsets.get(era_name, 0)

        return str(common_year) + month + day

예제 #2

0

파일 보기

def class_info(tr):
    """Build a dict describing one course from a table-row element.

    The row's ``td`` cells are read by position:

    * 0 -- year, stored as ``int`` under ``'year'``
    * 2 -- course name (``'name'``) and the fourth quote-delimited field of
      the link's ``onclick`` attribute (``'key'``)
    * 5 -- term (``'term'``): ``'0123'`` for 通年, otherwise the index of the
      first character in 春夏秋冬; when the rest of the text is 学期 the next
      index is appended (春学期 -> ``'01'``)
    * 6 -- weekday index for 月..日 (``'weekday'``) and the period number
      taken from the second character (``'period'``)
    * 7 -- ``<building>-<room>`` split into ``'building'`` (int) and
      ``'room'`` (first three characters, kept as a string)

    Returns the dict, or ``None`` as soon as any cell fails to parse.
    """
    info = {}
    weekdays = ['月', '火', '水', '木', '金', '土', '日']
    seasons = ['春', '夏', '秋', '冬']
    for i, td in enumerate(tr.find_elements_by_tag_name('td')):
        try:
            if i == 0:
                info['year'] = int(td.text)
            elif i == 2:
                info['name'] = td.text
                link = td.find_element_by_tag_name('a')
                info['key'] = link.get_attribute('onclick').split("'")[3]
            elif i == 5:
                if td.text == '通年':
                    info['term'] = '0123'
                else:
                    info['term'] = str(seasons.index(td.text[0]))
                if td.text[1:] == '学期':
                    info['term'] += f'{int(info["term"]) + 1}'
            elif i == 6:
                info['weekday'] = weekdays.index(td.text[0])
                info['period'] = int(
                    jaconv.z2h(td.text[1], digit=True, ascii=True))
            elif i == 7:
                parts = td.text.split('-')
                if not (parts[0].isdecimal() and (parts[1][:3].isdecimal()
                                                  or parts[1][0] == 'B')):
                    # Not a "<building>-<room>" cell: reject the whole row.
                    return None
                info['building'] = int(
                    jaconv.z2h(parts[0], digit=True, ascii=True))
                info['room'] = jaconv.z2h(parts[1][:3], digit=True, ascii=True)
        except Exception:
            # Any parse failure rejects the row. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt and SystemExit
            # propagate.
            return None
    return info

예제 #3

0

파일 보기

def text_ins_reg(ins):
    """Normalize character widths and punctuation in a Japanese text string.

    Steps, in order: full-width digits to half-width; spaces at the very
    start/end removed; a full-width comma between digits becomes ``,``;
    remaining ``，``/``．`` become ``、``/``。``; full-width ASCII to
    half-width (full-width spaces are preserved); half-width katakana to
    full-width; ASCII ``()``/``[]`` pairs and the colon in ``H:MM`` become
    full-width; a leading list number followed by ``。`` or ``. `` gets a
    full-width period; leading whitespace of every line is removed.

    Returns the normalized string.
    """
    # Full-width digits -> half-width.
    ins = jaconv.z2h(ins, kana=False, ascii=False, digit=True)
    # Remove spaces at the very start and end of the text. (A character
    # class "[ +]" would also strip a literal "+", so " +" is used.)
    ins = re.sub(r'^ +| +$', '', ins)
    # A full-width comma used as a digit separator -> ASCII comma. The
    # pattern has no capture groups, so the replacement is a plain ",";
    # a non-raw "\1,\2" would insert control characters U+0001/U+0002.
    ins = re.sub(r'(?<=\d)，(?=\d)', ',', ins)
    # Unify commas and periods (ordinary-prose style).
    ins = ins.replace('，', '、')  # scientific style would be '、' -> '，'
    ins = ins.replace('．', '。')  # scientific style would be '。' -> '．'
    # Full-width ASCII -> half-width. Full-width spaces are swapped for a
    # placeholder first and restored afterwards so they survive.
    ins = ins.replace('　', '〓')
    ins = jaconv.z2h(ins, kana=False, ascii=True, digit=False)
    ins = ins.replace('〓', '　')
    # Half-width katakana -> full-width.
    ins = jaconv.h2z(ins)
    # ASCII parentheses and square brackets -> full-width.
    ins = re.sub(r'\((.+?)\)', r'（\1）', ins)
    ins = re.sub(r'\[(.+?)\]', r'［\1］', ins)
    # The colon in time expressions -> full-width.
    ins = re.sub(r'(\d{1,2}):(\d{2})', r'\1：\2', ins)
    # Keep a full-width period after a leading list number.
    # ... when it had become an ideographic full stop:
    ins = re.sub(r'^(\d{1,3})。', r'\1．', ins, flags=re.MULTILINE)
    # ... when it is an ASCII period followed by whitespace:
    ins = re.sub(r'^(\d{1,3})\.\s', r'\1．', ins, flags=re.MULTILINE)
    # Strip leading whitespace from every line.
    ins = re.sub(r'^\s+', r'', ins, flags=re.MULTILINE)
    return ins

예제 #4

0

파일 보기

파일: views.py 프로젝트: satogen/corona_info_app

def get_title():
    """Rebuild the global ``dates`` list from the global ``titles`` list.

    For each title, the text after the first 年 is split into the part
    before 月 (month) and the part between 月 and 日 (day). Both are
    converted to half-width and stored as ``"<month>/<day>"``. ``dates`` is
    cleared first and filled in place.
    """
    global titles
    global dates
    dates.clear()
    for heading in titles:
        after_year = heading.split('年')[1]
        month_split = after_year.split('月')
        raw_month = month_split[0]
        raw_day = month_split[1].split('日')[0]
        month = jaconv.z2h(raw_month, digit=True, ascii=True)
        day = jaconv.z2h(raw_day, digit=True, ascii=True)
        dates.append(month + '/' + day)

예제 #5

0

파일 보기

def kanja_scraping():
    """Scrape per-case records from the Yamanashi prefecture coronavirus page.

    The page is fetched and the element matched by ``soup.find(get_title)``
    is located (``get_title`` is defined elsewhere in the project). Its
    following siblings are walked: text is accumulated into one chunk per
    ``h4`` section, and the walk stops at the next ``h2``. The first chunk
    is discarded. Every later chunk that starts with ``県内<N>例目`` yields
    one record.

    Returns:
        A list of dicts in reverse page order. Each record has ``"No"``,
        whichever of 年代/性別/居住地 appeared up to the 居住地 line,
        ``"リリース日"`` (ISO date from ``wareki2date``) and ``"退院"``
        (always ``None``).

    Raises:
        requests.HTTPError: on a non-success HTTP status.
        KeyError: if 居住地 appears before any 発生判明日 line in a chunk.
    """
    def _halfwidth(chunk):
        # Trim trailing whitespace, then narrow full-width digits and ASCII.
        return jaconv.z2h(chunk.rstrip(), kana=False, digit=True, ascii=True)

    # Without a timeout, requests waits indefinitely on a stalled server.
    r = requests.get(
        "https://www.pref.yamanashi.jp/koucho/coronavirus/info_coronavirus_prevention.html",
        timeout=30,
    )
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    h2 = soup.find(get_title)

    data = []
    s = ""

    # Walk the following same-level tags.
    for tag in h2.find_next_siblings():
        if tag.name == "h4":
            # A new section starts: flush the text gathered so far. The h4's
            # own text is added below and becomes the new chunk's first line.
            data.append(_halfwidth(s))
            s = ""
        elif tag.name == "h2":
            # The next top-level heading ends the list: flush and stop.
            data.append(_halfwidth(s))
            break

        s += tag.get_text(strip=True) + "\n"

    result = []

    for d in data[1:]:
        m = re.match(r"県内\d{1,3}例目", d)
        if not m:
            continue

        temp = {"No": m.group(0)}

        for i in re.finditer(r"(発生判明日|年代|性別|居住地):(.+)$", d, re.MULTILINE):
            temp[i.group(1)] = i.group(2)

            # 居住地 closes one record; snapshot what has been collected.
            if i.group(1) == "居住地":
                t = copy.deepcopy(temp)

                t["リリース日"] = wareki2date(t["発生判明日"]).isoformat()
                del t["発生判明日"]
                t["退院"] = None

                result.append(t)
    return result[::-1]

예제 #6

0

파일 보기

파일: MyMusicCenter.py 프로젝트: sakuradish/MyLibraries

def handleM4A(path):
    """Derive tag values from an .m4a file's path and set them on its MP4 tags.

    The original author's layout note reads
    ``./GENRE/Compilations/ARTIST/ALBUM/SONG.m4a``.
    NOTE(review): the code consumes path components from the right as song,
    album, artist, genre and then category, which does not match that note
    exactly -- confirm the real directory layout.

    Each of song/album/artist/genre is normalized to half-width digits and
    ASCII with full-width katakana. When the category component is
    ``__02_Compilations__`` the artist is replaced by ``__Compilations__``.

    NOTE(review): no save call is visible in this function, so the tags are
    changed on the ``MP4`` object only -- confirm whether that is intended.
    """
    def _pop_component(remaining):
        # Split off everything after the last "/" (same slicing as rfind).
        cut = remaining.rfind("/")
        return remaining[cut + 1:], remaining[:cut]

    def _fix_width(name):
        # Digits/ASCII to half-width, then katakana to full-width.
        narrowed = jaconv.z2h(name, kana=False, digit=True, ascii=True)
        return jaconv.h2z(narrowed, kana=True, digit=False, ascii=False)

    rest = path.replace("\\", "/")
    song, rest = _pop_component(rest)
    album, rest = _pop_component(rest)
    artist, rest = _pop_component(rest)
    genre, rest = _pop_component(rest)
    category, rest = _pop_component(rest)

    song = _fix_width(song)
    album = _fix_width(album)
    artist = _fix_width(artist)
    genre = _fix_width(genre)

    # Files under the compilations category share one artist name.
    if category == "__02_Compilations__":
        artist = "__Compilations__"

    mp4 = MP4(path)
    MyLogger.info(path)

    mp4.tags[TRACK_TITLE] = song
    mp4.tags[ALBUM] = album
    mp4.tags[ALBUM_ARTIST] = artist
    mp4.tags[ALBUM_SORT_ORDER] = conv.do(album)
    mp4.tags[ARTIST] = artist
    mp4.tags[ARTIST_SORT_ORDER] = conv.do(artist)
    mp4.tags[GENRE] = genre

    # Log every tag that was just assigned, in assignment order.
    logged_tags = (
        ("mp4.tags[TRACK_TITLE]", TRACK_TITLE),
        ("mp4.tags[ALBUM]", ALBUM),
        ("mp4.tags[ALBUM_ARTIST]", ALBUM_ARTIST),
        ("mp4.tags[ALBUM_SORT_ORDER]", ALBUM_SORT_ORDER),
        ("mp4.tags[ARTIST]", ARTIST),
        ("mp4.tags[ARTIST_SORT_ORDER]", ARTIST_SORT_ORDER),
        ("mp4.tags[GENRE]", GENRE),
    )
    for label, tag_key in logged_tags:
        MyLogger.info(label, str(mp4.tags[tag_key]))

예제 #7

0

파일 보기

def csv_reg(df_in):
    """Return a new DataFrame with every non-missing cell normalized as text.

    Missing cells (NaN/None) are left untouched. Every other cell is
    converted with ``str()`` and then: full-width digits become half-width;
    a full-width comma between digits becomes ``,``; remaining ``，``/``．``
    become ``、``/``。``; full-width ASCII becomes half-width (full-width
    spaces are preserved, both wave-dash forms become ``〜``); half-width
    katakana become full-width; ASCII ``()``/``[]`` pairs and the colon in
    ``H:MM`` become full-width; one leading ASCII space is dropped; a
    leading list number followed by ``。`` or ``. `` gets a full-width
    period; in-cell newlines become ``▽`` and surrounding whitespace is
    stripped.

    The input frame is not modified; the result keeps the column labels.
    """
    rows = []
    # Clean the values one row at a time.
    for row in df_in.values:
        # Work on a plain list: writing into the row array could alter
        # df_in and would coerce strings back to a numeric array's dtype.
        cells = list(row)
        for i, cell in enumerate(cells):
            # Skip empty cells. pd.isna also accepts None and other
            # non-numeric objects, on which np.isnan raises TypeError.
            if not isinstance(cell, str) and pd.isna(cell):
                continue
            cell = str(cell)
            # Full-width digits -> half-width.
            cell = jaconv.z2h(cell, kana=False, ascii=False, digit=True)
            # A full-width comma used as a digit separator -> ASCII comma.
            # The pattern has no capture groups; a non-raw "\1,\2" would
            # insert control characters U+0001/U+0002.
            cell = re.sub(r'(?<=\d)，(?=\d)', ',', cell)
            # Unify commas and periods (ordinary-prose style).
            cell = cell.replace('，', '、')  # scientific style: '、' -> '，'
            cell = cell.replace('．', '。')  # scientific style: '。' -> '．'
            # Full-width ASCII -> half-width. Full-width spaces and wave
            # dashes are swapped for placeholders first, then restored.
            cell = cell.replace('　', '■')
            cell = re.sub('[〜～]', '〓から〓', cell)
            cell = jaconv.z2h(cell, kana=False, ascii=True, digit=False)
            cell = cell.replace('■', '　')
            cell = cell.replace('〓から〓', '〜')
            # Half-width katakana -> full-width.
            cell = jaconv.h2z(cell)
            # ASCII parentheses and square brackets -> full-width.
            cell = re.sub(r'\((.+?)\)', r'（\1）', cell)
            cell = re.sub(r'\[(.+?)\]', r'［\1］', cell)
            # The colon in time expressions -> full-width.
            cell = re.sub(r'(\d{1,2}):(\d{2})', r'\1：\2', cell)
            # Drop one leading ASCII space.
            cell = re.sub(r'^ ', '', cell)
            # Keep a full-width period after a leading list number.
            cell = re.sub(r'^(\d{1,3})。', r'\1．', cell, flags=re.MULTILINE)
            cell = re.sub(r'^(\d{1,3})\.\s', r'\1．', cell, flags=re.MULTILINE)
            # Replace in-cell newlines and strip surrounding whitespace.
            cells[i] = cell.replace('\n', '▽').strip()
        rows.append(cells)
    # Re-attach the original header.
    return pd.DataFrame(rows, columns=df_in.columns)

예제 #8

0

파일 보기

파일: main.py 프로젝트: omuron/covid19-scraping

 def get_patients_last_update(self) -> str:
     """Return the patients sheet's last-update time as an ISO 8601 string.

     The value is read from row 3 of ``self.patients_sheet``, parsed as
     ``"M/D H時現在"`` with the year fixed to 2020, and given the ``jst``
     timezone defined elsewhere in the module.
     """
     # The last-update time is written in patients_sheet as "M/D H時現在".
     # Its column moves outward as clusters are added, so a fixed column
     # would miss it; scan to the right from column 16 instead.
     # NOTE(review): this loop appears not to terminate if no cell from
     # column 16 onward in row 3 holds a truthy value -- confirm.
     column_num = 16
     data_time_str = ""
     while not data_time_str:
         if not self.patients_sheet.cell(row=3, column=column_num).value:
             column_num += 1
             continue
         # Digits may be a mix of full- and half-width; unify to half-width.
         data_time_str = jaconv.z2h(str(self.patients_sheet.cell(row=3, column=column_num).value), digit=True, ascii=True)
     plus_day = 0
     # datetime.strptime cannot parse hour 24, so treat 24時 as 0時 of the next day.
     if data_time_str[-5:] == "24時現在":
         # The day part varies in length (12/31 vs 1/1), so shrink the tail
         # window until it splits into exactly a day and an hour, with a day
         # that does not begin with "/".
         count = 8
         while True:
             try:
                 day_str, hour_str = data_time_str[-count:].split()
                 if day_str.startswith("/"):
                     # A bare raise with no active exception raises
                     # RuntimeError; the handler below catches it and
                     # retries with a shorter window.
                     raise
                 break
             except Exception:
                 count -= 1
         data_time_str = data_time_str[:-count] + day_str + " 0時現在"
         plus_day = 1
     # Finally, prepend "2020/" so that datetime can parse the string.
     # What to do once the year becomes 2021 is undecided.
     # TODO: handle the year changing
     last_update = datetime.strptime("2020/" + data_time_str, "%Y/%m/%d %H時現在") + timedelta(days=plus_day)
     return last_update.replace(tzinfo=jst).isoformat()

예제 #9

0

파일 보기

파일: atok_romaji_table.py 프로젝트: entosen/atok-romaji-table-util

def zen2han(input):
    """Convert text to half-width katakana, digits and ASCII.

    Kana that have no half-width form (ゐ/ヰ, ゑ/ヱ, ヵ, ヶ, ゎ/ヮ) are
    first replaced by the placeholder characters U+0010..U+0014. These
    placeholders are left in the returned string. Hiragana are then turned
    into katakana, everything is narrowed to half-width, and stand-alone
    full-width voicing marks are replaced by their half-width forms.
    """
    # Placeholder for each kana lacking a half-width counterpart.
    placeholders = {
        'ゐ': '\u0010', 'ヰ': '\u0010',
        'ゑ': '\u0011', 'ヱ': '\u0011',
        'ヵ': '\u0012',
        'ヶ': '\u0013',
        'ゎ': '\u0014', 'ヮ': '\u0014',
    }
    prepared = "".join(placeholders.get(ch, ch) for ch in input)

    # Convert to half-width katakana.
    narrowed = jaconv.z2h(jaconv.hira2kata(prepared),
                          kana=True,
                          digit=True,
                          ascii=True)
    # Full-width voiced / semi-voiced marks -> half-width marks.
    return narrowed.replace('゛', 'ﾞ').replace('゜', 'ﾟ')

예제 #10

0

파일 보기

파일: cookpad_pick.py 프로젝트: cyakeeee/cookpad

def check_amount(recipesoup,key_number,check_ingredient):
    """Pair ingredient names with quantities and judge the target's quantity.

    Quantities are read from ``div.ingredient_quantity.amount`` elements of
    *recipesoup* and zipped with the module-level ``ingredient_name_list``.
    The target ingredient is the one whose name fully matches the pattern
    ``ingredient_dictlist[key_number - 1]["name"]``.

    Args:
        recipesoup: parsed recipe page exposing ``find_all``.
        key_number: 1-based index into ``ingredient_dictlist``.
        check_ingredient: when 0, the target quantity is treated as empty.

    Returns:
        ``(ingredient_dict, judge_amount)``. ``ingredient_dict`` maps
        ingredient name to quantity. ``judge_amount`` is 1 when the target's
        half-width quantity contains a digit and none of ``~``, ``(``,
        ``)``; otherwise 0, including when no ingredient matched.
    """
    # NOTE(review): presumably throttles requests to the site -- confirm.
    time.sleep(1)
    quantity_tags = recipesoup.find_all('div', class_="ingredient_quantity amount")
    ingredient_amount_list = [tag.string for tag in quantity_tags]

    # Build the name -> quantity dict and pick out the target's quantity.
    ingredient_dict = {}
    # Start empty so an unmatched target is judged NG instead of raising
    # UnboundLocalError.
    amount_text = ""
    for name, amount in zip(ingredient_name_list, ingredient_amount_list):
        ingredient_dict[name] = amount
        if re.fullmatch(ingredient_dictlist[key_number - 1]["name"], name):
            # Store the quantity in half-width; a missing string counts as
            # empty.
            amount_text = jaconv.z2h(amount or "", digit=True, ascii=True)
    if check_ingredient == 0:
        amount_text = ""

    # A quantity without digits, or containing "~", "(" or ")", is NG.
    has_digit = re.search(r"\d", amount_text) is not None
    has_forbidden = re.search(r"~|\(|\)", amount_text) is not None
    judge_amount = 1 if has_digit and not has_forbidden else 0

    # judge_amount == 1 means the recipe is accepted.
    return ingredient_dict, judge_amount

예제 #11

0

파일 보기

파일: staffNoUtils.py 프로젝트: jaassoon/chinese-ocr

def getReceipt(sim_pred, resultMap, i):
    """Extract a receipt number from recognized text into *resultMap*.

    Text containing 年, text accepted by ``priceUtils.checkMnyStr``, and
    text without a hyphen (after ``ｰ`` -> ``-`` and half-width conversion)
    is ignored. Otherwise the result is ``<head>ｰ<tail>``: the head comes
    from the digits in the last character before the first hyphen (default
    ``'1'``), and the tail from the digits in the first four characters
    after the last hyphen (default ``'1234'``). It is stored under
    ``resultMap['6_receiptNO']``. Always returns ``None``.
    """
    print('input receipt {}'.format(sim_pred))
    if '年' in sim_pred:
        return
    if priceUtils.checkMnyStr(sim_pred):
        return

    normalized = sim_pred.replace('ｰ', '-').replace('。', '.')
    normalized = jaconv.z2h(normalized, digit=True, ascii=True)
    if '-' not in normalized:
        return
    # A leading hyphen means the head digit is missing; assume 1.
    if normalized.startswith('-'):
        normalized = str(1) + normalized

    segments = normalized.split('-')

    head_source = numberUtils.numberReplacement(segments[0][-1])
    head = ''.join(re.findall(r'\d+', head_source))
    if head == '':
        head = '1'

    tail_source = numberUtils.numberReplacement(segments[-1][:4])
    tail = ''.join(re.findall(r'\d+', tail_source))
    if tail == '':
        tail = '1234'

    resultMap['6_receiptNO'] = head + 'ｰ' + tail
    print('output receiptNO {}'.format(resultMap['6_receiptNO']))

예제 #12

0

파일 보기

def re_cellstr(str):
    """Normalize one table-cell string for CSV output.

    Steps, in order: strip surrounding whitespace; replace newlines with
    ``▽``; half-width katakana to full-width; full-width ASCII to
    half-width, while wave dashes (unified to ``〜``) and paired full-width
    ``（）``/``［］`` are protected by placeholders and restored; collapse
    whitespace runs to one space; replace ``,`` with ``/``.

    The parameter keeps its original name ``str`` for caller compatibility.
    """
    text = str.strip()
    # Remove in-cell line breaks.
    text = text.replace('\n', '▽')
    # Half-width katakana -> full-width.
    text = jaconv.h2z(text)

    # Full-width ASCII -> half-width (spaces become ASCII too). Characters
    # that must stay full-width are swapped for placeholders first.
    text = re.sub('[〜～]', '〓から〓', text)
    text = re.sub('(（)(.+?)(）)', r'〓Rカッコ〓\2〓Rカッコ〓', text)
    text = re.sub('(［)(.+?)(］)', r'〓Bカッコ〓\2〓Bカッコ〓', text)
    text = jaconv.z2h(text, kana=False, ascii=True, digit=False)
    # Restore the wave dash.
    text = text.replace('〓から〓', '〜')
    # Restore each bracket kind as itself. Restoring square brackets as
    # round ones would silently change the text.
    text = re.sub('(〓Rカッコ〓)(.+?)(〓Rカッコ〓)', r'（\2）', text)
    text = re.sub('(〓Bカッコ〓)(.+?)(〓Bカッコ〓)', r'［\2］', text)

    # Collapse runs of whitespace into a single space.
    text = re.sub(r"\s+", " ", text)
    # Commas would add an extra column to the right.
    text = text.replace(",", "/")
    return text

예제 #13

0

파일 보기

 def eval_force_romaji_to_kana_v2(self, text, kana_ref, nbest=20):
     """Evaluate *text* after converting its Latin letters to katakana.

     The text is narrowed to half-width digits/ASCII and NFKC-normalized,
     then passed through ``jaconv.alphabet2kata``. If that conversion
     changes nothing, the constant 12345 is returned; otherwise the result
     of ``self.eval_normal`` on the converted text is returned.
     """
     halfwidth = jaconv.normalize(
         jaconv.z2h(text, digit=True, ascii=True, kana=False), "NFKC")
     as_kana = jaconv.alphabet2kata(halfwidth)  # romanize where possible
     if as_kana != halfwidth:
         return self.eval_normal(as_kana, kana_ref, nbest)
     return 12345

예제 #14

0

파일 보기

파일: ReplaceName.py 프로젝트: ss6987/yugioh_DB

def replaceName(string):
    """Normalize a name string for matching.

    Full-width digits become half-width and hiragana become katakana. Each
    ``(old, new)`` pair of the module-level ``trans_string_table`` is then
    applied in order, followed by the ``trans_table`` character
    translation. Finally everything matching ``replace_string`` is removed.
    """
    normalized = hira2kata(z2h(string, digit=True, kana=False))
    for pair in trans_string_table:
        old, new = pair[0], pair[1]
        normalized = normalized.replace(old, new)
    return re.sub(replace_string, "", normalized.translate(trans_table))

예제 #15

0

파일 보기

def getCategoryAfter(tmpResult,resultMap,i):
  """Parse a category price from recognized text and append it to *resultMap*.

  The text is narrowed to half-width digits/ASCII. It is skipped when it
  contains any reject marker, when ``priceUtils.checkMnyStr`` rejects it,
  or when ``numberUtils.getMny`` returns an empty string. Otherwise its
  digits are joined into an integer, negated if a ``-`` is present, and a
  non-zero value is appended to ``resultMap['suffix_catPrice']``.
  Always returns ``None``.
  """
  tmpResult = jaconv.z2h(tmpResult, digit=True, ascii=True)
  # Text containing any of these markers is skipped. 'ｰ' is the half-width
  # long-vowel mark, not the ASCII minus sign.
  rejectMarkers = ('責', 'No', '点', '×', 'ｰ', ':', 'NO')
  if any(marker in tmpResult for marker in rejectMarkers):
      return
  if not priceUtils.checkMnyStr(tmpResult):
      return
  print('input_{} category {}'.format(i,tmpResult))

  tmpResult = numberUtils.getMny(tmpResult)
  if tmpResult == '':
      return

  tmpResult = numberUtils.numberReplacement(tmpResult)
  digits = ''.join(re.findall(r'\d+', tmpResult))
  price = int(digits) if digits != '' else 0

  # A minus sign anywhere in the text makes the price negative.
  if '-' in tmpResult:
      price = -price
  if price == 0:
      return
  resultMap['suffix_catPrice'].append(price)
  print('output iCatPrice------------- {}'.format(price))

예제 #16

0

파일 보기

파일: tests.py 프로젝트: chase0213/python-jp-comparison

def test_jaconv():
    """Time each jaconv conversion on every test case and log its result.

    For each case from ``get_test_cases()`` and each conversion, the
    conversion is timed with ``calc_time`` and its output is logged at
    debug level.
    """
    logging.info("=========================================")
    logging.info("=               jaconv                  =")
    logging.info("=========================================")
    # Pairing each label with its function guarantees that the logged
    # result comes from the same conversion that was timed.
    conversions = (
        ("ひらがな（全角） to カタカナ（全角）", jaconv.hira2kata),
        ("カタカナ（全角） to ひらがな（全角）", jaconv.kata2hira),
        ("ひらがな（全角） to カタカナ（半角）", jaconv.hira2hkata),
        ("半角 to 全角", jaconv.h2z),
        ("全角 to 半角", jaconv.z2h),
    )
    test_cases = get_test_cases()
    for tc in test_cases:
        title = tc['title']
        body = tc['body']

        for label, func in conversions:
            logging.info("%s for %s", label, title)
            calc_time(func, body)
            logging.debug("result: %s", func(body))

예제 #17

0

파일 보기

    def convert(self, sent):
        """Replace a trailing Latin-letter phrase using ``self.med_dic``.

        The sentence is narrowed to half-width. Each run of letters and
        whitespace anchored at ``$`` is looked up in ``self.med_dic``, first
        as-is and then lower-cased. A missing or empty entry leaves the
        word unchanged. The rebuilt sentence is returned in full-width form.
        """
        sent = jaconv.z2h(sent, kana=False, ascii=True, digit=True)
        pieces = []
        cursor = 0
        for hit in re.finditer(r'([a-zA-Z][a-zA-Z\s]*)$', sent):
            start, end = hit.span()
            # Trim one whitespace character from each side of the phrase.
            word = re.sub(r'\s$', r'', re.sub(r'^\s', r'', hit.group(1)))

            # Copy the untouched text that precedes this match.
            pieces.append(sent[cursor:start])

            key = word if word in self.med_dic else word.lower()
            replacement = self.med_dic[key] if key in self.med_dic else word
            if replacement == '':
                replacement = word

            pieces.append(replacement)
            cursor = end

        # Copy whatever follows the last match.
        pieces.append(sent[cursor:])

        return jaconv.h2z(''.join(pieces), kana=True, ascii=True, digit=True)

예제 #18

0

파일 보기

파일: test_jaconv.py 프로젝트: ikegami-yukino/jaconv

def test_z2h():
    """Check ``jaconv.z2h`` defaults, ``ignore`` and every flag combination."""
    assert_equal(jaconv.z2h('ティロフィナーレ'), 'ﾃｨﾛﾌｨﾅｰﾚ')
    assert_equal(jaconv.z2h('ティロフィナーレ', ignore='ィ'), 'ﾃィﾛﾌィﾅｰﾚ')

    # Each flag on its own converts its character class.
    single_flag_cases = (
        ('kana', FULL_KANA, HALF_KANA),
        ('ascii', FULL_ASCII, HALF_ASCII),
        ('digit', FULL_DIGIT, HALF_DIGIT),
    )
    for flag, full_chars, half_chars in single_flag_cases:
        _compare(partial(jaconv.z2h, **{flag: True}), full_chars, half_chars)

    # For every combination, classes whose flag is on start full-width and
    # the others start half-width, so the output is always all half-width.
    switches = (True, False)
    combos = [(a, d, k) for a in switches for d in switches for k in switches]
    for use_ascii, use_digit, use_kana in combos:
        source = _concat(FULL_KANA if use_kana else HALF_KANA,
                         FULL_ASCII if use_ascii else HALF_ASCII,
                         FULL_DIGIT if use_digit else HALF_DIGIT)
        assert_equal(
            jaconv.z2h(source, ascii=use_ascii, digit=use_digit,
                       kana=use_kana),
            _concat(HALF_KANA, HALF_ASCII, HALF_DIGIT))

예제 #19

0

파일 보기

def test_z2h():
    """Check ``jaconv.z2h`` defaults, ``ignore`` and every flag combination."""
    assert_equal(jaconv.z2h('ティロフィナーレ'), 'ﾃｨﾛﾌｨﾅｰﾚ')
    assert_equal(jaconv.z2h('ティロフィナーレ', ignore='ィ'), 'ﾃィﾛﾌィﾅｰﾚ')

    # Each flag on its own converts its character class.
    single_flag_cases = (
        ('kana', FULL_KANA, HALF_KANA),
        ('ascii', FULL_ASCII, HALF_ASCII),
        ('digit', FULL_DIGIT, HALF_DIGIT),
    )
    for flag, full_chars, half_chars in single_flag_cases:
        _compare(partial(jaconv.z2h, **{flag: True}), full_chars, half_chars)

    # For every combination, classes whose flag is on start full-width and
    # the others start half-width, so the output is always all half-width.
    switches = (True, False)
    combos = [(a, d, k) for a in switches for d in switches for k in switches]
    for use_ascii, use_digit, use_kana in combos:
        source = _concat(FULL_KANA if use_kana else HALF_KANA,
                         FULL_ASCII if use_ascii else HALF_ASCII,
                         FULL_DIGIT if use_digit else HALF_DIGIT)
        assert_equal(
            jaconv.z2h(source, ascii=use_ascii, digit=use_digit,
                       kana=use_kana),
            _concat(HALF_KANA, HALF_ASCII, HALF_DIGIT))

예제 #20

0

파일 보기

파일: nlp_utils.py 프로젝트: chie8842/mldatautils

 def _extract_lemmatized_word(
     word_info,
     use_jaconv=False,
     pos_list=[],
     exclude_numbers=False,
     exclude_symbols=False
 ):
     word_info = word_info.split('\t')
     if len(word_info) > 1:
         word_details = word_info[1].split(',')
         if pos_list != [] and word_details[0] not in pos_list:
             return ''
         if exclude_numbers and word_details[1] == '数':
             return ''
         if exclude_symbols and word_details[0] == '記号':
             return ''
         if len(word_details) > 6 and word_details[6] != '*':
             word = word_details[6]
         else:
             word = word_info[0]
     else:
         word = word_info[0]
     if use_jaconv:
         word = jaconv.z2h(word, digit=True, ascii=True)
         word = jaconv.normalize(word)
     return word

예제 #21

0

파일 보기

 def _split_geocode(geo):
     """Split ``"<digits><name>"`` into ``(code, name)``.

     Surrounding whitespace is stripped first. The leading digit run, which
     may be empty, is returned in half-width form, followed by the rest of
     the first line.
     """
     text = geo.strip()
     found = re.match(r"(\d*)(.*)", text)
     if found is not None:
         digits, geoname = found.groups()
         return jaconv.z2h(digits, digit=True), geoname
     return "", text  # code not found

예제 #22

0

파일 보기

def normalize_line(line):
    """Normalize one line of text.

    The line is normalized with ``jaconv.normalize``, narrowed to
    half-width digits/ASCII (kana untouched) and stripped. Matches of the
    module-level ``dash_re`` become ``-`` and matches of ``ws_re`` become a
    single space.
    """
    text = jaconv.normalize(line)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True).strip()
    return ws_re.sub(' ', dash_re.sub('-', text))

예제 #23

0

파일 보기

def normalize_txt(text):
    """Strip *text*, unify character widths and lower-case it.

    Digits and ASCII become half-width and kana become full-width.
    """
    halfwidth = jaconv.z2h(text.strip(), kana=False, digit=True, ascii=True)
    kana_widened = jaconv.h2z(halfwidth, kana=True, digit=False, ascii=False)
    return kana_widened.lower()

예제 #24

0

파일 보기

def my_parser(s):
    """Parse a month/day string into a ``pd.Timestamp`` in the current year.

    The string is stripped and narrowed to half-width. It must contain
    exactly two groups of one or two digits, read as month then day. The
    year comes from the module-level ``dt_now``.
    """
    halfwidth = jaconv.z2h(s.strip(), kana=False, digit=True, ascii=True)

    year = dt_now.year
    month, day = (int(number) for number in re.findall(r"(\d{1,2})", halfwidth))

    return pd.Timestamp(year=year, month=month, day=day)

예제 #25

0

파일 보기

파일: __init__.py 프로젝트: FriedRice/Myaku

def normalize_char_width(string: str) -> str:
    """Return *string* with character widths brought to one standard.

    Katakana are made full-width first. ASCII characters and digits are
    then made half-width.
    """
    kana_widened = jaconv.h2z(string, kana=True, ascii=False, digit=False)
    return jaconv.z2h(kana_widened, kana=False, ascii=True, digit=True)

예제 #26

0

파일 보기

 def clean_text(self, content):
     """Narrow *content* to half-width and mask matched spans with tokens.

     The compiled patterns ``self.content_repatter1`` .. ``6`` are applied
     in order. Each match is replaced by the placeholder paired with its
     pattern below.
     """
     content = jaconv.z2h(content, kana=False, digit=True, ascii=True)
     masks = (
         (self.content_repatter1, "<URL>"),
         (self.content_repatter2, "<EMAIL>"),
         (self.content_repatter3, "<TEL>"),
         (self.content_repatter4, "<DATE>"),
         (self.content_repatter5, "<DATE>"),
         (self.content_repatter6, "<PRICE>"),
     )
     for pattern, placeholder in masks:
         content = pattern.sub(placeholder, content)
     return content

예제 #27

0

파일 보기

 def _normalize_age(a):
     """Normalize an age or age-range label.

     Whitespace and 歳 are removed, 以上 becomes ``+`` and digits become
     half-width. A ``<n>-<m>`` range is zero-padded to two digits on each
     side; any other label is returned as cleaned.
     """
     label = re.sub(r"\s+", "", a).replace("歳", "").replace("以上", "+")
     label = jaconv.z2h(label, digit=True)
     span = re.match(r"^(\d+)\-(\d+)$", label)
     if span:
         low, high = (int(bound) for bound in span.groups())
         return "%02d-%02d" % (low, high)
     return label

예제 #28

0

파일 보기

def _normalize_ja_text(text: str) -> str:
  """Narrows full-width ASCII and digit characters to half-width.

  Kana are left unchanged.

  Args:
    text: Text to be transformed.

  Returns:
    The transformed text.
  """
  halfwidth = jaconv.z2h(text, kana=False, ascii=True, digit=True)
  return halfwidth

예제 #29

0

파일 보기

def basic_preprocess(text):
    """Normalize Japanese text for downstream processing.

    Digits and Latin characters become half-width (hankaku), kana become
    full-width (zenkaku), katakana become hiragana, and the result is
    lower-cased.
    """
    halfwidth = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    kana_widened = jaconv.h2z(halfwidth, kana=True, digit=False, ascii=False)
    return jaconv.kata2hira(kana_widened).lower()

예제 #30

0

파일 보기

파일: cookpad_pick.py 프로젝트: cyakeeee/cookpad

def check_persons(recipesoup):
    """Return the number of servings stated on a recipe page.

    The servings text is taken from a direct child of ``div.content`` that
    equals the ``span.servings_for.yield`` element found inside it.

    Returns:
        The servings as a string of digits (e.g. ``'10'``), or the integer
        ``0`` when the page has no content block, no servings span, or no
        digits in the span. A non-zero result means the recipe is accepted.
    """
    # The block that states the number of servings.
    content = recipesoup.find('div', class_="content")
    persons = 0
    if content is None:
        return persons

    # Look the span up once instead of on every loop iteration.
    servings_tag = content.find('span', class_="servings_for yield")
    for child in content:
        # Only the servings span matters; nodes without a single string
        # (``.string`` is None) cannot be converted and are skipped.
        if not (child == servings_tag) or child.string is None:
            continue
        text = jaconv.z2h(child.string, digit=True, ascii=True)
        # Take the whole number so "10人分" gives "10" rather than "1".
        found = re.search(r"\d+", text)
        if found:
            persons = found.group()
    return persons

예제 #31

0

파일 보기

def change_str(input_string):
    """Unify character widths and remove ASCII spaces.

    Kana become full-width. Digits, symbols and letters become half-width.
    Surrounding whitespace, including newlines, is stripped and every
    remaining ASCII space is removed.
    """
    kana_widened = jaconv.h2z(input_string)
    narrowed = jaconv.z2h(kana_widened, kana=False, digit=True, ascii=True)
    return narrowed.strip().replace(" ", "")

예제 #32

0

파일 보기

파일: katakana.py 프로젝트: braveheuel/python-escpos

def encode_katakana(text):
    """Encode *text* to bytes via ``TXT_ENC_KATAKANA_MAP``.

    When ``jaconv`` is available, each character is first converted to
    half-width katakana. Characters with no entry in the map are silently
    dropped.

    Returns:
        The concatenated encoded bytes; ``b""`` for empty input.
    """
    encoded = []
    for char in text:
        if jaconv:
            # Try to convert Japanese text to half-width katakana.
            char = jaconv.z2h(jaconv.hira2kata(char))

        if char in TXT_ENC_KATAKANA_MAP:
            encoded.append(TXT_ENC_KATAKANA_MAP[char])
        elif len(char) > 1 and all(
                piece in TXT_ENC_KATAKANA_MAP for piece in char):
            # Half-width katakana write a voiced sound as a base character
            # plus a separate mark, so one input character can turn into
            # several. Encode each piece when all of them are mappable.
            encoded.extend(TXT_ENC_KATAKANA_MAP[piece] for piece in char)
        # Anything else is not encodable and is dropped.
        # TODO: consider raising when a character cannot be encoded.
    return b"".join(encoded)

예제 #33

0

파일 보기

파일: makePDF.py 프로젝트: gsl-nagoya-u/gsl-sphinx

 def run(self, inp, outp):
     """Render the YAML file *inp* as a PDF at *outp*.

     Each top-level key except 主要業績 becomes a ``##`` heading followed by
     a table of its sub-keys. String sub-keys are narrowed with
     ``jaconv.z2h``. Empty values are shown as a non-breaking space, strings
     as one row, and other values as one row per element. The ``ja`` entry
     under 名前 is also placed at the top as the document title.

     The Markdown is written to ``mytable.md``, converted to
     ``mytable.html`` with pandoc (inline ``style`` attributes stripped),
     and rendered with wkhtmltopdf. Both temporary files are created in the
     current directory and removed at the end.

     NOTE(review): ``ruamel.yaml.load(..., RoundTripLoader)`` is the legacy
     loading API -- confirm it exists in the installed ruamel.yaml version.
     """
     # Both text types, so the checks also hold on Python 2.
     unicode_type = type(u'')
     text_types = (unicode_type, type(''))

     with open(inp) as source:
         obj = ruamel.yaml.load(source, ruamel.yaml.RoundTripLoader)

     output = ''
     for key in obj:
         if key == u'主要業績':
             continue
         output += u'##' + key + '\n\n'
         table = []
         for subkey in obj[key]:
             val = obj[key][subkey]
             if isinstance(subkey, unicode_type):
                 subkey = jaconv.z2h(subkey)
             if key == u'名前' and subkey == 'ja':
                 # The Japanese name becomes the document title.
                 output = '# %s\n\n--------------\n\n%s' % (val, output)
             label = '**%s**' % subkey
             # isinstance also accepts the str subclasses a round-trip
             # loader can return; an exact type() match would send those
             # to the list branch below.
             if not val or (isinstance(val, text_types) and not val.strip()):
                 # Empty value: placeholder, replaced by &nbsp; later.
                 table.append([label, 'XXXXX'])
             elif isinstance(val, text_types):
                 table.append([label, val])
             else:
                 # Sequence: first element beside the label, the rest on
                 # their own rows.
                 table.append([label, val[0]])
                 for i in range(1, len(val)):
                     table.append(['', val[i]])
         output += tabulate(table) + '\n\n'

     # Close each file before the external tool that reads it is started.
     with codecs.open('mytable.md', 'w+', 'utf-8') as md_file:
         md_file.write(output)
     call(["pandoc", "-f", "markdown", "-t", "html5", "-s", "--css", "page.css", "-o", "mytable.html", "mytable.md"])
     with open('mytable.html') as html_file:
         txt = html_file.read()
     txt = re.sub('style="[^"]+"', '', txt)
     txt = txt.replace('XXXXX', '&nbsp;')
     with open('mytable.html', 'w+') as html_file:
         html_file.write(txt)
     call(["wkhtmltopdf", "mytable.html", outp])
     os.remove('mytable.html')
     os.remove('mytable.md')

예제 #34

0

파일 보기

파일: preprocess.py 프로젝트: krdlab/examples

def normalize(t):
    """Apply NFKC normalization, then narrow full-width digits and ASCII."""
    nfkc = unicodedata.normalize('NFKC', t)
    return jaconv.z2h(nfkc, kana=False, digit=True, ascii=True)