Пример #1
0
    def convert_JapaneseYear_to_CommonEra(self, warekiYear):
        """Convert a Japanese-era date string into a ``YYYYMMDD`` string.

        The first ``<era><year>年<month>月<day>日`` occurrence in
        ``warekiYear`` is used; text before and after it on the same line is
        ignored. Digits may be half-width or full-width, and ``元`` is
        accepted as year 1.

        Args:
            warekiYear: Text containing a date such as ``令和元年5月1日``.

        Returns:
            The Gregorian year followed by the two-digit month and day.
            The year is not zero-padded.

        Raises:
            ValueError: If no era date can be found in ``warekiYear``.
        """
        # Value added to the era year to obtain the Gregorian year.
        era_offsets = {
            '明治': 1867,
            '大正': 1911,
            '昭和': 1925,
            '平成': 1988,
            '令和': 2018,
        }

        # \uff10-\uff19 are the full-width digits; they are spelled as escapes
        # so the character class survives width-normalising tools.
        pattern = re.compile(
            '^(|.+)(明治|大正|昭和|平成|令和)(|\u3000| )'
            '([元0-9\uff10-\uff19]+)年(|\u3000| )'
            '([0-9\uff10-\uff19]+)月(|\u3000| )'
            '([0-9\uff10-\uff19]+)日(|.+)$',
            re.MULTILINE)
        matches = pattern.search(warekiYear)
        if matches is None:
            raise ValueError(
                'no Japanese-era date found in {!r}'.format(warekiYear))

        era_name = matches.group(2)
        year = matches.group(4)
        month = jaconv.z2h(matches.group(6), digit=True).zfill(2)
        day = jaconv.z2h(matches.group(8), digit=True).zfill(2)

        if year == '元':
            year = 1
        else:
            if sys.version_info < (3, 0):
                year = year.decode('utf-8')
            year = int(jaconv.z2h(year, digit=True))

        # The regex only admits the five eras listed in era_offsets.
        year += era_offsets[era_name]

        return str(year) + month + day
Пример #2
0
def class_info(tr):
    """Extract class information from one table row element.

    Cells are read by position: 0 is the year, 2 the name and link key,
    5 the term, 6 the weekday and period, 7 the building and room.

    Args:
        tr: A row element exposing ``find_elements_by_tag_name``.

    Returns:
        A dict with the keys that could be read (``year``, ``name``, ``key``,
        ``term``, ``weekday``, ``period``, ``building``, ``room``), or
        ``None`` as soon as any cell cannot be parsed.
    """
    d = dict()
    yobi = ['月', '火', '水', '木', '金', '土', '日']
    season = ['春', '夏', '秋', '冬']
    for i, td in enumerate(tr.find_elements_by_tag_name('td')):
        try:
            if i == 0:
                d['year'] = int(td.text)
            elif i == 2:
                d['name'] = td.text
                # The key is the fourth piece of the onclick attribute when
                # split on single quotes.
                d['key'] = td.find_element_by_tag_name('a').get_attribute(
                    'onclick').split("'")[3]
            elif i == 5:
                d['term'] = '0123' if td.text == '通年' else str(
                    season.index(td.text[0]))
                if td.text[1:] == '学期':
                    # A "<season>学期" cell also covers the following term.
                    d['term'] += f'{int(d["term"]) + 1}'
            elif i == 6:
                d['weekday'] = int(yobi.index(td.text[0]))
                d['period'] = int(
                    jaconv.z2h(td.text[1], digit=True, ascii=True))
            elif i == 7:
                info = td.text.split('-')
                if info[0].isdecimal() and (info[1][:3].isdecimal()
                                            or info[1][0] == 'B'):
                    d['building'] = int(
                        jaconv.z2h(info[0], digit=True, ascii=True))
                    d['room'] = jaconv.z2h(info[1][:3], digit=True, ascii=True)
                else:
                    raise ValueError(
                        'unrecognised building/room cell: %r' % td.text)
        except Exception:
            # Any unparsable cell makes the whole row unusable. Catching
            # Exception rather than everything lets KeyboardInterrupt and
            # SystemExit propagate.
            return None
    return d
Пример #3
0
def text_ins_reg(ins):
    """Normalise character widths and punctuation in a block of text.

    Args:
        ins: The text to normalise.

    Returns:
        The normalised text.
    """
    # Full-width digits -> half-width.
    ins = jaconv.z2h(ins, kana=False, ascii=False, digit=True)
    # Drop a single leading/trailing space or plus sign.
    ins = re.sub(r'^[ +]|[ +]$', '', ins)
    # Thousands separator between digits. The replacement used to be the
    # non-raw string '\1,\2', i.e. U+0001 "," U+0002, which injected control
    # characters around the comma; the pattern has no groups to refer to.
    # NOTE(review): with an ASCII comma in the pattern this substitution is
    # an identity; the original comment says the target is a full-width
    # separator - confirm the intended literal.
    ins = re.sub(r'(?<=\d),(?=\d+)', ',', ins)
    # Unify commas and periods to the Japanese forms.
    ins = ins.replace(',', '、')
    ins = ins.replace('.', '。')
    # Shield spaces behind a placeholder while full-width ASCII is converted
    # to half-width, then restore them.
    # NOTE(review): the original comment says the shielded character is the
    # full-width space - confirm the literals below.
    ins = ins.replace(' ', '〓')
    ins = jaconv.z2h(ins, kana=False, ascii=True, digit=False)
    ins = ins.replace('〓', ' ')
    # Half-width katakana -> full-width.
    ins = jaconv.h2z(ins)
    # Brackets and the time colon.
    # NOTE(review): the original comments say these should become full-width,
    # but the replacement literals here are ASCII - confirm.
    ins = re.sub(r'\((.+?)\)', r'(\1)', ins)
    ins = re.sub(r'\[(.+?)\]', r'[\1]', ins)
    ins = re.sub(r'(\d{1,2}):(\d{2})', r'\1:\2', ins)
    # Period after a 1-3 digit number at the start of a line (list markers),
    # whether it arrived as "。" or as "." followed by whitespace.
    ins = re.sub(r'^(\d{1,3})。', r'\1.', ins, flags=re.MULTILINE)
    ins = re.sub(r'^(\d{1,3})\.\s', r'\1.', ins, flags=re.MULTILINE)
    # Strip leading whitespace on every line.
    ins = re.sub(r'^\s+', r'', ins, flags=re.MULTILINE)
    return ins
Пример #4
0
def get_title():
    """Refill the global ``dates`` list with ``month/day`` strings.

    Each entry of the global ``titles`` is expected to look like
    ``...年<month>月<day>日...``; the month and day are converted to
    half-width before being joined with ``/``.
    """
    global titles
    global dates
    dates.clear()
    for raw_title in titles:
        after_year = raw_title.split('年')[1]
        around_month = after_year.split('月')
        month_text = around_month[0]
        day_text = around_month[1].split('日')[0]
        dates.append(
            jaconv.z2h(month_text, digit=True, ascii=True)
            + '/'
            + jaconv.z2h(day_text, digit=True, ascii=True))
Пример #5
0
def kanja_scraping():
    """Scrape patient records from the Yamanashi prefecture coronavirus page.

    The page is fetched and the heading selected by ``get_title`` is located.
    The text of its following siblings is then grouped into one chunk per
    ``h4`` section, stopping at the next ``h2``. Chunks that start with
    ``県内N例目`` are parsed into records.

    Returns:
        A list of dicts in reverse page order. Each dict carries ``No``,
        whichever of ``年代``/``性別``/``居住地`` were found, ``リリース日``
        (ISO date derived from ``発生判明日``) and ``退院`` (always ``None``).

    Raises:
        requests.RequestException: On network failure, timeout or HTTP error.
    """
    # A timeout keeps a stalled server from blocking the caller forever.
    r = requests.get(
        "https://www.pref.yamanashi.jp/koucho/coronavirus/info_coronavirus_prevention.html",
        timeout=30,
    )
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    h2 = soup.find(get_title)

    data = []
    s = ""

    # Walk the following siblings at the same level as the heading.
    for tag in h2.find_next_siblings():
        if tag.name == "h4":
            # An h4 closes the previous chunk and starts a new one.
            data.append(
                jaconv.z2h(s.rstrip(), kana=False, digit=True, ascii=True))
            s = ""
        elif tag.name == "h2":
            # The next h2 ends the section of interest.
            data.append(
                jaconv.z2h(s.rstrip(), kana=False, digit=True, ascii=True))
            break

        s += tag.get_text(strip=True) + "\n"

    result = []

    # The first chunk is the text before the first h4, so it is skipped.
    for d in data[1:]:
        m = re.match(r"県内\d{1,3}例目", d)

        if m:
            temp = {"No": m.group(0)}

            for i in re.finditer(r"(発生判明日|年代|性別|居住地):(.+)$", d, re.MULTILINE):
                temp[i.group(1)] = i.group(2)

                # A residence line completes one record; a snapshot is
                # emitted so later fields in the same chunk can start another.
                if i.group(1) == "居住地":
                    t = copy.deepcopy(temp)

                    t["リリース日"] = wareki2date(t["発生判明日"]).isoformat()
                    del t["発生判明日"]
                    t["退院"] = None

                    result.append(t)
    return result[::-1]
Пример #6
0
def handleM4A(path):
    """Fill the tags of an M4A file from the components of its path.

    The path is read from the right as
    ``.../CATEGORY/GENRE/ARTIST/ALBUM/SONG``. Each of song, album, artist and
    genre has full-width digits/ASCII narrowed and half-width kana widened.
    When the category is ``__02_Compilations__`` the artist is replaced by
    ``__Compilations__``. The resulting tag values are logged.

    NOTE(review): the tags are assigned on the ``MP4`` object but no save
    call is made here - confirm whether that is intended.
    """
    def _split_last(remaining):
        # Returns (text after the last "/", text before it).
        cut = remaining.rfind("/")
        return remaining[cut + 1:], remaining[:cut]

    def _normalize(name):
        narrowed = jaconv.z2h(name, kana=False, digit=True, ascii=True)
        return jaconv.h2z(narrowed, kana=True, digit=False, ascii=False)

    rest = path.replace("\\", "/")
    song, rest = _split_last(rest)
    album, rest = _split_last(rest)
    artist, rest = _split_last(rest)
    genre, rest = _split_last(rest)
    category, rest = _split_last(rest)

    song = _normalize(song)
    album = _normalize(album)
    artist = _normalize(artist)
    genre = _normalize(genre)

    # Compilation albums are filed under a fixed artist name.
    if category == "__02_Compilations__":
        artist = "__Compilations__"

    mp4 = MP4(path)
    MyLogger.info(path)

    mp4.tags[TRACK_TITLE] = song
    mp4.tags[ALBUM] = album
    mp4.tags[ALBUM_ARTIST] = artist
    mp4.tags[ALBUM_SORT_ORDER] = conv.do(album)
    mp4.tags[ARTIST] = artist
    mp4.tags[ARTIST_SORT_ORDER] = conv.do(artist)
    mp4.tags[GENRE] = genre

    logged_tags = (
        ("mp4.tags[TRACK_TITLE]", TRACK_TITLE),
        ("mp4.tags[ALBUM]", ALBUM),
        ("mp4.tags[ALBUM_ARTIST]", ALBUM_ARTIST),
        ("mp4.tags[ALBUM_SORT_ORDER]", ALBUM_SORT_ORDER),
        ("mp4.tags[ARTIST]", ARTIST),
        ("mp4.tags[ARTIST_SORT_ORDER]", ARTIST_SORT_ORDER),
        ("mp4.tags[GENRE]", GENRE),
    )
    for label, tag_key in logged_tags:
        MyLogger.info(label, str(mp4.tags[tag_key]))
Пример #7
0
def csv_reg(df_in):
    """Normalise the text of every cell of a DataFrame.

    Non-string cells for which ``np.isnan`` is true are left as they are.
    Every other cell is converted to ``str`` and normalised.

    Args:
        df_in: Source DataFrame. Rows are taken from ``df_in.values`` and
            updated in place before being collected.

    Returns:
        A new DataFrame built from the processed rows with the columns of
        ``df_in``.
    """
    rows = []
    # Clean up the strings row by row via the ``values`` array.
    for row in df_in.values:
        for i, cell in enumerate(row):
            # Skip empty (NaN) cells.
            if not isinstance(cell, str):
                if np.isnan(cell):
                    continue
            cell = str(cell)
            # Full-width digits -> half-width.
            cell = jaconv.z2h(cell, kana=False, ascii=False, digit=True)
            # Thousands separator between digits. The replacement used to be
            # the non-raw string '\1,\2', i.e. U+0001 "," U+0002, which
            # injected control characters; the pattern has no groups.
            # NOTE(review): with an ASCII comma in the pattern this is an
            # identity substitution; the original comment says the target is
            # a full-width separator - confirm the intended literal.
            cell = re.sub(r'(?<=\d),(?=\d+)', ',', cell)
            # Unify commas and periods to the Japanese forms.
            cell = cell.replace(',', '、')
            cell = cell.replace('.', '。')
            # Shield spaces and tildes behind placeholders while full-width
            # ASCII is converted to half-width, then restore them.
            # NOTE(review): the original comment says the shielded space is
            # the full-width one - confirm the literals below.
            cell = cell.replace(' ', '■')
            cell = re.sub('[〜~]', '〓から〓', cell)
            cell = jaconv.z2h(cell, kana=False, ascii=True, digit=False)
            cell = cell.replace('■', ' ')
            cell = cell.replace('〓から〓', '〜')
            # Half-width katakana -> full-width.
            cell = jaconv.h2z(cell)
            # Brackets and the time colon.
            # NOTE(review): the original comments say these should become
            # full-width, but the replacement literals are ASCII - confirm.
            cell = re.sub(r'\((.+?)\)', r'(\1)', cell)
            cell = re.sub(r'\[(.+?)\]', r'[\1]', cell)
            cell = re.sub(r'(\d{1,2}):(\d{2})', r'\1:\2', cell)
            # Remove one leading space.
            cell = re.sub('^ ', '', cell)
            # Period after a 1-3 digit number at the start of a line, whether
            # it arrived as "。" or as "." followed by whitespace.
            cell = re.sub(r'^(\d{1,3})。', r'\1.', cell, flags=re.MULTILINE)
            cell = re.sub(r'^(\d{1,3})\.\s', r'\1.', cell, flags=re.MULTILINE)
            # Mark in-cell line breaks and trim surrounding whitespace.
            row[i] = cell.replace('\n', '▽').strip()
        rows.append(row)
    # Re-attach the original header.
    df = pd.DataFrame(rows, columns=df_in.columns)
    return df
Пример #8
0
 def get_patients_last_update(self) -> str:
     """Return the last-update time written on the patients sheet.

     The value is stored in row 3 of ``self.patients_sheet`` in the form
     ``"M/D H時現在"``. Its column is not fixed, so row 3 is scanned from
     column 16 rightwards until a non-empty cell is found.

     Returns:
         The timestamp as an ISO-8601 string with ``jst`` as its timezone.
         The year is hard-coded to 2020.
     """
     column_num = 16
     data_time_str = ""
     while not data_time_str:
         cell_value = self.patients_sheet.cell(row=3, column=column_num).value
         if not cell_value:
             column_num += 1
             continue
         # Digits may be a mix of full- and half-width; unify to half-width.
         data_time_str = jaconv.z2h(str(cell_value), digit=True, ascii=True)
     plus_day = 0
     # strptime cannot parse hour 24, so it is treated as 0:00 on the next day.
     if data_time_str[-5:] == "24時現在":
         # The day may have one or two digits ("12/31" vs "1/1"), so the tail
         # window is shrunk until it holds exactly "<day> <hour part>".
         count = 8
         while True:
             try:
                 day_str, hour_str = data_time_str[-count:].split()
                 if day_str.startswith("/"):
                     # The window still includes the month separator. This
                     # used to be a bare ``raise`` with no active exception.
                     raise ValueError("tail window too wide")
                 break
             except Exception:
                 count -= 1
         data_time_str = data_time_str[:-count] + day_str + " 0時現在"
         plus_day = 1
     # "2020/" is prepended so that datetime can parse the value.
     # TODO: handle the change of year.
     last_update = datetime.strptime("2020/" + data_time_str, "%Y/%m/%d %H時現在") + timedelta(days=plus_day)
     return last_update.replace(tzinfo=jst).isoformat()
def zen2han(input):
    """Convert text to half-width katakana, digits and ASCII.

    A few kana are first swapped for control characters U+0010-U+0014 and
    therefore leave the function as those control characters. After the
    conversion the two voiced-mark replacements below are applied.
    """
    # Replace the special characters before the kana conversion.
    placeholders = {
        'ゐ': '\u0010',
        'ヰ': '\u0010',
        'ゑ': '\u0011',
        'ヱ': '\u0011',
        'ヵ': '\u0012',
        'ヶ': '\u0013',
        'ゎ': '\u0014',
        'ヮ': '\u0014',
    }
    output = "".join(placeholders.get(ch, ch) for ch in input)

    # Hiragana -> katakana, then everything to half-width.
    katakana = jaconv.hira2kata(output)
    output = jaconv.z2h(katakana, kana=True, digit=True, ascii=True)
    output = output.replace('゛', '゙')  # voiced sound mark
    output = output.replace('゜', '゚')  # semi-voiced sound mark
    return output
Пример #10
0
def check_amount(recipesoup,key_number,check_ingredient):
    """Pair ingredient names with quantities and judge the target quantity.

    Names come from the module-level ``ingredient_name_list``. The target
    ingredient is the one whose name fully matches
    ``ingredient_dictlist[key_number-1]["name"]``, used as a regular
    expression.

    Args:
        recipesoup: Parsed recipe page; quantities are read from its
            ``div.ingredient_quantity.amount`` elements.
        key_number: 1-based index into ``ingredient_dictlist``.
        check_ingredient: When 0, the target quantity is treated as empty.

    Returns:
        ``(ingredient_dict, judge_amount)``. ``ingredient_dict`` maps each
        name to its quantity. ``judge_amount`` is 1 when the target quantity
        contains a digit and none of ``~``, ``(``, ``)``, otherwise 0.
    """
    time.sleep(1)
    amount_tags = recipesoup.find_all('div', class_="ingredient_quantity amount")
    ingredient_amount_list = [tag.string for tag in amount_tags]

    # Combine ingredient names and quantities into a dict.
    ingredient_dict = {}
    # Starts empty so that "no ingredient matched" is judged as NG instead of
    # raising UnboundLocalError.
    target_amount = ""
    for name, quantity in zip(ingredient_name_list, ingredient_amount_list):
        ingredient_dict[name] = quantity
        if re.fullmatch(ingredient_dictlist[key_number-1]["name"], name):
            # Keep the matched quantity in half-width form. A tag without a
            # single string yields None, which is treated as empty.
            target_amount = jaconv.z2h(quantity or "", digit=True, ascii=True)
    if check_ingredient == 0:
        target_amount = ""

    # Quantities without a digit, or containing "~" or parentheses, are NG.
    has_digit = re.search(r"\d", target_amount) is not None
    has_forbidden = re.search(r"~|\(|\)", target_amount) is not None
    judge_amount = 1 if has_digit and not has_forbidden else 0
    # judge_amount == 1 means the quantity is accepted.
    return ingredient_dict,judge_amount
Пример #11
0
def getReceipt(sim_pred, resultMap, i):
    """Derive a receipt number from recognised text and store it.

    Nothing is stored when the text contains ``年``, when
    ``priceUtils.checkMnyStr`` accepts it, or when it has no hyphen after
    normalisation. Otherwise ``resultMap['6_receiptNO']`` is set to
    ``<head>ー<tail>``. The head is the digit found in the last character
    before the first hyphen (default ``1``). The tail is the digits found in
    the first four characters after the last hyphen (default ``1234``).
    """
    print('input receipt {}'.format(sim_pred))
    if '年' in sim_pred:
        return
    if priceUtils.checkMnyStr(sim_pred):
        return

    normalized = sim_pred.replace('ー', '-').replace('。', '.')
    normalized = jaconv.z2h(normalized, digit=True, ascii=True)
    if '-' not in normalized:
        return
    # A leading hyphen gets a default head digit in front of it.
    if normalized.startswith('-'):
        normalized = str(1) + normalized

    segments = normalized.split('-')

    head_source = numberUtils.numberReplacement(segments[0][-1])
    head = ''.join(re.findall(r'\d+', head_source)) or '1'

    tail_source = numberUtils.numberReplacement(segments[-1][:4])
    tail = ''.join(re.findall(r'\d+', tail_source)) or '1234'

    resultMap['6_receiptNO'] = head + 'ー' + tail
    print('output receiptNO {}'.format(resultMap['6_receiptNO']))
Пример #12
0
def re_cellstr(str):
    """Normalise the text of one spreadsheet cell.

    Steps: strip, mark line breaks with ``▽``, widen half-width katakana,
    narrow full-width ASCII while shielding tildes and full-width brackets
    behind placeholders, collapse whitespace runs to one space, and turn
    commas into ``/``.

    Args:
        str: The cell text. The parameter name shadows the builtin and is
            kept only for caller compatibility.

    Returns:
        The normalised text.
    """
    text = str.strip()
    # Mark in-cell line breaks.
    text = text.replace('\n', '▽')
    # Half-width katakana -> full-width.
    text = jaconv.h2z(text)

    # Shield the tilde characters from the ASCII conversion below.
    text = re.sub('[〜~]', '〓から〓', text)
    # Shield full-width round (U+FF08/U+FF09) and square (U+FF3B/U+FF3D)
    # brackets. They are written as escapes: the previous literals had been
    # folded to ASCII, which turned the square-bracket pattern into a
    # one-group character class and made the "\2" template raise re.error.
    text = re.sub('(\uFF08)(.+?)(\uFF09)', r'〓Rカッコ〓\2〓Rカッコ〓', text)
    text = re.sub('(\uFF3B)(.+?)(\uFF3D)', r'〓Bカッコ〓\2〓Bカッコ〓', text)
    # Full-width ASCII (including spaces) -> half-width.
    text = jaconv.z2h(text, kana=False, ascii=True, digit=False)
    # Restore the shielded characters.
    text = text.replace('〓から〓', '〜')
    text = re.sub('(〓Rカッコ〓)(.+?)(〓Rカッコ〓)', '\uFF08\\2\uFF09', text)
    text = re.sub('(〓Bカッコ〓)(.+?)(〓Bカッコ〓)', '\uFF3B\\2\uFF3D', text)

    # Collapse runs of whitespace into a single space.
    text = re.sub(r"\s+", " ", text)
    # Commas would add an extra column on the right, so replace them.
    text = re.sub(",", "/", text)
    return text
Пример #13
0
 def eval_force_romaji_to_kana_v2(self, text, kana_ref, nbest=20):
     """Evaluate ``text`` after forcing its romaji into katakana.

     The text is narrowed to half-width digits/ASCII, NFKC-normalised and
     passed through ``jaconv.alphabet2kata``. If that last step changes
     nothing, the sentinel ``12345`` is returned. Otherwise the converted
     text is handed to ``self.eval_normal``.
     """
     narrowed = jaconv.z2h(text, digit=True, ascii=True, kana=False)
     normalized = jaconv.normalize(narrowed, "NFKC")
     # Romanised parts are converted to katakana wherever possible.
     as_kana = jaconv.alphabet2kata(normalized)
     if as_kana != normalized:
         return self.eval_normal(as_kana, kana_ref, nbest)
     return 12345
Пример #14
0
def replaceName(string):
    """Normalise a name with the module's replacement tables.

    Full-width digits and ASCII are narrowed (kana untouched) and hiragana
    becomes katakana. Then each pair in ``trans_string_table`` is applied as
    a substring replacement, ``trans_table`` is applied with
    ``str.translate``, and every match of ``replace_string`` is removed.
    """
    result = hira2kata(z2h(string, digit=True, kana=False))
    for pair in trans_string_table:
        result = result.replace(pair[0], pair[1])
    translated = result.translate(trans_table)
    return re.sub(replace_string, "", translated)
Пример #15
0
def getCategoryAfter(tmpResult,resultMap,i):
  """Parse a category price and append it to ``resultMap['suffix_catPrice']``.

  The text is narrowed to half-width first. It is ignored when it contains
  one of the blocking markers, when ``priceUtils.checkMnyStr`` rejects it, or
  when ``numberUtils.getMny`` returns an empty string. The remaining digits
  form the amount; a hyphen makes it negative. Zero amounts are not stored.
  """
  tmpResult = jaconv.z2h(tmpResult, digit=True, ascii=True)
  # The ASCII hyphen is deliberately not a blocking marker (it marks a
  # negative amount); the long-vowel mark 'ー' is.
  blocking_markers = ('責', 'No', '点', '×', 'ー', ':', 'NO')
  if any(marker in tmpResult for marker in blocking_markers):
    return
  if not priceUtils.checkMnyStr(tmpResult):
    return
  print('input_{} category {}'.format(i,tmpResult))

  tmpResult = numberUtils.getMny(tmpResult)
  if tmpResult == '':
    return

  tmpResult = numberUtils.numberReplacement(tmpResult)
  digits = ''.join(re.findall(r'\d+', tmpResult))
  price = int(digits) if digits != '' else 0
  if '-' in tmpResult:
    price = -price

  if price != 0:
    resultMap['suffix_catPrice'].append(price)
    print('output iCatPrice------------- {}'.format(price))
Пример #16
0
def test_jaconv():
    """Time each jaconv conversion on every test case and log the results.

    For each case from ``get_test_cases`` and each conversion, the function
    is timed with ``calc_time`` and its output is logged at debug level.
    """
    logging.info("=========================================")
    logging.info("=               jaconv                  =")
    logging.info("=========================================")

    # (log heading, conversion) pairs. The first entry used to log the
    # output of hira2hkata under the hira2kata heading.
    conversions = (
        ("ひらがな(全角) to カタカナ(全角) for %s", jaconv.hira2kata),
        ("カタカナ(全角) to ひらがな(全角) for %s", jaconv.kata2hira),
        ("ひらがな(全角) to カタカナ(半角) for %s", jaconv.hira2hkata),
        ("半角 to 全角 for %s", jaconv.h2z),
        ("全角 to 半角 for %s", jaconv.z2h),
    )

    for tc in get_test_cases():
        title = tc['title']
        body = tc['body']

        for heading, convert in conversions:
            logging.info(heading, title)
            calc_time(convert, body)
            logging.debug("result: %s", convert(body))
Пример #17
0
    def convert(self, sent):
        """Replace a trailing alphabetic phrase using ``self.med_dic``.

        The sentence is narrowed to half-width, and a run of letters and
        whitespace that ends the sentence is looked up in ``self.med_dic``
        (as-is first, then lower-cased). A missing or empty entry leaves the
        phrase itself, with one leading and one trailing whitespace
        character removed. The result is widened to full-width again.
        """
        sent = jaconv.z2h(sent, kana=False, ascii=True, digit=True)
        pieces = []
        cursor = 0
        for found in re.finditer(r'([a-zA-Z][a-zA-Z\s]*)$', sent):
            start, end = found.span()
            phrase = found.group(1)
            phrase = re.sub(r'^\s', r'', phrase)
            phrase = re.sub(r'\s$', r'', phrase)

            # Copy the untouched text in front of the match.
            pieces.append(sent[cursor:start])

            if phrase in self.med_dic:
                replacement = self.med_dic[phrase]
            elif phrase.lower() in self.med_dic:
                replacement = self.med_dic[phrase.lower()]
            else:
                replacement = phrase

            # An empty dictionary entry falls back to the phrase itself.
            if replacement == '':
                replacement = phrase

            pieces.append(replacement)
            cursor = end

        # Copy whatever follows the last match.
        pieces.append(sent[cursor:])

        return jaconv.h2z(''.join(pieces), kana=True, ascii=True, digit=True)
Пример #18
0
def test_z2h():
    """Check ``jaconv.z2h`` on a sample word and on the character tables."""
    # The expected values are half-width katakana. They are written as
    # escapes so width-normalising tools cannot fold them back to full-width,
    # which would make the assertions contradict z2h's default kana=True.
    assert_equal(jaconv.z2h('ティロフィナーレ'),
                 '\uff83\uff68\uff9b\uff8c\uff68\uff85\uff70\uff9a')
    # With ignore='ィ' the small "ィ" stays full-width.
    assert_equal(jaconv.z2h('ティロフィナーレ', ignore='ィ'),
                 '\uff83ィ\uff9b\uff8cィ\uff85\uff70\uff9a')
    _compare(partial(jaconv.z2h, kana=True), FULL_KANA, HALF_KANA)
    _compare(partial(jaconv.z2h, ascii=True), FULL_ASCII, HALF_ASCII)
    _compare(partial(jaconv.z2h, digit=True), FULL_DIGIT, HALF_DIGIT)

    # For every flag combination, only the classes being converted are fed
    # in as full-width, so the result must always be entirely half-width.
    for use_ascii in (True, False):
        for use_digit in (True, False):
            for use_kana in (True, False):
                assert_equal(
                    jaconv.z2h(_concat(FULL_KANA if use_kana else HALF_KANA,
                                       FULL_ASCII if use_ascii else HALF_ASCII,
                                       FULL_DIGIT if use_digit else HALF_DIGIT),
                               ascii=use_ascii, digit=use_digit,
                               kana=use_kana),
                    _concat(HALF_KANA, HALF_ASCII, HALF_DIGIT))
Пример #19
0
def test_z2h():
    """Check ``jaconv.z2h`` on a sample word and on the character tables."""
    # The expected values are half-width katakana. They are written as
    # escapes so width-normalising tools cannot fold them back to full-width,
    # which would make the assertions contradict z2h's default kana=True.
    assert_equal(jaconv.z2h('ティロフィナーレ'),
                 '\uff83\uff68\uff9b\uff8c\uff68\uff85\uff70\uff9a')
    # With ignore='ィ' the small "ィ" stays full-width.
    assert_equal(jaconv.z2h('ティロフィナーレ', ignore='ィ'),
                 '\uff83ィ\uff9b\uff8cィ\uff85\uff70\uff9a')
    _compare(partial(jaconv.z2h, kana=True), FULL_KANA, HALF_KANA)
    _compare(partial(jaconv.z2h, ascii=True), FULL_ASCII, HALF_ASCII)
    _compare(partial(jaconv.z2h, digit=True), FULL_DIGIT, HALF_DIGIT)

    # For every flag combination, only the classes being converted are fed
    # in as full-width, so the result must always be entirely half-width.
    for use_ascii in (True, False):
        for use_digit in (True, False):
            for use_kana in (True, False):
                assert_equal(
                    jaconv.z2h(_concat(FULL_KANA if use_kana else HALF_KANA,
                                       FULL_ASCII if use_ascii else HALF_ASCII,
                                       FULL_DIGIT if use_digit else HALF_DIGIT),
                               ascii=use_ascii, digit=use_digit,
                               kana=use_kana),
                    _concat(HALF_KANA, HALF_ASCII, HALF_DIGIT))
Пример #20
0
 def _extract_lemmatized_word(
     word_info,
     use_jaconv=False,
     pos_list=[],
     exclude_numbers=False,
     exclude_symbols=False
 ):
     word_info = word_info.split('\t')
     if len(word_info) > 1:
         word_details = word_info[1].split(',')
         if pos_list != [] and word_details[0] not in pos_list:
             return ''
         if exclude_numbers and word_details[1] == '数':
             return ''
         if exclude_symbols and word_details[0] == '記号':
             return ''
         if len(word_details) > 6 and word_details[6] != '*':
             word = word_details[6]
         else:
             word = word_info[0]
     else:
         word = word_info[0]
     if use_jaconv:
         word = jaconv.z2h(word, digit=True, ascii=True)
         word = jaconv.normalize(word)
     return word
Пример #21
0
 def _split_geocode(geo):
     """Split ``"<digits><name>"`` into ``(code, name)``.

     Surrounding whitespace is stripped first. The leading digit run (which
     may be empty) is returned with full-width digits narrowed, followed by
     the rest of the first line.
     """
     stripped = geo.strip()
     found = re.match(r"(\d*)(.*)", stripped)
     if found is not None:
         return jaconv.z2h(found.group(1), digit=True), found.group(2)
     # No code could be split off.
     return "", stripped
Пример #22
0
def normalize_line(line):
    """Normalise one line of text.

    Applies ``jaconv.normalize``, narrows full-width digits/ASCII (kana
    untouched) and strips the ends. Matches of ``dash_re`` then become ``-``
    and matches of ``ws_re`` become a single space.
    """
    text = jaconv.normalize(line)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    text = text.strip()
    text = dash_re.sub('-', text)
    return ws_re.sub(' ', text)
Пример #23
0
def normalize_txt(text):
    """Strip ``text``, unify character widths and lower-case the result.

    Full-width digits/ASCII become half-width and half-width kana becomes
    full-width.
    """
    narrowed = jaconv.z2h(text.strip(), kana=False, digit=True, ascii=True)
    widened = jaconv.h2z(narrowed, kana=True, digit=False, ascii=False)
    return widened.lower()
Пример #24
0
def my_parser(s):
    """Parse a month/day string into a ``pd.Timestamp`` in ``dt_now``'s year.

    The input is stripped and narrowed to half-width. It must contain
    exactly two groups of one or two digits, read as month then day.
    """
    normalized = jaconv.z2h(s.strip(), kana=False, digit=True, ascii=True)

    year = dt_now.year
    month, day = (int(token) for token in re.findall(r"(\d{1,2})", normalized))

    return pd.Timestamp(year=year, month=month, day=day)
Пример #25
0
def normalize_char_width(string: str) -> str:
    """Bring the character widths in ``string`` to one convention.

    Katakana is widened to full-width first; Latin letters, symbols and
    digits are then narrowed to half-width.
    """
    wide_kana = jaconv.h2z(string, kana=True, ascii=False, digit=False)
    return jaconv.z2h(wide_kana, kana=False, ascii=True, digit=True)
Пример #26
0
 def clean_text(self, content):
     """Narrow ``content`` to half-width and mask matches with tokens.

     The instance patterns ``content_repatter1`` to ``content_repatter6``
     are applied in order, each replacing its matches with the token paired
     with it below.
     """
     content = jaconv.z2h(content, kana=False, digit=True, ascii=True)
     masks = (
         (self.content_repatter1, "<URL>"),
         (self.content_repatter2, "<EMAIL>"),
         (self.content_repatter3, "<TEL>"),
         (self.content_repatter4, "<DATE>"),
         (self.content_repatter5, "<DATE>"),
         (self.content_repatter6, "<PRICE>"),
     )
     for pattern, token in masks:
         content = pattern.sub(token, content)
     return content
Пример #27
0
 def _normalize_age(a):
     """Normalise an age label.

     Whitespace and ``歳`` are removed, ``以上`` becomes ``+`` and full-width
     digits are narrowed. A ``<n>-<m>`` range is returned with both numbers
     zero-padded to two digits; anything else is returned as cleaned.
     """
     label = re.sub(r"\s+", "", a)
     label = label.replace("歳", "").replace("以上", "+")
     label = jaconv.z2h(label, digit=True)
     age_range = re.match(r"^(\d+)\-(\d+)$", label)
     if age_range:
         low = int(age_range.group(1))
         high = int(age_range.group(2))
         return "%02d-%02d" % (low, high)
     return label
Пример #28
0
def _normalize_ja_text(text: str) -> str:
  """Narrows full-width ASCII and digit characters; kana is left as-is.

  Args:
    text: Text to be transformed.

  Returns:
    The text with full-width ASCII/digits replaced by half-width ones.
  """
  width_flags = {"kana": False, "ascii": True, "digit": True}
  return jaconv.z2h(text, **width_flags)
Пример #29
0
def basic_preprocess(text):
    """Apply the basic width, kana and case normalisation to ``text``.

    In order: digits and Latin characters to half-width (hankaku), kana to
    full-width (zenkaku), katakana to hiragana, then lower-casing.
    """
    narrowed = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    widened = jaconv.h2z(narrowed, kana=True, digit=False, ascii=False)
    hiragana = jaconv.kata2hira(widened)
    return hiragana.lower()
Пример #30
0
def check_persons(recipesoup):
    """Return the number of servings stated on a recipe page.

    The servings are read from the ``span.servings_for.yield`` element that
    is a direct child of the first ``div.content``.

    Args:
        recipesoup: Parsed recipe page.

    Returns:
        The first digit found in the servings text, as a string, or the
        integer ``0`` when there is no servings element or it has no digit.

    NOTE(review): only a single digit is captured, so "10" yields "1" -
    confirm whether multi-digit servings should be supported.
    """
    # Container that holds the servings information.
    container = recipesoup.find('div', class_="content")
    persons = 0
    if container is None:
        return persons
    # Looked up once instead of on every loop iteration.
    servings_tag = container.find('span', class_="servings_for yield")
    for child in container:
        # Children with nested markup have no single string; z2h would fail
        # on None.
        if child.string is None:
            continue
        text = jaconv.z2h(child.string, digit=True, ascii=True)
        digit = re.search(r"\d", text)
        if child == servings_tag and digit:
            persons = digit.group()
    # A non-zero result means a servings tag with a digit was found.
    return persons
Пример #31
0
def change_str(input_string):
    """Unify character widths in ``input_string`` and remove its spaces.

    Kana is widened to full-width; digits, symbols and letters are narrowed
    to half-width. Surrounding whitespace and every ASCII space inside the
    text are then removed.
    """
    widened = jaconv.h2z(input_string)
    narrowed = jaconv.z2h(widened, kana=False, digit=True, ascii=True)
    return narrowed.strip().replace(" ", "")
Пример #32
0
def encode_katakana(text):
    """Encode ``text`` character by character via ``TXT_ENC_KATAKANA_MAP``.

    When ``jaconv`` is available each character is first converted to
    half-width katakana. Characters missing from the map are dropped. The
    original author marked this routine as not fully working yet.
    """
    chunks = []
    for ch in text:
        if jaconv:
            # Try to convert Japanese text to half-width katakana.
            # TODO: the conversion may yield several characters; if so, each
            # one would have to be looked up separately below.
            ch = jaconv.z2h(jaconv.hira2kata(ch))

        if ch not in TXT_ENC_KATAKANA_MAP:
            # TODO: unmapped characters are silently discarded; consider
            # raising when a character cannot be encoded.
            continue
        chunks.append(TXT_ENC_KATAKANA_MAP[ch])
    return b"".join(chunks)
Пример #33
0
 def run(self, inp, outp):
     """Render a YAML profile as a PDF via Markdown and HTML.

     Each top-level key except ``主要業績`` becomes a ``##`` section with a
     table of its sub-entries. The Markdown is written to ``mytable.md``,
     converted to ``mytable.html`` with pandoc and to ``outp`` with
     wkhtmltopdf; both intermediate files are then removed.

     Args:
         inp: Path of the YAML input file.
         outp: Path of the PDF to produce.
     """
     # Every file is opened in a ``with`` block so its handle is closed
     # before the external tools read or overwrite it.
     with open(inp) as source:
         obj = ruamel.yaml.load(source, ruamel.yaml.RoundTripLoader)
     output = ''
     for key in obj:
         if key == u'主要業績':
             continue
         output += u'##' + key + '\n\n'
         table = []
         for subkey in obj[key]:
             val = obj[key][subkey]
             if type(subkey) == type(u'unicode'):
                 subkey = jaconv.z2h(subkey)
             if key == u'名前' and subkey == 'ja':
                 # The Japanese name becomes the document title.
                 output = '# %s\n\n--------------\n\n%s' % (val, output)
             if not val or (type(val) == type('string') and not val.strip()):
                 # Empty value: placeholder, replaced by &nbsp; in the HTML.
                 table.append(['**%s**' % subkey, 'XXXXX'])
             elif type(val) == type(u'unicode') or type(val) == type('string'):
                 # Plain string value.
                 table.append(['**%s**' % subkey, val])
             else:
                 # List value: one row per item, label on the first only.
                 table.append(['**%s**' % subkey, val[0]])
                 for i in range(1, len(val), 1):
                     table.append(['', val[i]])
         output += tabulate(table) + '\n\n'

     with codecs.open('mytable.md', 'w+', 'utf-8') as md_file:
         md_file.write(output)
     call(["pandoc", "-f", "markdown", "-t", "html5", "-s", "--css", "page.css", "-o", "mytable.html", "mytable.md"])
     with open('mytable.html') as html_file:
         txt = html_file.read()
     # Drop inline style attributes from the generated HTML.
     txt = re.sub(r'style="[^"]+"', '', txt)
     txt = txt.replace('XXXXX', '&nbsp;')
     with open('mytable.html', 'w+') as html_file:
         html_file.write(txt)
     call(["wkhtmltopdf", "mytable.html", outp])
     os.remove('mytable.html')
     os.remove('mytable.md')
Пример #34
0
def normalize(t):
    """Apply NFKC normalisation, then narrow full-width digits and ASCII."""
    nfkc_text = unicodedata.normalize('NFKC', t)
    return jaconv.z2h(nfkc_text, kana=False, digit=True, ascii=True)