def run(generator, normalize_space=False):
    for line in generator:
        line = line.strip('\n')
        if normalize_space:
            linez = zenhan.h2z(line, mode=7)
        else:
            linez = zenhan.h2z(line, mode=7, ignore=' ')
        print(linez)

def normalize(ingredient):
    ingredient = ingredient.strip()
    for SURROUND in SURROUNDS:
        ingredient = SURROUND.sub(lambda s: '', ingredient)
    ingredient = OPTIONAL_START.sub(lambda s: '', ingredient)
    match = UNCLOSED_PAREN.match(ingredient)
    if match:
        ingredient = match.groups()[0]
    ingredient = zenhan.z2h(ingredient, mode=1)  # ascii
    ingredient = zenhan.h2z(ingredient, mode=4)  # kana
    # convert all katakana to hiragana
    ingredient = hiragana(ingredient)
    match = STARTS_WITH_ALPHA.match(ingredient)
    if match and not ingredient.startswith('S&B'):
        ingredient = match.groups()[0]
    for SPECIAL_SYMBOL in SPECIAL_SYMBOLS:
        ingredient = SPECIAL_SYMBOL.sub(lambda s: '', ingredient)
    ingredients = SPLIT.split(ingredient)
    ingredients = map(lambda ingr: ENDS_WITH.sub(lambda s: '', ingr), ingredients)
    ingredients = map(lambda ingr: ingr.strip(), ingredients)
    ingredients = filter(lambda ingr: ingr, ingredients)
    for ingredient in ingredients:
        yield ingredient

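# The snippet above relies on module-level regexes (SURROUNDS, SPLIT, ...) and
# a hiragana() helper that are not shown. A minimal sketch of hiragana(),
# assuming the common code-point-shift approach (katakana and hiragana blocks
# sit 0x60 apart in Unicode); this is an illustration, not the original helper.
def hiragana(text):
    return ''.join(
        chr(ord(ch) - 0x60) if 'ァ' <= ch <= 'ヶ' else ch
        for ch in text
    )
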
def __to_unicode(self, record):
    for k, v in record.iteritems():
        if k in ('prefecture_kana', 'city_kana', 'local_area_kana'):
            record[k] = zenhan.h2z(unicode(v, 'shift-jis'), zenhan.ALL)
        elif k in ('prefecture', 'city', 'local_area'):
            record[k] = unicode(v, 'shift-jis')
    return record

def __priceText(self, val, zen=False):
    prefix = '-' if val < 0 else '¥'
    string = prefix + "{:,d}".format(val)
    if zen:
        string = zenhan.h2z(string)
    space = (self.width - self.area) - (Receipt.strWidth(string))
    return (" " * space) + string

def test_zenhan():
    logging.info("=========================================")
    logging.info("= zenhan =")
    logging.info("=========================================")
    test_cases = get_test_cases()
    for tc in test_cases:
        title = tc['title']
        body = tc['body']
        logging.info("ひらがな(全角) to カタカナ(全角) for %s" % title)
        logging.info("Not implemented")
        logging.info("カタカナ(全角) to ひらがな(全角) for %s" % title)
        logging.info("Not implemented")
        logging.info("ひらがな(全角) to カタカナ(半角) for %s" % title)
        logging.info("Not implemented")
        logging.info("半角 to 全角 for %s" % title)
        calc_time(zenhan.h2z, body, zenhan.ASCII | zenhan.KANA | zenhan.DIGIT)
        logging.debug("result: %s" % zenhan.h2z(body, zenhan.ASCII | zenhan.KANA | zenhan.DIGIT))
        logging.info("全角 to 半角 for %s" % title)
        calc_time(zenhan.z2h, body, zenhan.ASCII | zenhan.KANA | zenhan.DIGIT)
        logging.debug("result: %s" % zenhan.z2h(body, zenhan.ASCII | zenhan.KANA | zenhan.DIGIT))

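# calc_time() and get_test_cases() are referenced above but not shown. A
# minimal sketch of calc_time, assuming it simply times one call and logs the
# elapsed wall-clock time; the real helper may differ.
import logging
import time

def calc_time(func, *args):
    start = time.time()
    func(*args)
    logging.info("elapsed: %.6f sec" % (time.time() - start))
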
def _set_price(self, val, zen=False):
    prefix = '-' if val < 0 else '¥'
    string = prefix + "{:,d}".format(val)
    if zen:
        string = zenhan.h2z(string)
    space = (self.width - self.area) - (self.str_width(string))
    return (" " * space) + string

def setType(string, types=[
        'L', 'P', 'S', 'X', 'スピリット', 'チューナー', 'デュアル', 'トゥーン',
        'ユニオン', 'リバース', '儀式', '効果', '特殊召喚', '融合', '通常']):
    if '特召' in string:
        string = string.replace('特召', '特殊召喚')
    string = zenhan.h2z(string, zenhan.KANA)
    if not ('魔法' in string or '罠' in string):
        for type in types:
            if type in string:
                string = string.replace(type, '/' + type)
        string = string[1:]
    string = zenhan.h2z(string, 4)
    type_array = string.split('/')
    return type_array

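# A hedged usage sketch for setType above: each known card-type substring is
# prefixed with '/', the leading '/' is dropped, and the result is split. The
# expected output is inferred by tracing the code, not taken from the original
# project's tests.
print(setType('効果ﾁｭｰﾅｰ'))  # expected: ['効果', 'チューナー']
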
def norm(s):
    s = s.split("※", 1)[0]
    s = s.replace("　", " ")
    s = s.replace("−", "-")
    s = zenhan.z2h(s, mode=7)
    s = zenhan.h2z(s, mode=4)
    s = s.strip()
    return s

def normalize(self, text):
    # alphabet: zenkaku -> hankaku
    text = zenhan.z2h(text, mode=1)
    # digits: zenkaku -> hankaku
    text = zenhan.z2h(text, mode=2)
    # katakana: hankaku -> zenkaku
    text = zenhan.h2z(text, mode=4)
    return text

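# The integer modes used throughout these snippets are zenhan's bit flags:
# ASCII = 1, DIGIT = 2, KANA = 4, and ALL = 7 (the OR of the three; compare
# the test asserting ALL == ASCII|DIGIT|KANA below). A quick sanity check:
import zenhan

assert zenhan.ALL == zenhan.ASCII | zenhan.DIGIT | zenhan.KANA == 7
print(zenhan.z2h("ＡＢＣ１２３アイウ", mode=zenhan.ASCII))  # only the letters narrow
print(zenhan.z2h("ＡＢＣ１２３アイウ", mode=zenhan.ALL))    # everything narrows
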
def conv(txt, unic=False):
    kZ = unicode(txt)
    kZ = zenhan.z2h(kZ)
    kZ = kZ.lower()
    kZ = zenhan.h2z(kZ)
    if unic:
        return kZ
    kZ = kZ.encode('utf8')
    return kZ

def delete_aft(line):
    text = zenhan.z2h(line, mode=1)  # alphabet (zenkaku -> hankaku)
    text = zenhan.z2h(text, mode=2)  # digits (zenkaku -> hankaku)
    text = zenhan.h2z(text, mode=4)  # katakana (hankaku -> zenkaku)
    # strip everything else (symbols, punctuation, rare CJK ranges)
    text = re.sub(
        r'[\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u3004\u3007-\u303F\u3099-\u30A0\u30FB\u30FD-\u4DFF\uA000-\uE0FFF]',
        "", text)
    return text

def get_bus_timetable(wbname, sheetname, stop_offset_row, stop_offset_col,
                      stopdirection, timetable_offset_row,
                      timetable_offset_col, chk_func):
    xls = xlsReader(wbname, sheetname)
    stop_name_list = []
    if stopdirection == DataDirection.row:
        busdirection = DataDirection.col
    else:
        busdirection = DataDirection.row
    xls.set_offset(stop_offset_row, stop_offset_col)
    while True:
        v = xls.get_cell()
        if not v:
            break
        v = zenhan.h2z(v)
        v = v.replace('\n', '')
        stop_name_list.append(v)
        xls.next_cell(stopdirection)
    buslist = []
    busrow = timetable_offset_row
    buscol = timetable_offset_col
    xls.set_offset(busrow, buscol)
    while True:
        setflg = False
        stoptime = []
        if stopdirection == DataDirection.row:
            busrow = timetable_offset_row
        else:
            buscol = timetable_offset_col
        xls.set_offset(busrow, buscol)
        item = []
        for i in range(len(stop_name_list)):
            v = xls.get_cell()
            if not v:
                busrow, buscol = xls.next_cell(stopdirection)
                continue
            item.append({
                'busstop': stop_name_list[i],
                'busstopIx': i,
                'time': '%02d:%02d' % (v[3], v[4])
            })
            setflg = True
            busrow, buscol = xls.next_cell(stopdirection)
        if not setflg:
            break
        if chk_func:
            if chk_func(xls.workbook, xls.sheet, busrow, buscol, item):
                buslist.append(item)
        else:
            buslist.append(item)
        busrow, buscol = xls.next_cell(busdirection)
    return buslist

def run(self, edit):
    for region in self.view.sel():
        select_texts = self.view.substr(region)
        if select_texts != "":
            zen2han_text = zenhan.h2z(select_texts, zenhan.KANA)
            han2zen_text = zenhan.z2h(select_texts, zenhan.KANA)
            if select_texts != zen2han_text:
                self.view.replace(edit, region, zen2han_text)
            elif select_texts != han2zen_text:
                self.view.replace(edit, region, han2zen_text)

def zenhan_search(self, statement, numOfResult):
    han_statement = zenhan.z2h(statement)
    zen_statement = zenhan.h2z(statement)
    han_list = self.tokenizer.split_query(han_statement)
    zen_list = self.tokenizer.split_query(zen_statement)
    if han_statement != zen_statement:
        to_search = han_list + zen_list
    else:
        to_search = self.tokenizer.split_query(statement)
    return self._search(to_search, numOfResult)

def delete_symbol(line):
    text = zenhan.z2h(line, mode=1)  # alphabet (zenkaku -> hankaku)
    text = zenhan.z2h(text, mode=2)  # digits (zenkaku -> hankaku)
    text = zenhan.h2z(text, mode=4)  # katakana (hankaku -> zenkaku)
    symbol = re.sub(r'[\u0000-\uE0FFF]', "", text)  # characters outside the handled range
    text = re.sub(
        r'[\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u3004\u3007-\u303F\u3099-\u30A0\u30FB\u30FD-\u4DFF\uA000-\uE0FFF]',
        "", text)  # strip remaining symbols and rare ranges
    # remove the characters the ranges above did not cover
    # (re.escape guards against regex metacharacters in `symbol`)
    if not symbol == "":
        text = re.sub("[%s]" % re.escape(symbol), "", text)
    return text

def normalize(data):
    NBSP = b"\xC2\xA0".decode("UTF-8")
    data = unicodedata.normalize("NFKC", zenhan.z2h(zenhan.h2z(data.replace(NBSP, ""))))
    # 0x2010 -- 0x2015
    dashesU8 = [b'\xe2\x80\x90', b'\xe2\x80\x91', b'\xe2\x80\x92',
                b'\xe2\x80\x93', b'\xe2\x80\x94', b'\xe2\x80\x95']
    dashes = "".join([s.decode("UTF-8") for s in dashesU8])
    digits = re.match("^[0-9\\+\\-{0}]+$".format(dashes), data)
    if digits:
        for d in dashes:
            data = data.replace(d, "-")
    return data

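# A hedged usage sketch for normalize() above: purely numeric strings that use
# Unicode dashes (U+2010..U+2015) come back with plain ASCII hyphens, while
# text that is not all digits/signs keeps its dashes. Expected values are
# inferred from the code, not from the original project's tests.
print(normalize("０１２\u2013３"))   # expected: "012-3"
print(normalize("東京\u2014大阪"))  # expected: unchanged dash, "東京\u2014大阪"
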
def check(self, text):
    """Convert and filter text so that it consists only of zenkaku
    JIS X 0208 characters, and return the result."""
    if type(text) != type(''):
        return -1
    text2 = self.htmlentity2unicode(self.htmlentity2unicode(text))
    text_norm = unicodedata.normalize('NFKC', text2)
    text_zen = zenhan.h2z(text_norm)
    zyokyo_list = []
    for zen in text_zen:
        if zen not in self.char_set:
            zyokyo_list.append(zen)
    for zyokyo in zyokyo_list:
        text_zen = text_zen.replace(zyokyo, '')  # remove
    return text_zen

def delete_twitter(line):
    text = zenhan.z2h(line, mode=1)  # alphabet (zenkaku -> hankaku)
    text = zenhan.z2h(text, mode=2)  # digits (zenkaku -> hankaku)
    text = zenhan.h2z(text, mode=4)  # katakana (hankaku -> zenkaku)
    symbol = re.sub(r'[\u0000-\uE0FFF]', "", text)  # characters outside the handled range
    text = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)  # URLs
    text = re.sub(r'@[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)  # user names
    text = re.sub(r'#[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)  # hashtags
    text = re.sub(
        r'[\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u3004\u3007-\u303F\u3099-\u30A0\u30FB\u30FD-\u4DFF\uA000-\uE0FFF]',
        "", text)  # strip remaining symbols and rare ranges
    # remove the characters the ranges above did not cover
    # (re.escape guards against regex metacharacters in `symbol`)
    if not symbol == "":
        text = re.sub("[%s]" % re.escape(symbol), "", text)
    return text

def build_corpus(target, corpus=None, raw=False):
    targetfp = open(target, "rb")
    corpusfp = open(corpus, "wb")
    for line in tqdm(targetfp):
        # hankaku kana to zenkaku kana
        line = zenhan.h2z(line.decode("utf-8"), mode=4)
        # tokenize and normalize verb
        line = tokenizer.wakati(line, normalize_verb=True)
        # corpusfp is opened in binary mode, so encode before writing
        corpusfp.write((line + "\n").encode("utf-8"))
    targetfp.close()
    corpusfp.close()

def kansuji2arabic(kstring):
    """
    :param kstring: word which indicates a number
    :return: word represented by Arabic numerals
    """
    # https://qiita.com/cof/items/58ddf898db25db561a54
    tt_ksuji = str.maketrans('一二三四五六七八九〇壱弐参', '1234567890123')
    re_suji = re.compile(r'[十拾百千万億兆\d]+')
    re_kunit = re.compile(r'[十拾百千]|\d+')
    re_manshin = re.compile(r'[万億兆]|[^万億兆]+')
    TRANSUNIT = {'十': 10, '拾': 10, '百': 100, '千': 1000}
    TRANSMANS = {'万': 10000, '億': 100000000, '兆': 1000000000000}

    def _transvalue(sj, re_obj=re_kunit, transdic=TRANSUNIT):
        unit = 1
        result = 0
        for piece in reversed(re_obj.findall(sj)):
            if piece in transdic:
                if unit > 1:
                    result += unit
                unit = transdic[piece]
            else:
                val = int(piece) if piece.isdecimal() else _transvalue(piece)
                result += val * unit
                unit = 1
        if unit > 1:
            result += unit
        return result

    transuji = kstring.translate(tt_ksuji)
    for suji in sorted(set(re_suji.findall(transuji)), key=lambda s: len(s), reverse=True):
        if not suji.isdecimal():
            arabic = _transvalue(suji, re_manshin, TRANSMANS)
            arabic = str(arabic)
            transuji = transuji.replace(suji, arabic)
    return zenhan.h2z(transuji)

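# A quick hedged check of kansuji2arabic: kanji numerals are converted to
# Arabic digits and then widened with zenhan.h2z, so the result is full-width.
# Expected values are traced from the code, not taken from the original tests.
print(kansuji2arabic("千二百三十四"))  # expected: "１２３４"
print(kansuji2arabic("三万五千"))      # expected: "３５０００"
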
def sentiment_analysis():
    sentence = request.json['utterance']
    sentence = zenhan.h2z(sentence)
    sentiment_analysis_client = xmlrpc_client.ServerProxy(
        f'http://{host}:{port}')
    prediction = sentiment_analysis_client.get_prediction(
        sentence)  # 1 (Pos) or -1 (Neg)
    if prediction == 1:
        result = 'Positive'
    elif prediction == -1:
        result = 'Negative'
    else:
        result = 'None'
    body = {"status": "OK", "result": result}
    r = HTTPResponse(status=200, body=body)
    r.set_header("Content-Type", "application/json")
    return r

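# The endpoint above talks to an XML-RPC service exposing get_prediction().
# A minimal sketch of a matching server, assuming the classifier returns
# 1 or -1; the model call and the port are placeholders, not the original
# project's code.
from xmlrpc.server import SimpleXMLRPCServer

def get_prediction(sentence):
    # placeholder: plug the real sentiment model in here
    return 1

server = SimpleXMLRPCServer(('localhost', 8000), allow_none=True)
server.register_function(get_prediction)
server.serve_forever()
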
def preprocess_word(words):
    """
    :param words: a list of Word instances
    :return: a list of Word instances with the preprocessed expressions
    """
    n_words = len(words)
    for i in range(n_words):
        word = words[i]
        # remove symbols
        preprocessed_word = unicodedata.normalize("NFKC", word.surface)
        table = str.maketrans("", "", string.punctuation + "「」、。・")
        preprocessed_word = preprocessed_word.translate(table)
        # convert to zenkaku
        preprocessed_word = zenhan.h2z(preprocessed_word)
        # register preprocessed word
        words[i].p_surface = preprocessed_word
        words[i].alias = [preprocessed_word]
    return words

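# Word is not defined in this snippet. A minimal stand-in with the three
# attributes preprocess_word() touches (surface, p_surface, alias); the
# original class very likely carries more fields.
from dataclasses import dataclass, field
from typing import List

@dataclass
class Word:
    surface: str
    p_surface: str = ""
    alias: List[str] = field(default_factory=list)
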
def month_search(line):
    """Detect whether the line is a month header, a commented-out line,
    or a schedule entry, and return the result as an int."""
    # template for the month heading
    month_temp = ['<h4><', '月></h4>']
    # admittedly crude matching
    if month_temp[0] in line and month_temp[1] in line:
        month = int(zenhan.h2z(line.replace(month_temp[0], '').replace(month_temp[1], '').replace(' ', '')))
        return month
    # skip commented-out entries
    elif '<!--' in line:
        # handle a comment that opens and closes on the same line
        if '-->' in line:
            return -1
        else:
            return -2
    # end of a comment block
    elif '-->' in line:
        return -1
    # a schedule line
    else:
        return 0

def flash_message(sender):
    textview = sender.superview['textview']
    source = textview.text
    slider = sender.superview['slider']
    speed = (.1 + slider.value) / 10
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize(source)
    word = ''
    for token in tokens:
        if token.reading == '*':
            word += token.surface
        else:
            word += token.reading
    word = word.upper()
    word = zenhan.h2z(word)
    textview.text = word
    flash.flash_signals(word, speed)

def __init__(self, example_id, words, lines, knp_string=None, heads=None,
             token_tags=None, comment=None, h2z=False):
    self.example_id = example_id
    self.words = words
    self.lines = lines
    self.knp_string = knp_string
    self.heads = heads
    self.token_tags = token_tags
    self.token_tag_indices = defaultdict(list)
    self.comment = comment
    self.h2z = h2z
    if self.h2z is True:
        from copy import deepcopy
        import zenhan
        self.words_orig = deepcopy(self.words)
        self.words = [zenhan.h2z(word) for word in words]

def test_h2z_all(self):
    converted = zenhan.h2z(self.original, zenhan.ALL)
    self.assertEqual(converted, u("゜ａｂｃＤＥ゛Ｆ１２３４５６アガサダナバビプペ゜"))
    self.assertEqual(zenhan.h2z(self.original, zenhan.ALL),
                     zenhan.h2z(self.original, zenhan.ASCII | zenhan.DIGIT | zenhan.KANA))

def hankaku_to_zenkaku(text):
    '''Convert hankaku (half-width) characters to zenkaku (full-width).'''
    return zenhan.h2z(text, mode=7)

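# mode=7 is zenhan.ALL (ASCII | DIGIT | KANA), so every convertible half-width
# character widens, including the plain space (which is why another snippet
# above passes ignore=' ' to keep spaces narrow):
print(hankaku_to_zenkaku("abc123 ｱｲｳ"))  # -> "ａｂｃ１２３　アイウ"
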
def test_h2z_kana_only(self):
    converted = zenhan.h2z(self.original, zenhan.KANA)
    self.assertEqual(converted, u("゜abcDE゛F123456アガサダナバビプペ゜"))

def test_h2z_ascii_and_digit(self):
    converted = zenhan.h2z(self.original, zenhan.ASCII | zenhan.DIGIT)
    self.assertEqual(converted, u("ﾟａｂｃＤＥﾞＦ１２３４５６ｱｶﾞｻﾀﾞﾅﾊﾞﾋﾞﾌﾟﾍﾟﾟ"))

def test_h2z_ascii_and_kana(self):
    converted = zenhan.h2z(self.original, zenhan.ASCII | zenhan.KANA)
    self.assertEqual(converted, u("゜ａｂｃＤＥ゛Ｆ123456アガサダナバビプペ゜"))

# from https://github.com/tosaka-m/japanese_realtime_tts/blob/master/src/jrtts/text_utils/convert.py
from functools import reduce
import jaconv
import re
import MeCab
import unicodedata
import zenhan
from pykakasi import kakasi
from text_jp.yomi2voca import mapper_no_space_removeST, keta_mapper
import logging

logger = logging.getLogger(__name__)
mecab = MeCab.Tagger('-Ochasen')
mecab_yomi = MeCab.Tagger('-Oyomi')
digit_mapper = {i: mecab_yomi.parse(zenhan.h2z(str(i)))[:-1] for i in range(10)}
_kakasi = kakasi()
_kakasi.setMode('J', 'H')
kakasi_converter = _kakasi.getConverter()


def hiragana2onso(text):
    orig = text
    # strip special whitespace
    text = _remove_special_space(text)
    for k, v in mapper_no_space_removeST.items():
        text = text.replace(k, v)
    text = _kigou2roman(text, add_space=False)
    return text

def normalize(data):
    NBSP = b"\xC2\xA0".decode("UTF-8")
    return unicodedata.normalize("NFKC", zenhan.z2h(zenhan.h2z(data.replace(NBSP, ""))))

async def fw(self, ctx, *, chars):
    """Make full-width meme text."""
    await ctx.message.edit(content=zenhan.h2z(chars))

def test_h2z_digit_only(self):
    converted = zenhan.h2z(self.original, zenhan.DIGIT)
    self.assertEqual(converted, u("ﾟabcDEﾞF１２３４５６ｱｶﾞｻﾀﾞﾅﾊﾞﾋﾞﾌﾟﾍﾟﾟ"))

def get_recipe(url, dish):
    #print "**********"
    #print dish
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    response1 = response.read()
    soup = BeautifulSoup(response1, "html.parser")
    recipe = ""
    for p in soup.findAll('p', text=False):
        if p.text.find("人") != -1 and p.text.find("材料") != -1:
            recipe = p.text
    temps = recipe.split("\n")
    elements = dict()
    amount = 0
    people = 0
    # print recipe
    for temp in temps:
        # calculate number of people (done)
        if temp.find("材料") != -1:
            people = float(re.search("[0-9]", zenhan.z2h(temp, 2)).group())
        # get each element for one man (done)
        elif temp != "":
            element = temp.replace("●", "").replace("○", "").replace("〇", "").replace("◎", "").lstrip(" ")
            if temp.find("…") != -1:
                element = element.split("…")
            else:
                element = element.split(None, 1)
            # print element
            if people != 0:
                # print people
                # convert all string to hankaku
                if len(element) >= 2:
                    han_element = zenhan.z2h(element[1], 2)
                else:
                    # print element
                    break
                # match unit
                #print element[1]
                #print han_element
                unit = re.search("[^0-9\/~ ]+", han_element).group(0)
                #print zenhan.z2h(element[1],2)
                string_amount = re.search("[0-9\/ ]+", han_element.replace(unit, ""))
                #print string_amount
                if string_amount != None:
                    amount = float(sum(Fraction(s) for s in string_amount.group(0).split())) / people
                else:
                    amount = 0
            else:
                print "people=0"
            #print element[0]+"\t"+str(amount)+"\t"+unit
            elements.update({element[0]: [amount, zenhan.h2z(unit, 4)]})
    return elements

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import requests
import sys
import zenhan

server_host = 'localhost'
server_port = '5000'

if len(sys.argv) <= 1:
    print "Usage: %s <text>" % sys.argv[0]
    sys.exit(0)

text = zenhan.h2z(sys.argv[1].decode('utf-8'))
r = requests.get('http://%s:%s/jumanpp?q=%s' % (server_host, server_port, text))
assert r.status_code == 200
d = json.loads(r.text)
print d['text']
for j in d['jumanpp']:
    print "['" + "','".join(j) + "']"

def clean_first_name(self):
    first_name = self.cleaned_data.get("first_name")
    if not Strings.is_kana(first_name):
        # message: "Only katakana may be entered."
        raise forms.ValidationError("入力できるのはカタカナのみです。")
    return zenhan.h2z(first_name)

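# Strings.is_kana is not shown. A plausible sketch, assuming it accepts both
# full- and half-width katakana (h2z is applied afterwards, so half-width
# input is evidently expected); the real helper may differ.
import re

class Strings:
    @staticmethod
    def is_kana(s):
        return bool(s) and bool(re.fullmatch(r'[ァ-ヶーｦ-ﾟ]+', s))
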
def test_h2z_digit_and_kana(self):
    converted = zenhan.h2z(self.original, zenhan.DIGIT | zenhan.KANA)
    self.assertEqual(converted, u("゜abcDE゛F１２３４５６アガサダナバビプペ゜"))

def convert_zenkau(self, src):
    sMsg = src.decode('utf-8')
    sZen = zenhan.h2z(sMsg, mode=7, ignore=())
    return sZen

def h2z(self, text):
    """Convert every hankaku character in the string to zenkaku.
    @param string text  the string to convert
    @return string      the converted result"""
    return zenhan.h2z(unicode(text, "utf-8")).encode("utf-8")  # convert hankaku into zenkaku

# (fragment: body of a word-histogram routine iterating over MeCab nodes)
        if m.surface:
            #help (m)
            key = m.surface
            info = m.feature.split(",")
            label = info[0]
            label2 = info[1]
            bform = info[-3]
            prono = info[-2]
            #print key, m.feature
            if label == '名詞' and label2 not in ['接尾', '非自立', '特殊', '代名詞']:
                result[key] = result.get(key, 0) + 1
            #print m.surface, "\t", label, bform, prono, m.feature
        m = m.next
    for k, v in sorted(result.items(), key=lambda x: x[1], reverse=True):
        print k, v


def pyon(s):
    end = ["。", "!", "?", ".", "・", "…", "\n", "\r"]
    print re.sub("((?:%s)+)" % "|".join(end), 'ぴょん\\1', s)


if __name__ == "__main__":
    s = sentence
    if len(sys.argv) > 1:
        s = readStr(sys.argv[1])
    s = zenhan.h2z(s.decode('utf-8')).encode('utf-8')
    #showHist (s)
    pyon(s)

def normalize(sentence):
    sentence = unicodedata.normalize('NFKC', sentence)
    sentence = zenhan.h2z(sentence)
    sentence = sentence.replace(u'　', u'').replace(u' ', u'')
    return re.sub(r"\s*", u'', sentence)

import csv
import os
import sys

import zenhan
from pyknp import Juman
from tqdm import tqdm

jumanpp = Juman()
turn_dir = sys.argv[1]
wakati_dir = sys.argv[2]
tsvs = os.listdir(turn_dir)
for tsv in tqdm(tsvs):
    tmp = []
    with open(os.path.join(turn_dir, tsv), newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            wakati_row = []
            for text in row:
                result = jumanpp.analysis(zenhan.h2z(text))
                wakati_text = ' '.join(mrph.midasi for mrph in result.mrph_list())
                wakati_row.append(wakati_text)
            tmp.append(wakati_row)
    with open(os.path.join(wakati_dir, tsv), 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        writer.writerows(tmp)

def test_h2z_ascii_only(self):
    converted = zenhan.h2z(self.original, zenhan.ASCII)
    self.assertEqual(converted, u("ﾟａｂｃＤＥﾞＦ123456ｱｶﾞｻﾀﾞﾅﾊﾞﾋﾞﾌﾟﾍﾟﾟ"))

#coding:utf-8
from functools import reduce
import jaconv
import re
import MeCab
import unicodedata
import zenhan
from pykakasi import kakasi
from .yomi2voca import mapper_no_space_removeST, keta_mapper
import logging

logger = logging.getLogger(__name__)
mecab = MeCab.Tagger('-Ochasen')
mecab_yomi = MeCab.Tagger('-Oyomi')
digit_mapper = {
    i: mecab_yomi.parse(zenhan.h2z(str(i)))[:-1]
    for i in range(10)
}
_kakasi = kakasi()
_kakasi.setMode('J', 'H')
kakasi_converter = _kakasi.getConverter()


def hiragana2onso(text):
    orig = text
    # strip special whitespace
    text = _remove_special_space(text)
    for k, v in mapper_no_space_removeST.items():
        text = text.replace(k, v)