Пример #1
0
def run(generator, normalize_space=False):
    """Print each line of ``generator`` after ``zenhan.h2z(..., mode=7)``.

    Newline characters are stripped from both ends of every line first.
    Unless ``normalize_space`` is true, ``ignore=' '`` is passed to
    ``zenhan.h2z`` along with the line.
    """
    for raw in generator:
        stripped = raw.strip('\n')
        # Only the default variant passes an ignore argument.
        extra = {} if normalize_space else {'ignore': ' '}
        print(zenhan.h2z(stripped, mode=7, **extra))
Пример #2
0
def normalize(ingredient):
    """Yield the cleaned-up, non-empty name(s) found in ``ingredient``.

    The text is stripped, everything matched by the ``SURROUNDS`` and
    ``OPTIONAL_START`` patterns is removed, it is cut down to the first
    group of ``UNCLOSED_PAREN`` / ``STARTS_WITH_ALPHA`` when those match
    (the latter is skipped for text starting with ``'S&B'``), width and
    kana are normalised, ``SPECIAL_SYMBOLS`` matches are removed, and the
    result is split with ``SPLIT``.
    """
    text = ingredient.strip()

    for pattern in SURROUNDS:
        text = pattern.sub('', text)
    text = OPTIONAL_START.sub('', text)

    unclosed = UNCLOSED_PAREN.match(text)
    if unclosed:
        text = unclosed.group(1)

    text = zenhan.z2h(text, mode=1)  # ascii
    text = zenhan.h2z(text, mode=4)  # kana
    text = hiragana(text)  # convert all katakana to hiragana

    leading = STARTS_WITH_ALPHA.match(text)
    if leading and not text.startswith('S&B'):
        text = leading.group(1)

    for pattern in SPECIAL_SYMBOLS:
        text = pattern.sub('', text)

    # Trim each piece and drop the ones that end up empty.
    for piece in SPLIT.split(text):
        piece = ENDS_WITH.sub('', piece).strip()
        if piece:
            yield piece
 def __to_unicode(self, record):
     """Decode the Shift-JIS address fields of ``record`` in place.

     The ``*_kana`` fields are additionally converted with
     ``zenhan.h2z(..., zenhan.ALL)``.  Other keys are left untouched.
     Returns the same ``record`` object.
     """
     kana_fields = ('prefecture_kana', 'city_kana', 'local_area_kana')
     plain_fields = ('prefecture', 'city', 'local_area')
     for field, raw in record.iteritems():
         if field in kana_fields:
             record[field] = zenhan.h2z(unicode(raw, 'shift-jis'), zenhan.ALL)
             continue
         if field in plain_fields:
             record[field] = unicode(raw, 'shift-jis')
     return record
Пример #4
0
	def __priceText(self, val, zen=False):
		"""Return ``val`` formatted as a right-aligned price string.

		The number is written with thousands separators and prefixed with
		'¥', or with '-' when ``val`` is negative.  With ``zen`` true the
		text is converted by ``zenhan.h2z``.  The result is left-padded with
		spaces so that its width (per ``Receipt.strWidth``) fills
		``self.width - self.area``.
		"""
		# The sign is carried by the prefix, so format the magnitude only;
		# formatting the signed value would produce a doubled minus sign.
		prefix = '-' if val < 0 else '¥'
		string = prefix + "{:,d}".format(abs(val))
		if zen:
			string = zenhan.h2z(string)
		space = (self.width - self.area) - (Receipt.strWidth(string))
		return (" " * space) + string
Пример #5
0
def test_zenhan():
    """Log timings and results of the zenhan conversions for each test case.

    Conversions that are not available are only announced with a
    "Not implemented" line.
    """
    for banner in ("=========================================",
                   "=               zenhan                  =",
                   "========================================="):
        logging.info(banner)

    for tc in get_test_cases():
        title = tc['title']
        body = tc['body']

        for label in ("ひらがな(全角) to カタカナ(全角) for %s",
                      "カタカナ(全角) to ひらがな(全角) for %s",
                      "ひらがな(全角) to カタカナ(半角) for %s"):
            logging.info(label % title)
            logging.info("Not implemented")

        # Each conversion is run once for timing and once more for the log.
        for label, convert in (("半角 to 全角 for %s", zenhan.h2z),
                               ("全角 to 半角 for %s", zenhan.z2h)):
            logging.info(label % title)
            calc_time(convert, body, zenhan.ASCII|zenhan.KANA|zenhan.DIGIT)
            logging.debug("result: %s" % convert(body, zenhan.ASCII|zenhan.KANA|zenhan.DIGIT))
Пример #6
0
 def _set_price(self, val, zen=False):
     prefix = '-' if val < 0 else '¥'
     string = prefix + "{:,d}".format(val)
     if zen:
         string = zenhan.h2z(string)
     space = (self.width - self.area) - (self.str_width(string))
     return (" " * space) + string
Пример #7
0
def setType(string,
            types=(
                'L', 'P', 'S', 'X', 'スピリット', 'チューナー', 'デュアル', 'トゥーン', 'ユニオン',
                'リバース', '儀式', '効果', '特殊召喚', '融合', '通常'
            )):
    """Split a run-together type string into a list of type names.

    '特召' is expanded to '特殊召喚' and half-width kana are converted to
    full-width.  Unless the text contains '魔法' or '罠', a '/' is inserted
    in front of every occurrence of each entry of ``types`` and the text is
    then split on '/'.

    :param string: the text to split.
    :param types: type names to separate; any iterable of strings.
    :return: list of the '/'-separated parts.
    """
    string = string.replace('特召', '特殊召喚')
    # One conversion is enough: nothing below can introduce half-width kana.
    string = zenhan.h2z(string, zenhan.KANA)
    if not ('魔法' in string or '罠' in string):
        for type_name in types:
            string = string.replace(type_name, '/' + type_name)
        # Drop only the separator inserted at the very front; when the text
        # does not start with a known type, its first character is kept.
        if string.startswith('/'):
            string = string[1:]
    return string.split('/')
Пример #8
0
	def norm(s):
		"""Return a normalised copy of ``s``.

		Everything from the first "※" on is discarded, then the text goes
		through ``zenhan.z2h(mode=7)`` followed by ``zenhan.h2z(mode=4)``
		and is stripped of surrounding whitespace.
		"""
		s = s.split("※",1)[0]
		# NOTE(review): in each of the next two calls both arguments look
		# identical in this listing; presumably the first is a look-alike
		# space / hyphen character being mapped to the ASCII one -- confirm
		# against the original file bytes.
		s = s.replace(" ", " ")
		s = s.replace("-", "-")
		s = zenhan.z2h(s, mode=7)
		s = zenhan.h2z(s, mode=4)
		s = s.strip()
		return s
Пример #9
0
 def normalize(self,text):
     """Return ``text`` with letters and digits half-width and katakana full-width."""
     # letters: full-width => half-width
     half_letters = zenhan.z2h(text, mode=1)
     # digits: full-width => half-width
     half_digits = zenhan.z2h(half_letters, mode=2)
     # katakana: half-width => full-width
     return zenhan.h2z(half_digits, mode=4)
def conv(txt, unic=False):
    """Return ``txt`` lower-cased and converted with ``zenhan.h2z``.

    The text is first passed through ``zenhan.z2h`` so that ``lower()``
    is applied to the converted form.  With ``unic`` true the unicode
    object is returned; otherwise it is encoded as UTF-8.
    """
    lowered = zenhan.z2h(unicode(txt)).lower()
    widened = zenhan.h2z(lowered)
    return widened if unic else widened.encode('utf8')
Пример #11
0
def delete_aft(line):
    """Normalise character widths in ``line`` and delete unwanted characters.

    Letters and digits become half-width and katakana full-width; after
    that every character inside the code-point ranges listed in the
    pattern below is removed.
    """
    text = zenhan.z2h(line, mode=1)  # letters (full-width -> half-width)
    text = zenhan.z2h(text, mode=2)  # digits (full-width -> half-width)
    text = zenhan.h2z(text, mode=4)  # katakana (half-width -> full-width)

    # The last bound needs the 8-digit \U form: "\uE0FFF" is read as
    # U+E0FF followed by a literal "F", which deleted every ASCII 'F'
    # and cut the range short.
    text = re.sub(
        r'[\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u3004\u3007-\u303F\u3099-\u30A0\u30FB\u30FD-\u4DFF\uA000-\U000E0FFF]',
        "", text)  # delete all other characters
    return text
Пример #12
0
def get_bus_timetable(wbname, sheetname, stop_offset_row, stop_offset_col,
                      stopdirection, timetable_offset_row,
                      timetable_offset_col, chk_func):
    """Read stop names and per-bus stop times from a sheet via ``xlsReader``.

    Stop names are read from ``(stop_offset_row, stop_offset_col)`` along
    ``stopdirection`` until an empty cell is found.  Buses are then read
    from ``(timetable_offset_row, timetable_offset_col)``: each bus is one
    line of cells along ``stopdirection`` and successive buses follow the
    other direction.  Reading stops at the first bus with no filled cell.

    :param chk_func: optional filter called as
        ``chk_func(workbook, sheet, busrow, buscol, item)``; a bus is kept
        only when it returns a true value.
    :return: list of buses, each a list of dicts with the keys
        ``'busstop'``, ``'busstopIx'`` and ``'time'``.
    """
    xls = xlsReader(wbname, sheetname)
    stops_follow_rows = stopdirection == DataDirection.row
    busdirection = DataDirection.col if stops_follow_rows else DataDirection.row

    # Collect stop names up to the first empty cell.
    stop_name_list = []
    xls.set_offset(stop_offset_row, stop_offset_col)
    cell = xls.get_cell()
    while cell:
        stop_name_list.append(zenhan.h2z(cell).replace('\n', ''))
        xls.next_cell(stopdirection)
        cell = xls.get_cell()

    buslist = []
    busrow, buscol = timetable_offset_row, timetable_offset_col
    xls.set_offset(busrow, buscol)

    while True:
        # Rewind along the stop direction to the start of the current bus.
        if stops_follow_rows:
            busrow = timetable_offset_row
        else:
            buscol = timetable_offset_col
        xls.set_offset(busrow, buscol)

        item = []
        for index, stop_name in enumerate(stop_name_list):
            cell = xls.get_cell()
            if cell:
                item.append({
                    'busstop': stop_name,
                    'busstopIx': index,
                    # presumably elements 3 and 4 are hour and minute -- TODO confirm
                    'time': '%02d:%02d' % (cell[3], cell[4])
                })
            busrow, buscol = xls.next_cell(stopdirection)

        # A bus without a single filled cell marks the end of the table.
        if not item:
            break
        if not chk_func or chk_func(xls.workbook, xls.sheet, busrow, buscol, item):
            buslist.append(item)
        busrow, buscol = xls.next_cell(busdirection)
    return buslist
Пример #13
0
    def run(self, edit):
        """Toggle the kana width of every non-empty selected region.

        The h2z(KANA) result is used when it differs from the selection;
        otherwise the z2h(KANA) result is used when that differs.
        """
        for region in self.view.sel():
            selected = self.view.substr(region)
            if selected == "":
                continue

            widened = zenhan.h2z(selected, zenhan.KANA)
            narrowed = zenhan.z2h(selected, zenhan.KANA)
            if selected != widened:
                replacement = widened
            elif selected != narrowed:
                replacement = narrowed
            else:
                # Nothing to convert in this region.
                continue
            self.view.replace(edit, region, replacement)
Пример #14
0
  def run(self, edit):
    """Toggle the kana width of every non-empty selected region.

    The h2z(KANA) result is used when it differs from the selection;
    otherwise the z2h(KANA) result is used when that differs.
    """
    for region in self.view.sel():
        selected = self.view.substr(region)
        if selected == "":
            continue

        candidates = (zenhan.h2z(selected, zenhan.KANA),
                      zenhan.z2h(selected, zenhan.KANA))
        # The first conversion that actually changes the text wins.
        for candidate in candidates:
            if selected != candidate:
                self.view.replace(edit, region, candidate)
                break
Пример #15
0
  def zenhan_search(self, statement, numOfResult):
    """Search with both the z2h and the h2z form of ``statement``.

    When the two converted forms are equal, the query is built from the
    unconverted statement instead.
    """
    han_statement, zen_statement = zenhan.z2h(statement), zenhan.h2z(statement)

    han_list = self.tokenizer.split_query(han_statement)
    zen_list = self.tokenizer.split_query(zen_statement)

    if han_statement == zen_statement:
      to_search = self.tokenizer.split_query(statement)
    else:
      to_search = han_list + zen_list

    return self._search(to_search, numOfResult)
Пример #16
0
def delete_symbol(line):
    """Normalise character widths in ``line`` and delete symbol characters.

    Letters and digits become half-width and katakana full-width; after
    that every character inside the code-point ranges listed in the
    pattern below, including everything from U+A000 upwards, is removed.
    """
    text = zenhan.z2h(line, mode=1)  # letters (full-width -> half-width)
    text = zenhan.z2h(text, mode=2)  # digits (full-width -> half-width)
    text = zenhan.h2z(text, mode=4)  # katakana (half-width -> full-width)

    # The former pattern ended in "\uE0FFF", which is read as U+E0FF plus a
    # literal "F" and so deleted every ASCII 'F'.  Characters above that
    # bound were collected separately and removed in a second pass; one
    # range up to U+10FFFF covers both steps.
    text = re.sub(
        r'[\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u3004\u3007-\u303F\u3099-\u30A0\u30FB\u30FD-\u4DFF\uA000-\U0010FFFF]',
        "", text)  # delete all other characters
    return text
Пример #17
0
    def zenhan_search(self, statement, numOfResult):
        """Search with both the z2h and the h2z form of ``statement``.

        When the two converted forms are equal, the query is built from
        the unconverted statement instead.
        """
        han_statement = zenhan.z2h(statement)
        zen_statement = zenhan.h2z(statement)
        han_list, zen_list = [
            self.tokenizer.split_query(converted)
            for converted in (han_statement, zen_statement)
        ]

        if han_statement == zen_statement:
            return self._search(self.tokenizer.split_query(statement),
                                numOfResult)
        return self._search(han_list + zen_list, numOfResult)
Пример #18
0
def normalize(data):
	"""Return ``data`` width-normalised, with dashes unified in digit strings.

	NBSP characters are removed, the text is passed through
	``zenhan.h2z`` then ``zenhan.z2h`` and NFKC-normalised.  If the result
	consists only of digits, '+', '-' and the dashes U+2010..U+2015, each
	of those dashes is replaced by '-'.
	"""
	nbsp = b"\xC2\xA0".decode("UTF-8")
	folded = unicodedata.normalize("NFKC", zenhan.z2h(zenhan.h2z(data.replace(nbsp, ""))))

	# 0x2010 -- 0x2015
	dashes = b"\xe2\x80\x90\xe2\x80\x91\xe2\x80\x92\xe2\x80\x93\xe2\x80\x94\xe2\x80\x95".decode("UTF-8")
	if not re.match("^[0-9\\+\\-{0}]+$".format(dashes), folded):
		return folded

	for dash in dashes:
		folded = folded.replace(dash, "-")
	return folded
Пример #19
0
 def check(self, text):
     """Return ``text`` converted to full-width with unknown characters removed.

     HTML entities are decoded twice, the text is NFKC-normalised and
     converted with ``zenhan.h2z``; every character not in
     ``self.char_set`` is then dropped (per the original docstring the
     goal is text made of full-width JIS X 0208 characters).

     Returns -1 when ``text`` is not exactly of type ``str``.
     """
     if type(text) is not type(''):
         return -1
     decoded = self.htmlentity2unicode(self.htmlentity2unicode(text))
     widened = zenhan.h2z(unicodedata.normalize('NFKC', decoded))
     # Keep only the characters known to the character set.
     return ''.join(ch for ch in widened if ch in self.char_set)
Пример #20
0
def delete_twitter(line):
    """Normalise widths in ``line`` and strip Twitter artefacts and symbols.

    Letters and digits become half-width and katakana full-width.  URLs,
    @user names and #hashtags are removed, followed by every character
    inside the code-point ranges listed in the last pattern, including
    everything from U+A000 upwards.
    """
    text = zenhan.z2h(line, mode=1)  # letters (full-width -> half-width)
    text = zenhan.z2h(text, mode=2)  # digits (full-width -> half-width)
    text = zenhan.h2z(text, mode=4)  # katakana (half-width -> full-width)

    text = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)  # URL
    text = re.sub(r'@[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)  # user name
    text = re.sub(r'#[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)  # hashtag
    # The former pattern ended in "\uE0FFF", which is read as U+E0FF plus a
    # literal "F" and so deleted every ASCII 'F'.  Characters above that
    # bound were collected separately and removed in a second pass; one
    # range up to U+10FFFF covers both steps.
    text = re.sub(
        r'[\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u3004\u3007-\u303F\u3099-\u30A0\u30FB\u30FD-\u4DFF\uA000-\U0010FFFF]',
        "", text)  # delete all other characters
    return text
Пример #21
0
def build_corpus(target, corpus=None, raw=False):
    """Write a tokenised copy of the file ``target`` to the file ``corpus``.

    Each input line is decoded as UTF-8, has its half-width kana converted
    to full-width, is passed to ``tokenizer.wakati(normalize_verb=True)``
    and is written out followed by a newline.

    :param target: path of the input file.
    :param corpus: path of the output file; despite the ``None`` default a
        real path is required, because it is passed straight to ``open``.
    :param raw: not used by this function.
    """
    # Context managers close both files even if a line fails to process.
    with open(target, "rb") as targetfp, open(corpus, "wb") as corpusfp:
        for line in tqdm(targetfp):
            # hankaku kana to zenkaku kana
            line = zenhan.h2z(line.decode("utf-8"), mode=4)

            # tokenize and normalize verb
            line = tokenizer.wakati(line, normalize_verb=True)

            # NOTE(review): the output file is opened in binary mode, so
            # this relies on the type returned by tokenizer.wakati -- confirm.
            corpusfp.write(line + "\n")
def kansuji2arabic(kstring):
    """Replace kanji numerals in ``kstring`` with Arabic numerals.

    :param kstring: word which indicates a number
    :return: the word with its numbers written in Arabic numerals, passed
        through ``zenhan.h2z``
    """
    # https://qiita.com/cof/items/58ddf898db25db561a54
    digit_table = str.maketrans('一二三四五六七八九〇壱弐参', '1234567890123')
    number_run = re.compile(r'[十拾百千万億兆\d]+')
    small_unit_or_digits = re.compile(r'[十拾百千]|\d+')
    large_unit_or_rest = re.compile(r'[万億兆]|[^万億兆]+')
    small_units = {'十': 10, '拾': 10, '百': 100, '千': 1000}
    large_units = {'万': 10000, '億': 100000000, '兆': 1000000000000}

    def _transvalue(sj, re_obj=small_unit_or_digits, transdic=small_units):
        # Walk the pieces right to left, tracking the unit that applies to
        # the next count encountered.
        multiplier = 1
        total = 0
        for piece in reversed(re_obj.findall(sj)):
            if piece not in transdic:
                if piece.isdecimal():
                    count = int(piece)
                else:
                    count = _transvalue(piece)
                total += count * multiplier
                multiplier = 1
                continue
            # A unit that never received a count stands for one of itself.
            if multiplier > 1:
                total += multiplier
            multiplier = transdic[piece]

        if multiplier > 1:
            total += multiplier

        return total

    converted = kstring.translate(digit_table)
    # Longest runs first, so a shorter run cannot rewrite part of a longer one.
    candidates = sorted(set(number_run.findall(converted)),
                        key=len,
                        reverse=True)
    for candidate in candidates:
        if candidate.isdecimal():
            continue
        value = _transvalue(candidate, large_unit_or_rest, large_units)
        converted = converted.replace(candidate, str(value))

    return zenhan.h2z(converted)
Пример #23
0
def sentiment_analysis():
    """Classify the request's ``utterance`` and return the result as JSON.

    The utterance is converted with ``zenhan.h2z`` and sent to the XML-RPC
    server at ``host``:``port``.  A prediction of 1 maps to 'Positive',
    -1 to 'Negative' and anything else to 'None'.
    """
    utterance = zenhan.h2z(request.json['utterance'])
    client = xmlrpc_client.ServerProxy(f'http://{host}:{port}')
    prediction = client.get_prediction(utterance)  # 1(Pos) or -1(Neg)

    if prediction == 1:
        label = 'Positive'
    else:
        label = 'Negative' if prediction == -1 else 'None'

    response = HTTPResponse(status=200, body={"status": "OK", "result": label})
    response.set_header("Content-Type", "application/json")
    return response
def preprocess_word(words):
    """Attach a preprocessed surface form to every word.

    For each word the surface is NFKC-normalised, stripped of ASCII
    punctuation and of the characters "「」、。・", and converted with
    ``zenhan.h2z``.  The result is stored as ``p_surface`` and as the
    single element of ``alias``.

    :param words: a list of Word instances
    :return: the same list, with the preprocessed expressions set
    """
    # The deletion table does not depend on the word, so build it once.
    table = str.maketrans("", "", string.punctuation + "「」、。・")
    for word in words:
        # remove symbols
        surface = unicodedata.normalize("NFKC", word.surface).translate(table)
        # convert to zenkaku
        surface = zenhan.h2z(surface)
        # register preprocessed word
        word.p_surface = surface
        word.alias = [surface]
    return words
Пример #25
0
def month_search(line):
    """Classify ``line`` as a month line, a comment line or a schedule line.

    Returns the month number for a month heading, -2 for a line that opens
    an HTML comment without closing it, -1 for a line that closes a
    comment (or opens and closes one), and 0 for any other line.
    """
    # Template of the month heading.
    opening, closing = '<h4><', '月></h4>'
    # The original author notes that this detection is rather crude.
    if opening in line and closing in line:
        digits = line.replace(opening, '').replace(closing, '').replace(' ', '')
        return int(zenhan.h2z(digits))
    # Exclude commented-out schedules; a comment may end on the same line.
    if '<!--' in line:
        return -1 if '-->' in line else -2
    # End of a comment.
    if '-->' in line:
        return -1
    # An ordinary schedule line.
    return 0
Пример #26
0
def flash_message(sender):
    """Show the reading of the text view's content and flash it as signals.

    Each token contributes its reading, or its surface when the reading is
    '*'.  The joined text is upper-cased, converted with ``zenhan.h2z``,
    written back to the text view and passed to ``flash.flash_signals``
    together with a speed derived from the slider value.
    """
    textview = sender.superview['textview']
    source = textview.text

    slider = sender.superview['slider']
    speed = (.1 + slider.value) / 10

    tokens = Tokenizer().tokenize(source)
    word = ''.join(
        token.surface if token.reading == '*' else token.reading
        for token in tokens
    )

    word = zenhan.h2z(word.upper())
    textview.text = word

    flash.flash_signals(word, speed)
Пример #27
0
    def __init__(self,
                 example_id,
                 words,
                 lines,
                 knp_string=None,
                 heads=None,
                 token_tags=None,
                 comment=None,
                 h2z=False):
        """Store the example's fields.

        When ``h2z`` is exactly ``True``, a deep copy of the words is kept
        in ``words_orig`` and ``words`` is replaced by the
        ``zenhan.h2z`` conversion of each word.
        """
        self.example_id, self.words, self.lines = example_id, words, lines
        self.knp_string, self.heads = knp_string, heads
        self.token_tags = token_tags
        self.token_tag_indices = defaultdict(list)
        self.comment = comment
        self.h2z = h2z

        if self.h2z is not True:
            return

        # These imports only happen on the conversion path.
        from copy import deepcopy
        import zenhan
        self.words_orig = deepcopy(self.words)
        self.words = [zenhan.h2z(word) for word in words]
Пример #28
0
 def test_h2z_all(self):
     """h2z with ALL gives the expected text and equals ASCII|DIGIT|KANA."""
     expected = u("゜abcDE゛F123456アガサダナバビプペ゜")
     self.assertEqual(zenhan.h2z(self.original, zenhan.ALL), expected)
     combined = zenhan.ASCII|zenhan.DIGIT|zenhan.KANA
     self.assertEqual(zenhan.h2z(self.original, zenhan.ALL),
                      zenhan.h2z(self.original, combined))
Пример #29
0
def hankaku_to_zenkaku(text):
    """Convert half-width characters to full-width characters."""
    converted = zenhan.h2z(text, mode=7)
    return converted
Пример #30
0
 def test_h2z_kana_only(self):
     """h2z with KANA gives the expected text."""
     expected = u("゜abcDE゛F123456アガサダナバビプペ゜")
     self.assertEqual(zenhan.h2z(self.original, zenhan.KANA), expected)
Пример #31
0
 def test_h2z_ascii_and_digit(self):
     """h2z with ASCII|DIGIT gives the expected text."""
     expected = u("゚abcDE゙F123456アガサダナバビプペ゚")
     self.assertEqual(zenhan.h2z(self.original, zenhan.ASCII|zenhan.DIGIT), expected)
Пример #32
0
 def test_h2z_ascii_and_kana(self):
     """h2z with ASCII|KANA gives the expected text."""
     expected = u("゜abcDE゛F123456アガサダナバビプペ゜")
     self.assertEqual(zenhan.h2z(self.original, zenhan.ASCII|zenhan.KANA), expected)
Пример #33
0
# from https://github.com/tosaka-m/japanese_realtime_tts/blob/master/src/jrtts/text_utils/convert.py
from functools import reduce
import jaconv
import re
import MeCab
import unicodedata
import zenhan
from pykakasi import kakasi
from text_jp.yomi2voca import mapper_no_space_removeST, keta_mapper
import logging
# Logger named after this module.
logger = logging.getLogger(__name__)
# Two MeCab taggers, created with the '-Ochasen' and '-Oyomi' options.
mecab = MeCab.Tagger('-Ochasen')
mecab_yomi = MeCab.Tagger('-Oyomi')
# Maps each digit 0-9 to the '-Oyomi' parse of its h2z-converted form.
# [:-1] drops the last character of the parser output (presumably the
# trailing newline -- confirm).
digit_mapper = {i: mecab_yomi.parse(zenhan.h2z(str(i)))[:-1] for i in range(10)}
# kakasi converter configured with mode 'J' -> 'H' (presumably kanji to
# hiragana -- confirm against the pykakasi documentation).
_kakasi = kakasi()
_kakasi.setMode('J', 'H')
kakasi_converter = _kakasi.getConverter()


def hiragana2onso(text):
    """Convert ``text`` using the ``mapper_no_space_removeST`` table.

    Special spaces are removed first, every key of the table is replaced
    by its value in table order, and the result is passed through
    ``_kigou2roman`` with ``add_space=False``.
    """
    # whitespace-removal pattern
    converted = _remove_special_space(text)

    for source, target in mapper_no_space_removeST.items():
        converted = converted.replace(source, target)

    return _kigou2roman(converted, add_space=False)
Пример #34
0
def normalize(data):
	"""Remove NBSP from ``data``, apply h2z then z2h, and NFKC-normalise."""
	nbsp = b"\xC2\xA0".decode("UTF-8")
	without_nbsp = data.replace(nbsp, "")
	round_tripped = zenhan.z2h(zenhan.h2z(without_nbsp))
	return unicodedata.normalize("NFKC", round_tripped)
Пример #35
0
def normalize(data):
    """Remove NBSP from ``data``, apply h2z then z2h, and NFKC-normalise."""
    nbsp = b"\xC2\xA0".decode("UTF-8")
    widened = zenhan.h2z(data.replace(nbsp, ""))
    narrowed = zenhan.z2h(widened)
    return unicodedata.normalize("NFKC", narrowed)
Пример #36
0
 async def fw(self, ctx, *, chars):
     """Make full-width meme text by editing the message to h2z(chars)."""
     widened = zenhan.h2z(chars)
     await ctx.message.edit(content=widened)
Пример #37
0
 def test_h2z_digit_only(self):
     """h2z with DIGIT gives the expected text."""
     expected = u("゚abcDE゙F123456アガサダナバビプペ゚")
     self.assertEqual(zenhan.h2z(self.original, zenhan.DIGIT), expected)
Пример #38
0
def get_recipe(url, dish):
    """Scrape per-person ingredient amounts from the recipe page at ``url``.

    The last ``<p>`` whose text contains both "人" and "材料" is taken as
    the ingredient list.  The line containing "材料" gives the number of
    people (its first digit); every other non-empty line is split into a
    name and a quantity.

    :param url: address of the recipe page.
    :param dish: not used by this function.
    :return: dict mapping ingredient name to ``[amount_per_person, unit]``.
    """
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    soup = BeautifulSoup(html, "html.parser")
    recipe = ""
    for p in soup.findAll('p', text=False):
        if p.text.find("人") != -1 and p.text.find("材料") != -1:
            recipe = p.text

    elements = dict()
    amount = 0
    people = 0
    for temp in recipe.split("\n"):
        # Header line: read the number of people.
        if temp.find("材料") != -1:
            people = float(re.search("[0-9]", zenhan.z2h(temp, 2)).group())
            continue
        if temp == "":
            continue

        # Ingredient line: strip bullet marks, then split name / quantity.
        element = temp.replace("●", "").replace("○", "").replace("〇", "").replace("◎", "").lstrip(" ")
        if temp.find("…") != -1:
            element = element.split("…")
        else:
            element = element.split(None, 1)

        if people == 0:
            # No per-person amount can be computed; skip the line rather
            # than fall through to an unbound ``unit``.
            print("people=0")
            continue

        if len(element) < 2:
            break
        # Convert the quantity's digits to half-width.
        han_element = zenhan.z2h(element[1], 2)

        # The unit is the first run that is not a digit, '/', '~' or space;
        # a quantity without such a run gets an empty unit.
        unit_match = re.search(r"[^0-9/~ ]+", han_element)
        unit = unit_match.group(0) if unit_match else ""

        string_amount = re.search(r"[0-9/ ]+", han_element.replace(unit, ""))
        if string_amount is not None:
            amount = float(sum(Fraction(s) for s in string_amount.group(0).split())) / people
        else:
            amount = 0
        elements.update({element[0]: [amount, zenhan.h2z(unit, 4)]})

    return elements
Пример #39
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import requests
import sys

import zenhan

server_host = 'localhost'
server_port = '5000'

if len(sys.argv) <= 1:
    print("Usage: %s <text>" % sys.argv[0])
    sys.exit(0)

# Command-line arguments are byte strings on Python 2 and text on Python 3.
text = sys.argv[1]
if isinstance(text, bytes):
    text = text.decode('utf-8')
text = zenhan.h2z(text)

# Pass the query through ``params`` so that it is percent-encoded; text
# containing '&', '#' or spaces would otherwise corrupt the URL.
r = requests.get('http://%s:%s/jumanpp' % (server_host, server_port),
                 params={'q': text})
# An explicit check, because ``assert`` is stripped when running with -O.
if r.status_code != 200:
    raise RuntimeError('unexpected HTTP status %d' % r.status_code)

d = json.loads(r.text)

print(d['text'])
for j in d['jumanpp']:
    print("['" + "','".join(j) + "']")
Пример #40
0
 def clean_first_name(self):
     """Return the cleaned ``first_name`` converted with ``zenhan.h2z``.

     Raises ``forms.ValidationError`` when ``Strings.is_kana`` rejects
     the value.
     """
     first_name = self.cleaned_data.get("first_name")
     if Strings.is_kana(first_name):
         return zenhan.h2z(first_name)
     raise forms.ValidationError("入力できるのはカタカナのみです。")
Пример #41
0
 def test_h2z_digit_and_kana(self):
     """h2z with DIGIT|KANA gives the expected text."""
     expected = u("゜abcDE゛F123456アガサダナバビプペ゜")
     self.assertEqual(zenhan.h2z(self.original, zenhan.DIGIT|zenhan.KANA), expected)
Пример #42
0
	def convert_zenkau(self, src):
		"""Decode UTF-8 ``src`` and convert it with h2z(mode=7, ignore=())."""
		return zenhan.h2z(src.decode('utf-8'), mode=7, ignore=())
Пример #43
0
	def h2z(self, text):
		"""Convert every half-width character in a string to full-width.

		@param string text the string to convert (decoded as UTF-8)
		@return string the converted result, encoded as UTF-8
		"""
		decoded = unicode(text, "utf-8")
		widened = zenhan.h2z(decoded)  # convert hankaku into zenkaku
		return widened.encode("utf-8")
Пример #44
0
    def h2z(self, text):
        """Convert every half-width character in a string to full-width.

        @param string text the string to convert (decoded as UTF-8)
        @return string the converted result, encoded as UTF-8
        """
        # convert hankaku into zenkaku
        widened = zenhan.h2z(unicode(text, "utf-8"))
        return widened.encode("utf-8")
Пример #45
0
        if m.surface:
            #help (m)
            key = m.surface
            info = m.feature.split(",")
            label = info[0]
            label2 = info[1]
            bform = info[-3]
            prono = info[-2]
            #print key,m.feature
            if label == '名詞' and label2 not in ['接尾', '非自立', '特殊', '代名詞']:
               result[key]  = result.get(key, 0) + 1
            #print m.surface, "\t", label, bform, prono, m.feature
        m = m.next
    for k, v in sorted(result.items(), key=lambda x:x[1], reverse=True):
        print k, v

def pyon (s):
    """Print ``s`` with 'ぴょん' inserted before each run of sentence enders."""
    end = ["。","!","?",".","・","…","\n","\r"]
    # Escape every ender: a bare "?" makes the pattern invalid and a bare
    # "." would match any character.  An alternation (not a character
    # class) is kept so multi-byte enders also work on byte strings.
    pattern = "((?:%s)+)" % "|".join(re.escape(e) for e in end)
    print(re.sub(pattern, 'ぴょん\\1', s))

if __name__ == "__main__":
    # Use the module-level ``sentence`` by default; when a command-line
    # argument is given, use the result of readStr() on it instead.
    s = sentence
    if len(sys.argv) > 1:
        s = readStr(sys.argv[1])
    # Decode from UTF-8, convert with zenhan.h2z, and re-encode as UTF-8.
    s = zenhan.h2z(s.decode('utf-8')).encode('utf-8')
    #showHist (s)
    pyon (s)



Пример #46
0
def normalize(sentence):
  """Return ``sentence`` NFKC-normalised, full-width, and without whitespace."""
  sentence = unicodedata.normalize('NFKC', sentence)
  sentence = zenhan.h2z(sentence)
  # NOTE(review): the two literals being removed look identical in this
  # listing; presumably one of them is a non-ASCII space -- confirm
  # against the original file bytes.
  sentence = sentence.replace(u' ', u'').replace(u' ', u'')
  # Raw string: "\s" in a plain literal is an invalid escape sequence.
  return re.sub(r"\s+", u'', sentence)
 def convert_zenkau(self, src):
     """Decode UTF-8 ``src`` and convert it with h2z(mode=7, ignore=())."""
     decoded = src.decode('utf-8')
     return zenhan.h2z(decoded, mode=7, ignore=())
Пример #48
0
from pyknp import Juman

jumanpp = Juman()

turn_dir = sys.argv[1]
wakati_dir = sys.argv[2]

tsvs = os.listdir(turn_dir)

for tsv in tqdm(tsvs):
    # Read the whole TSV, replacing each cell by the space-joined ``midasi``
    # of the morphemes Juman finds in its h2z-converted text.
    with open(os.path.join(turn_dir, tsv), newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        tmp = [
            [
                ' '.join(mrph.midasi
                         for mrph in jumanpp.analysis(zenhan.h2z(text)).mrph_list())
                for text in row
            ]
            for row in reader
        ]

    # Write the converted rows under the same file name in the output directory.
    with open(os.path.join(wakati_dir, tsv), 'w', newline='') as f:
        csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONE).writerows(tmp)
Пример #49
0
 def test_h2z_ascii_only(self):
     """h2z with ASCII gives the expected text."""
     expected = u("゚abcDE゙F123456アガサダナバビプペ゚")
     self.assertEqual(zenhan.h2z(self.original, zenhan.ASCII), expected)
Пример #50
0
#coding:utf-8
from functools import reduce
import jaconv
import re
import MeCab
import unicodedata
import zenhan
from pykakasi import kakasi
from .yomi2voca import mapper_no_space_removeST, keta_mapper
import logging
# Logger named after this module.
logger = logging.getLogger(__name__)
# Two MeCab taggers, created with the '-Ochasen' and '-Oyomi' options.
mecab = MeCab.Tagger('-Ochasen')
mecab_yomi = MeCab.Tagger('-Oyomi')
# Maps each digit 0-9 to the '-Oyomi' parse of its h2z-converted form.
# [:-1] drops the last character of the parser output (presumably the
# trailing newline -- confirm).
digit_mapper = {
    i: mecab_yomi.parse(zenhan.h2z(str(i)))[:-1]
    for i in range(10)
}
# kakasi converter configured with mode 'J' -> 'H' (presumably kanji to
# hiragana -- confirm against the pykakasi documentation).
_kakasi = kakasi()
_kakasi.setMode('J', 'H')
kakasi_converter = _kakasi.getConverter()


def hiragana2onso(text):
    orig = text

    # 空白除去パターン
    text = _remove_special_space(text)

    for k, v in mapper_no_space_removeST.items():
        text = text.replace(k, v)