Python han_to_zen 예제들, mojimoji.han_to_zen Python 예제들

예제 #1

0

파일 보기

파일: test_mojimoji.py 프로젝트: shogo82148/mojimoji

def test_han_to_zen():
    """Check ``mojimoji.han_to_zen`` against a table of conversions."""
    # (expected output, input) pairs, checked in order; the final pair
    # expects its input to come back unchanged.
    cases = [
        (u'アイウエオ', u'ｱｲｳｴｵ'),
        (u'ガギグゲゴ', u'ｶﾞｷﾞｸﾞｹﾞｺﾞ'),
        (u'パピプペポ', u'ﾊﾟﾋﾟﾌﾟﾍﾟﾎﾟ'),
        (u'０１２３', u'0123'),
        (u'ａｂｃＡＢＣ', u'abcABC'),
        (u'＃？！', u'#?!'),
        (u'あいうえお', u'あいうえお'),
    ]
    for expected, source in cases:
        eq_(expected, mojimoji.han_to_zen(source))

예제 #2

0

파일 보기

파일: tweet_parser.py 프로젝트: yuta-portfolio/portfolio

    def wakachi(self):
        u"""Split self.text into space-separated words (wakachi-gaki).

        The text is width-normalised with mojimoji and parsed with MeCab.
        Only nodes whose part of speech is one of the four values compared
        below are kept, using the dictionary's base form when one is present.

        Side effects:
            Overwrites self.text with the width-normalised text and stores
            the resulting string in self.wakachigaki.

        Returns:
            A dict with the keys '_id', 'screen_name', 'text' and 'wakachi'.
        """
        # Dictionary settings: md.option is handed to the tagger, md.pos and
        # md.base are indexes into the comma-separated feature string.
        md=config.m_mecab_dic

        tagger=MeCab.Tagger(md.option)
        # NOTE(review): presumably the usual empty parse() workaround that
        # keeps node.surface valid in the MeCab Python binding -- confirm.
        tagger.parse('')


        # Matches a base form that starts with the literal text "U00".
        emoji=re.compile(u'^U00')
        # Matches a base form that is exactly one printable ASCII character.
        kigou=re.compile(u'^[!-~]$')

        # Normalise character widths: digits/ASCII to half-width, kana to full-width
        self.text=mojimoji.zen_to_han(self.text,kana=False,digit=True,ascii=True)
        self.text=mojimoji.han_to_zen(self.text,kana=True,digit=False,ascii=False)

        node=tagger.parseToNode(self.text.encode('utf-8'))
        words=[]

        while node:
            pos=node.feature.split(",")[md.pos]
            if pos=="形容詞" or pos == "形容動詞" or pos=="動詞" or pos=="名詞":
                # Fall back to the surface form when the feature string has
                # no base-form column.
                if len(node.feature.split(","))<=md.base:
                    base = node.surface
                else:
                    base=node.feature.split(",")[md.base]

                # "*" in the base-form column also means "use the surface form".
                if base == "*":
                    base = node.surface
                # Exclude emoji and single characters.
                # NOTE(review): the original comment said "emoji, single
                # hiragana/katakana"; the patterns actually test for a leading
                # "U00" and for a single printable ASCII character.
                if (emoji.match(unicode(base)) is not None) or (kigou.match(unicode(base)) is not None):
                    pass
                # Exclude words that are in the stop-word list
                elif unicode(base) in get_stopwords():
                    pass
                else:
                    # Store the word lower-cased
                    words.append(base.lower())
            node=node.next

        wakachi=map(str,words)
        wakachi = " ".join(wakachi)

        # Keep only the part before the first newline, if any.
        if "\n" in wakachi:
            wakachi=wakachi.split("\n",1)[0].strip()
        self.wakachigaki=wakachi

        return {'_id':self.id,'screen_name':self.screen_name,'text':self.text,'wakachi':wakachi}

예제 #3

0

파일 보기

파일: views.py 프로젝트: murabo/shop

def _check_ng(kwd, ng_list):

    for ng in ng_list:

        if ng == "":
            return False
        if kwd.find(ng) >= 0:

            return True

        if kwd.lower().find(ng) >= 0:

            return True

        if kwd.find(mojimoji.han_to_zen(ng.decode('utf-8')).encode('utf-8')) >= 0:

            return True

예제 #4

0

파일 보기

 def nlp_preprocessing(self, text: str):
     """Apply the configured text replacements, tokenize, then map tokens.

     ``self.list_replace_text`` is processed in order. Each entry is either a
     ``(pattern, replacement)`` pair applied with ``re.sub``, or one of the
     command strings ``"hantozen"`` (``mojimoji.han_to_zen``) and
     ``"tokenize"`` (re-join the ``self.spacy_tokenizer`` tokens with single
     spaces). Any other string is ignored.

     Args:
         text: Raw input text.

     Returns:
         list: The tokens from ``self.tokenizer.tokenize`` after applying
         every ``(before, after)`` pair of ``self.list_replace_token`` in
         order.
     """
     for val in self.list_replace_text:
         if isinstance(val, (list, tuple)):
             b_rep, a_rep = val
             text = re.sub(b_rep, a_rep, text)
         elif isinstance(val, str):
             if val == "hantozen":
                 # Half-width to full-width conversion
                 text = mojimoji.han_to_zen(text)
             elif val == "tokenize":
                 text = " ".join(x.text for x in self.spacy_tokenizer(text))
     # Tokenize
     tokens = list(self.tokenizer.tokenize(text))
     # Replace tokens with plain list operations. The earlier NumPy version
     # stored the tokens in a fixed-width string array, so a replacement
     # longer than the longest token was silently truncated.
     for b_rep, a_rep in self.list_replace_token:
         tokens = [a_rep if tok == b_rep else tok for tok in tokens]
     return tokens

예제 #5

0

파일 보기

파일: utils.py 프로젝트: sinjorjob/django-bert-sentiment

def preprocessing_text(text):
    """Normalise a sentence before tokenization.

    Steps, in order: convert with ``mojimoji.han_to_zen``; delete CR, LF,
    full-width spaces and half-width spaces; collapse every run of digits
    (half- or full-width) into a single "0"; replace each
    ``string.punctuation`` character other than "." and "," with a space.
    """
    # Unify half-width / full-width characters
    text = mojimoji.han_to_zen(text)
    # Remove line breaks, full-width spaces and half-width spaces
    for blank in ('\r', '\n', '　', ' '):
        text = re.sub(blank, '', text)
    # Replace every run of digits with a single "0"
    text = re.sub(r'[0-9 ０-９]+', '0', text)  # digits

    # Replace symbols other than comma and period with a space
    for mark in string.punctuation:
        if mark not in (".", ","):
            text = text.replace(mark, " ")

    return text

예제 #6

0

파일 보기

파일: make_dataloader.py 프로젝트: karakuri-ai/hottopi-san

    def preprocessing_text(self, text):
        """Return ``text`` cleaned up for tokenization.

        The text is converted with ``mojimoji.han_to_zen``, stripped of line
        breaks and spaces, has each digit run replaced by "0", and finally has
        every ``string.punctuation`` character except "." and "," turned into
        a space.
        """
        # Unify half-width / full-width characters
        cleaned = mojimoji.han_to_zen(text)

        # Remove line breaks, full-width spaces and half-width spaces
        cleaned = cleaned.replace("\r", "")
        cleaned = cleaned.replace("\n", "")
        cleaned = cleaned.replace("　", "")
        cleaned = cleaned.replace(" ", "")

        # Replace every run of digits with a single "0"
        cleaned = re.sub(r"[0-9 ０-９]+", "0", cleaned)  # digits

        # Replace symbols other than comma and period with a space
        symbols = [p for p in string.punctuation if p != "." and p != ","]
        for symbol in symbols:
            cleaned = cleaned.replace(symbol, " ")

        return cleaned

예제 #7

0

파일 보기

파일: preprocess.py 프로젝트: ryosuke1217/web_app_uwsgi

    def _convert_chars(self, chars):
        '''Pre-process an item name.

        Arguments:
          chars (str): item name
        Returns:
          the converted item name
        '''
        # han_to_zen with digit/ascii disabled, then zen_to_han with kana
        # disabled, then lower-case the result.
        widened = mojimoji.han_to_zen(chars, digit=False, ascii=False)
        normalized = mojimoji.zen_to_han(widened, kana=False).lower()
        # Drop every character listed in self.delete_chars.
        kept = [c for c in normalized if c not in self.delete_chars]
        # Turn each digit 1-9 into 0
        zeroed = ['0' if c in '123456789' else c for c in kept]
        return ''.join(zeroed).replace(' ', '')

예제 #8

0

파일 보기

        def format_line(base_string):
            """Lower-case the text, normalise widths and strip noise.

            Removes emoji, a fixed set of punctuation marks, @mentions,
            #hashtags and URL-like fragments.
            NOTE(review): the original docstring also promised date removal,
            but no step here does that.
            """
            lowered = base_string.lower()
            # zen_to_han with digits and kana left untouched
            narrowed = mojimoji.zen_to_han(lowered, digit=False, kana=False)
            # han_to_zen with digits and ASCII left untouched
            widened = mojimoji.han_to_zen(narrowed, digit=False, ascii=False)

            cleaned = "".join(
                ch for ch in widened if ch not in emoji.UNICODE_EMOJI)

            # Punctuation is stripped before the regex passes below run.
            symbols = ("'", '"', ';', '.', ',', '-', '!', '?', '=', "(", ")",
                       "「", "」", "|", "『", "』")
            for symbol in symbols:
                cleaned = cleaned.replace(symbol, "")

            for pattern in (r"@\w*", r"#(\w+)", r"(http(s)?(:)?//[\w | /]*)"):
                cleaned = re.sub(pattern, "", cleaned)
            return cleaned

예제 #9

0

파일 보기

파일: basic_nlp.py 프로젝트: memicq/FeedAutomator

 def to_wakati(self,
               text,
               allow_word_class=[
                   '名詞', '指示詞', '動詞', '形容詞', '判定詞', '助動詞', '副詞', '助詞',
                   '接続詞', '連体詞', '感動詞', '接頭辞', '特殊', '未定義語'
               ],
               remove_stopwords=False,
               genkei=False):
     """Return ``text`` as words, each followed by a single space.

     The text is converted with ``mojimoji.han_to_zen`` and analysed with
     ``self.jumanpp``.

     Args:
         text: Sentence to segment.
         allow_word_class: ``hinsi`` values to keep; morphemes with any other
             value are dropped.
         remove_stopwords: When true, drop morphemes whose ``genkei`` is in
             ``self.stopwords``.
         genkei: When true, emit each morpheme's ``genkei`` instead of its
             ``midasi``.

     Returns:
         str: The kept words, each followed by one space (so a non-empty
         result ends with a space).
     """
     analysed = self.jumanpp.analysis(mojimoji.han_to_zen(text))
     words = []
     for mrph in analysed.mrph_list():
         # Morpheme fields: midasi, yomi, genkei, hinsi, bunrui, katuyou1,
         # katuyou2, imis, repname
         if remove_stopwords and (mrph.genkei in self.stopwords):
             continue
         if mrph.hinsi not in allow_word_class:
             continue
         words.append(mrph.genkei if genkei else mrph.midasi)
     return "".join(word + ' ' for word in words)

예제 #10

0

파일 보기

def sentence_splitter(doc):
    """Split ``doc`` into sentences converted with ``mojimoji.han_to_zen``.

    The text is cut after runs of sentence-ending marks (！ ？ 。 . or a
    newline). When the document contains 「 or 」, a piece that starts with a
    closing bracket/quote is glued back onto the previous piece.
    Known limitation (from the original comment): a quotation that holds
    several sentences, such as 「行く事は行くがじき帰る。来年の夏休みにはきっと帰る」,
    is still split at the inner 。.

    Returns:
        The result of ``filter(None, ...)`` over the stripped sentences.
    """
    # Prefixes that mark a piece as the tail of the previous sentence. The
    # last entry is the two-character string backslash + "]", as before.
    closers = ('」', '』', '）', '】', '”', '\\]')

    tokenizer = nltk.RegexpTokenizer('[^！？。.\n]*[！？。.\n]*')
    pieces = filter(None, tokenizer.tokenize(doc))
    sentences = map(lambda s: mojimoji.han_to_zen(s.strip()), pieces)
    if not ('「' in doc or '」' in doc):
        return filter(None, sentences)

    merged = []
    current = ''
    for sentence in sentences:
        if sentence.startswith(closers):
            current = current + sentence
        else:
            merged.append(current)
            current = sentence
    merged.append(current)
    return filter(None, map(lambda s: s.strip(), merged))

예제 #11

0

파일 보기

파일: app.py 프로젝트: NLPforCOVID-19/covid-19-api

def update():
    """Update one page record from the JSON request body and return it.

    The request must carry the configured password. The JSON fields are
    forwarded to ``db_handler.update_page``; ``notes`` is stringified and
    passed through ``han_to_zen`` first. The updated record is appended to the
    topic-check log and returned via ``jsonify``.

    Raises:
        InvalidPassword: If the supplied password differs from
            ``cfg["password"]``.
    """
    data = request.get_json()

    if data.get("password") != cfg["password"]:
        raise InvalidPassword("The password is not correct")

    # NOTE(review): a missing "notes" value goes through str(None) before
    # han_to_zen -- confirm that is intended.
    fields = {
        "url": data.get("url"),
        "is_hidden": data.get("is_hidden"),
        "is_about_covid_19": data.get("is_about_COVID-19"),
        "is_useful": data.get("is_useful"),
        "is_about_false_rumor": data.get("is_about_false_rumor"),
        "is_positive": data.get("is_positive"),
        "icountry": data.get("new_displayed_country"),
        "etopics": data.get("new_classes"),
        "notes": han_to_zen(str(data.get("notes"))),
    }
    updated = db_handler.update_page(**fields)

    log_line = json.dumps(updated, ensure_ascii=False)
    log_handler.extend_topic_check_log([log_line])

    return jsonify(updated)

예제 #12

0

파일 보기

def make_skills_from_charasheet(sheet: str, sl_as_limit: bool) -> List[Skill]:
    """Parse the skills out of a character sheet (or a fragment of one).

    Args:
        sheet: Sheet text. When the skill-area begin/end markers are found,
            only the text between them is parsed.
        sl_as_limit: Passed through to ``make_skill_from_text`` for each line.

    Returns:
        The skills for which ``make_skill_from_text`` returned a value, in
        line order.
    """
    # If it seems to be an entire sheet, drop everything outside the skill area
    match_begin = skill_area_begin_regex.search(sheet)
    match_end = skill_area_end_regex.search(sheet)
    begin = 0 if match_begin is None else match_begin.end()
    end = len(sheet) if match_end is None else match_end.start()
    sheet = sheet[begin:end]

    # Escape slashes like '1/Sn', 'SL/Sr', and so on
    prefixes = {str(i) for i in range(20)} | {'sl', 'SL'}
    suffixes = {'sn', 'sr', 'Sn', 'Sr', 'SN', 'SR', 'mp', 'MP', 'Mp'}
    for b in prefixes:
        for a in suffixes:
            sheet = sheet.replace(f'{b}/{a}', f'{b}{replace_text_slash}{a}')

    # Convert with han_to_zen, leaving digits and ASCII untouched
    sheet = mojimoji.han_to_zen(sheet, digit=False, ascii=False)

    # Check lines, keeping only those that yield a skill
    candidates = (
        make_skill_from_text(line, sl_as_limit) for line in sheet.split('\n'))
    skills = [skill for skill in candidates if skill is not None]

    # Repair escaped slashes
    for skill in skills:
        limitation = skill.usage_limitation
        if limitation is not None:
            skill.usage_limitation = limitation.replace(
                replace_text_slash, '/')

    return skills

예제 #13

0

파일 보기

def clensing(text):
    """Clean and normalise a free-text answer.

    Steps, in order: drop ``<...>`` spans, lower-case, drop ``[...]`` spans,
    normalise character widths with mojimoji, apply an ordered list of
    spelling unifications, run ``zen_to_han`` with kana and digits untouched,
    drop 1-2 letter words surrounded by spaces, and drop ``(...)`` spans.
    """
    text = re.sub(r"\<.+?\>", "", text)
    text = text.lower()
    text = re.sub(r"\[.+?\]", "", text)
    # Only digits become half-width; kana and roman letters become full-width
    digits_narrowed = mojimoji.zen_to_han(text, kana=False, ascii=False)
    text = mojimoji.han_to_zen(digits_narrowed, digit=False)

    # Unify the spelling of synonyms. Order matters: longer patterns come
    # before the shorter ones they contain.
    unifications = (
        ("ｂｅｓｔ", "ベスト"),
        ("ｓｕｃｃｅｓｓｓｑｉ", "サクセスｓｑｉ"),
        ("ｅｌｓｅ", "ｅｌｓ"),
        ("ｏｐｅｎｅｓ", "エントリーシート"),
        ("ｏｐｅｎ　ｅｓ", "エントリーシート"),
        ("ｏｅｓ", "エントリーシート"),
        ("ｅｓ", "エントリーシート"),
        ("ｓｅ", "システムエンジニア"),
        ("ｇｄ", "グループディスカッション"),
        ("ｈｐ", "ホームページ"),
        ("ピーアール", "ｐｒ"),
        ("ｐｇ", "プログラマー"),
        ("ｇｃ", "ゲームクリエイター"),
        ("ウェブ", "ｗｅｂ"),
        ("コミュニケーション力", "コミュニケーション能力"),
        ("コニニケーション", "コミュニケーション"),
        ("コミュニティーション", "コミュニケーション"),
        ("かんばる", "頑張る"),
        ("がんばる", "頑張る"),
        ("かんばって", "頑張って"),
        # "gpa3.?" would otherwise be segmented as "gp a3", so add a space
        # after gpa
        ("ｇｐａ", "ｇｐａ "),
        # "ict" is the more current term; avoids spelling variation
        ("ｉｔ", "ｉｃｔ"),
    )
    for pattern, replacement in unifications:
        text = re.sub(pattern, replacement, text)

    text = mojimoji.zen_to_han(text, kana=False, digit=False)
    # Delete words of 1-2 letters, e.g. "I am student." -> "I", "am" removed
    text = re.sub("[ ][a-z]{1,2}[ ]", "", text)
    # Delete the parts enclosed in ( )
    text = re.sub(r"\(.+?\)", "", text)
    return text

예제 #14

0

파일 보기

def re_def(filepass):
    """Yield ``(speaker, speech)`` pairs from a UTF-8 text file.

    A line that starts with "○" opens a new speaker: everything up to the last
    full-width space on that line is the speaker header (with "君" and "○"
    removed to form the name) and is cut from the line. All lines are cleaned
    (half-width characters, full-width symbols, URLs, tags, newlines and
    whitespace removed; remaining punctuation turned into a space) and
    appended to the current speech. Text before the first "○" line is attached
    to the first speaker.

    Args:
        filepass: Path of the file to read; undecodable bytes are ignored.

    Yields:
        tuple: ``(name, cleaned_speech)`` for each speaker, in file order.
    """
    speaker = ""
    with codecs.open(filepass, 'r', encoding='utf-8', errors='ignore') as f:
        speech = ""
        re_num = re.compile(r"[0-9]")
        pattern = "(.*)　(.*)"  # split at a full-width space
        # Cleaning passes, applied to every line in exactly this order.
        cleaners = (
            (re.compile(r'[!-~]'), ""),  # half-width symbols, digits, letters
            (re.compile(r'[︰-＠]'), ""),  # full-width symbols
            (re.compile(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+'), ""),  # URLs
            (re.compile(r"<[^>]*?>"), ""),  # HTML tags
            (re.compile(r'\n'), ""),  # newline characters
            (re.compile(r'[\s+]'), ""),  # a whitespace character or "+"
            # full-width characters the symbol range above does not cover
            (re.compile(
                r'[、。・’〜：＜＞＿｜「」｛｝【】『』〈〉“”○〇〔〕…――――─◇]'), " "),
        )
        seen_speaker = False
        for line in f:
            # A line that starts with a half-width digit is converted with
            # han_to_zen (ASCII left untouched).
            if re_num.match(line):
                line = mojimoji.han_to_zen(line, ascii=False)
            if line.startswith('○'):
                if seen_speaker:
                    yield speaker, speech
                    speech = ""
                header = re.search(pattern, line).group(1)
                speaker = header.replace("君", "").replace("○", "")
                line = line.replace(header, "")
                seen_speaker = True
            for regex, replacement in cleaners:
                line = regex.sub(replacement, line)
            speech += line
        yield speaker, speech

예제 #15

0

파일 보기

파일: sbi_monthly.py 프로젝트: tamata78/stockTool

    def __init__(self):
        """Set up the driver, pick the target month and load login settings.

        The target month comes from the single optional command-line
        argument; without it, the month after the current one is used. Either
        way it is stored in ``self.month`` after ``mojimoji.han_to_zen``.
        With more than one argument, a usage message is printed and the
        process exits.
        """
        self.driver = SeleniumUtils.getChromedriver(__file__)
        self.verificationErrors = []

        # target month setting
        if len(sys.argv) > 2:
            warn_mes = "Sample:\n  python3 sbi_monthly.py [write his memo month]"
            print(warn_mes)
            sys.exit()

        param_han_month = None
        if len(sys.argv) == 2:
            param_han_month = sys.argv[1]
        # Next month, wrapping December (12) around to January (1); the
        # previous `month + 1` produced the non-existent month "13".
        han_month = str(datetime.datetime.today().month % 12 + 1)
        self.month = mojimoji.han_to_zen(
            han_month if param_han_month is None else param_han_month)

        # all member login info
        config = FileUtils.open_file(__file__, "/config.json")
        # "sbib_login_info":{"uid": "user_id", "upa": "user_pass", "uspa": "user_tra_pass"},
        sbib = config["sbib"]
        self.login_info = sbib["sbib_login_info"]
        self.move_money_info = sbib["move_money_info"]

예제 #16

0

파일 보기

파일: format_converter.py 프로젝트: racerandom/PRISMConvert

def extract_txt_from_xls(xls_file, txt_file, split_sent=True, segment=True):
    """Dump the findings text of an XLS report file, optionally segmented.

    Args:
        xls_file: Input spreadsheet, read with ``read_xls``.
        txt_file: Output text file.
        split_sent: When true, the raw text goes to ``tmp.raw`` and is piped
            through the external sentence-splitting scripts into ``tmp.sent``.
        segment: When true, each non-blank line of ``tmp.sent`` is converted
            with ``mojimoji.han_to_zen``, segmented with Juman and written to
            ``txt_file`` as space-separated words.

    NOTE(review): ``tmp.sent`` is only produced when ``split_sent`` is true,
    and both temp files are removed unconditionally at the end -- confirm
    which flag combinations are actually supported.
    """
    from pyknp import Juman
    import mojimoji
    juman = Juman()

    json_dict = read_xls(xls_file)
    raw_path = 'tmp.raw' if split_sent else txt_file
    with open(raw_path, 'w', encoding="utf-8") as raw_out:
        for report in json_dict['読影所見'].values():
            raw_out.write('%s\n' % report['findings'])

    if split_sent:
        script = '''cat tmp.raw | perl sentence-splitter.pl | python split_tnm.py  > tmp.sent'''
        subprocess.Popen(script, shell=True).wait()

    if segment:
        with open('tmp.sent', 'r', encoding='utf-8') as sent_in, \
                open(txt_file, 'w', encoding='utf-8') as seg_out:
            for line in sent_in:
                # Remove every whitespace character from the line.
                compact = ''.join(line.split())
                if compact:
                    analysed = juman.analysis(mojimoji.han_to_zen(compact))
                    words = (w.midasi for w in analysed.mrph_list())
                    seg_out.write('%s\n' % ' '.join(words))

    os.remove('tmp.raw')
    os.remove('tmp.sent')

예제 #17

0

파일 보기

def zenkaku_hankaku(text):  # half-width katakana to full-width; full-width digits/letters to half-width
    """Return ``text`` after a zen_to_han pass and a han_to_zen pass.

    ``zen_to_han`` runs with kana untouched, then ``han_to_zen`` runs with
    digits and ASCII untouched.
    """
    narrowed = mojimoji.zen_to_han(text, kana=False)
    return mojimoji.han_to_zen(narrowed, digit=False, ascii=False)

예제 #18

0

파일 보기

    def __init__(self,
                 bert_tokenizer: BertTokenizer,
                 jp_tokenizer: JumanTokenizer,
                 args,
                 file_path='train',
                 block_size=512):
        """Build, or load from cache, next-sentence-prediction features.

        Each non-blank line of ``file_path`` is paired with either the line
        that follows it (label 0) or a randomly drawn line (label 1). Both
        texts are converted to full-width, tokenized, combined with special
        tokens and padded or truncated to ``block_size``. The four resulting
        lists are pickled next to the input file and reused on later runs.

        Args:
            bert_tokenizer: Used for ``tokenize``, ``convert_tokens_to_ids``
                and ``build_inputs_with_special_tokens``.
            jp_tokenizer: Its ``tokenize`` output is joined with spaces before
                being handed to ``bert_tokenizer``.
            args: Reads ``args.overwrite_cache`` and ``args.nsp_swap_ratio``.
            file_path: Existing text file; its lines are the source texts.
            block_size: Fixed length of every stored example.

        Raises:
            AssertionError: If ``file_path`` is not an existing file, or an
                example does not end up exactly ``block_size`` long.
        """
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        # The cache file name encodes the block size and the source file name.
        cached_features_file = os.path.join(
            directory, 'dialogue_for_nsp' + '_cached_lm_' + str(block_size) +
            '_' + filename)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            # NOTE: pickle.load can execute code from the file; the cache
            # file must come from a trusted run of this class.
            with open(cached_features_file, 'rb') as handle:
                self.examples, \
                self.token_type_ids, \
                self.attention_mask, \
                self.next_sentence_label = pickle.load(handle)
        else:
            # No usable cached feature file: build the dataset from the text file
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            # [CLS] A A A [SEP] B B B [SEP]
            self.token_type_ids = []
            #   0   0 0 0   0   1 1 1  1
            self.attention_mask = []
            #   1   1 1 1   1   1 1 1  1    0 0 0 0 ...
            self.next_sentence_label = []
            # [0, 1] 0: isNext, 1: notNext
            with open(file_path, encoding="utf-8") as f:
                docs = f.readlines()

            # NOTE(review): this list is never used below.
            exsamples = []

            # Translation table from the 94 characters U+0021..U+007E to
            # their counterparts U+FF01..U+FF5E.
            ZEN = "".join(chr(0xff01 + i) for i in range(94))
            HAN = "".join(chr(0x21 + i) for i in range(94))

            HAN2ZEN = str.maketrans(HAN, ZEN)

            num_doc = len(docs)
            for idx, line in enumerate(docs):

                text = line.rstrip(os.linesep)

                # A pair is skipped when either side is blank or when the
                # current line is the last one.
                if text == "":
                    continue
                try:
                    next_text = docs[idx + 1].rstrip(os.linesep)
                except IndexError:
                    continue
                if next_text == "":
                    continue

                # NOTE(review): a random partner is drawn when
                # random() > nsp_swap_ratio -- confirm this matches the
                # intended meaning of the option.
                if random.random() > args.nsp_swap_ratio:
                    # Redraw until the candidate is non-blank and is not the
                    # true next line.
                    while True:
                        rand_idx = random.randrange(0, num_doc)
                        next_text = docs[rand_idx].rstrip(os.linesep)
                        if (not next_text == "") and (rand_idx != idx + 1):
                            break
                    nsp_label = 1
                    # random sequence
                else:
                    nsp_label = 0
                    # continuation sequence
                # Workaround for Juman errors: replace half-width spaces with
                # full-width ones and convert digits/ASCII to full-width
                text = text.replace(' ', '　')
                next_text = next_text.replace(' ', '　')
                text = mojimoji.han_to_zen(text,
                                           kana=False,
                                           digit=True,
                                           ascii=True)
                next_text = mojimoji.han_to_zen(next_text,
                                                kana=False,
                                                digit=True,
                                                ascii=True)
                text = text.translate(HAN2ZEN)
                next_text = next_text.translate(HAN2ZEN)
                # Turn the original text into its segmented form

                # Skip pairs where either text exceeds 4096 UTF-8 bytes.
                # NOTE(review): presumably an input-size limit of the Juman
                # tokenizer -- confirm.
                if len(text.encode('utf-8')) > 4096 or len(
                        next_text.encode('utf-8')) > 4096:
                    continue

                first_tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                    bert_tokenizer.tokenize(" ".join(
                        jp_tokenizer.tokenize(text))))
                second_tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                    bert_tokenizer.tokenize(" ".join(
                        jp_tokenizer.tokenize(next_text))))

                fst_len = len(first_tokenized_text)
                scd_len = len(second_tokenized_text)
                # for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
                #    self.examples.append(bert_tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                # add special tokens
                # A A A (B B B) ->  [CLS] A A A [SEP] (B B B [SEP])
                added_special = bert_tokenizer.build_inputs_with_special_tokens(
                    token_ids_0=first_tokenized_text,
                    token_ids_1=second_tokenized_text)
                # token type ids: 0 for the first segment plus two special
                # tokens, 1 for the second segment plus one special token
                # (assumes the special-token layout sketched above)
                type_ids = [0] * (2 + fst_len)
                scd_type = [1] * (1 + scd_len)
                type_ids += scd_type

                attention_mask = [1] * len(added_special)

                # Zero-pad up to the sequence length, or cut the tail off
                # when the pair is longer than block_size.
                diff = block_size - len(added_special)
                if diff < 0:
                    added_special = added_special[:diff]
                    type_ids = type_ids[:diff]
                    attention_mask = attention_mask[:diff]
                else:
                    padding = [0] * (block_size - len(added_special))
                    padding_1 = [0] * (block_size - len(added_special))
                    padding_2 = [0] * (block_size - len(added_special))
                    added_special += padding
                    type_ids += padding_1
                    attention_mask += padding_2

                assert len(added_special) == block_size
                assert len(type_ids) == block_size
                assert len(attention_mask) == block_size

                self.examples.append(added_special)
                self.token_type_ids.append(type_ids)
                self.attention_mask.append(attention_mask)
                self.next_sentence_label.append(nsp_label)

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            # Store the four lists in the order the cache-loading branch
            # unpacks them.
            with open(cached_features_file, 'wb') as handle:
                pickle.dump([
                    self.examples, self.token_type_ids, self.attention_mask,
                    self.next_sentence_label
                ],
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)

예제 #19

0

파일 보기

파일: tasks.py 프로젝트: hitochan777/ldc-tool

def h2z(filename):
    """Print every line of ``filename`` converted by ``mojimoji.han_to_zen``."""
    with open(filename, "r") as source:
        for raw_line in source:
            converted = mojimoji.han_to_zen(raw_line)
            # print adds nothing at the end; the line is written as converted.
            print(converted, end="")

예제 #20

0

파일 보기

def juman_tokenize(line, tagger=False):
    """Tokenize ``line`` with ``common_tokenize`` after width conversion.

    The line is converted with ``mojimoji.han_to_zen`` and every full-width
    space (U+3000) is then replaced by a half-width space. ``tagger`` is
    accepted but not used here.
    """
    widened = mojimoji.han_to_zen(line)
    spaced = widened.replace("\u3000", " ")
    return common_tokenize(spaced)

예제 #21

0

파일 보기

파일: jp_parser.py 프로젝트: hikaruya8/emoline

 def normalize(self, src_text):
     """Return ``src_text`` width-normalised and lower-cased.

     ``han_to_zen`` runs with digits and ASCII untouched, then ``zen_to_han``
     runs with kana untouched.
     """
     kana_widened = mojimoji.han_to_zen(src_text, digit=False, ascii=False)
     ascii_narrowed = mojimoji.zen_to_han(kana_widened, kana=False)
     return ascii_narrowed.lower()

예제 #22

0

파일 보기

파일: haiti_module.py 프로젝트: R-adachi/twitter

def check(target_feature, text_list):
    """Predict a label for each text with the saved ``target_feature`` model.

    Each text is converted with ``mojimoji.zen_to_han``, parsed with MeCab and
    turned into per-word counts. The counts are joined with the saved feature
    table, restricted to the saved row indices, divided by the text's token
    count and passed to the pickled model.

    Args:
        target_feature: Name used to build the model and feature file paths
            under ``../model``. ``'day'`` and ``'moji'`` get special
            post-processing of the prediction.
        text_list: Iterable of texts to classify.

    Returns:
        For ``'day'``: a list of 1/2 (2 when the prediction exceeds 0.5).
        For ``'moji'``: a list of characters, each passed through
        ``mojimoji.han_to_zen(..., ascii=False)``.
        Otherwise: the ``np.argmax(..., axis=1)`` array of class indices.
    """
    # The label column is spelled 'mozi' for the 'moji' feature.
    col = 'mozi' if target_feature == 'moji' else target_feature

    bodies = list(map(mojimoji.zen_to_han, text_list))
    model_path = '../model'

    judge_num = len(bodies)
    # Rows: dummy tweet ids, the texts, and a placeholder label column.
    l = [[10000 + i for i in range(judge_num)], bodies, [0] * judge_num]
    l_np = np.array(l).T
    columns = ['tweet_id', 'body', col]
    df_judge = pd.DataFrame(data=l_np, columns=columns)

    feature_ds = pd.read_pickle(model_path + '/data/haiti_' + target_feature +
                                '_feature_ds.plk')
    indices = np.loadtxt(model_path + '/data/haiti_' + target_feature +
                         '_indices.csv',
                         delimiter=',')

    m = MeCab.Tagger("")

    # Run morphological analysis on each document
    length_list = []
    # Separate name so the ``text_list`` argument is no longer shadowed.
    count_frames = []
    for sentence in df_judge["body"]:
        ma = m.parse(sentence)
        # Keep only the surface word of each analysed line
        word_list = [text.split("\t")[0] for text in ma.split("\n")]
        # Number of words in this document
        length_list.append(len(word_list))
        # Word frequencies for this document
        data = collections.Counter(word_list)
        text_data = pd.DataFrame.from_dict(data, orient='index')
        count_frames.append(text_data)

    feature = pd.concat([feature_ds] + count_frames, axis=1)
    # Replace NaN with 0
    feature = feature.fillna(0)

    # For each document, the counts of the selected words divided by that
    # document's word count become the model inputs.
    modi_feature = []
    for index, row in feature.iloc[indices].T[-judge_num:].reset_index(
            drop=True).iterrows():
        modi_feature_temp = row / length_list[index]
        modi_feature.append(modi_feature_temp)
    modi_feature = pd.concat(modi_feature, axis=1).T
    # Join each document with its features
    df_judge_feature = pd.concat([df_judge, modi_feature], axis=1)

    df_judge_feature = df_judge_feature.drop(["tweet_id", "body"], axis=1)

    judge_x = df_judge_feature.drop(col, axis=1)

    # Close the model file after loading; it used to be left open.
    # NOTE: pickle.load can execute code from the file, so only trusted model
    # files must be placed under model_path.
    with open(model_path + '/haiti_' + target_feature + '_model.pkl',
              'rb') as model_file:
        model = pickle.load(model_file)
    judge = model.predict(judge_x)
    judge_list = []
    if (target_feature == 'day'):
        for i in judge:
            if (i > 0.5):
                judge_list.append(2)
            else:
                judge_list.append(1)
    else:
        judge_list = np.argmax(judge, axis=1)
        if (target_feature == 'moji'):
            # Class index -> character: A-Z, then the code-point range
            # あ..ん, then the code-point range ｱ..ﾝ.
            all_moji_list = [
                chr(i) for i in range(ord('A'),
                                      ord('Z') + 1)
            ] + [chr(i) for i in range(ord('あ'),
                                       ord('ん') + 1)
                 ] + [chr(i) for i in range(ord('ｱ'),
                                            ord('ﾝ') + 1)]
            judge_moji_num = copy(judge_list)
            judge_list = []
            for moji_index in judge_moji_num:
                judge_moji = all_moji_list[moji_index]
                judge_list.append(mojimoji.han_to_zen(judge_moji, ascii=False))

    return judge_list

예제 #23

0

파일 보기

 def drawItems(self):
     """Draw the item window and one text row per entry of the item list."""
     frame = (200, 20, 180, 215)
     # Two passes over the same rectangle: first without a width argument,
     # then with width 5.
     pygame.draw.rect(self.screen, BLACK, Rect(*frame))
     pygame.draw.rect(self.screen, WHITE, Rect(*frame), 5)
     # Rows are spaced 40 pixels apart starting at y = 20; element 1 of each
     # item is the text that gets drawn.
     for row, item in enumerate(self.player.item_list):
         label = moji.han_to_zen(item[1])
         drawChar(self.screen, label, 240, 20 + row * 40)

예제 #24

0

파일 보기

def download():
    """Download Japan Post postal-code data and convert it to UTF-8 CSV files.

    Three archives are fetched over HTTP, saved into the current directory
    and extracted there:

    * ``ken_all_rome.zip`` -- ``KEN_ALL_ROME.CSV`` is read to build an
      in-memory lookup of romanized city/town names.
    * ``ken_all.zip`` -- ``KEN_ALL.CSV`` is rewritten as
      ``KEN_ALL_UTF8.CSV``; a kana lookup is built along the way.
    * ``jigyosyo.zip`` -- ``JIGYOSYO.CSV`` is rewritten as
      ``JIGYOSYO_UTF8.CSV``, enriched from the two lookups above.

    Output rows are written as comma-joined f-strings (no CSV quoting).

    Raises:
        Exception: if any of the three HTTP responses has a status code
            other than 200.
    """

    # Download the postal-code data (romanized)
    response = requests.get(
        "https://www.post.japanpost.jp/zipcode/dl/roman/ken_all_rome.zip?190712"
    )
    if response.status_code != 200:
        e = Exception(f"HTTP status : {response.status_code}")
        raise e

    # Write the downloaded data to a file
    with open("ken_all_rome.zip", "wb") as file:
        file.write(response.content)

    # Extract the zip file
    with zipfile.ZipFile("./ken_all_rome.zip") as zip:
        zip.extractall("./")

    # Romanized-name lookup, keyed by "prefecture,city,town," (a town-less
    # key "prefecture,city,," is stored as well).
    rome_dic = {}
    rome_file_path = "./KEN_ALL_ROME.CSV"
    with codecs.open(rome_file_path, "r", encoding="shift-jis") as rome:
        reader = csv.reader(rome)

        for row in reader:
            zip_code = row[0]
            prefecture_name = row[1]
            # Full-width spaces are removed from the city name.
            city_name = row[2].replace("　", "")
            # Keep only the text before the first full-width "（", then drop
            # full-width spaces and the placeholder phrase.
            town_name = re.sub(r'^(.*?)(（.*)?$', r'\1',
                               row[3]).replace("　",
                                               "").replace("以下に掲載がない場合", "")
            prefecture_rome_name = row[4]
            city_rome_name = row[5].replace(" ", "-").lower()
            # Same trimming for the romanized town name, using ASCII "(".
            town_rome_name = re.sub(r'^(.*?)(\(.*)?$', r'\1', row[6]).replace(
                " ", "").replace("IKANIKEISAIGANAIBAAI", "").lower()

            # City-level entry (empty town component).
            rome_dic[f"{prefecture_name},{city_name},,"] = {
                "prefecture_name": prefecture_rome_name,
                "city_name": city_rome_name,
                "town_name": ""
            }
            # Town-level entry.
            rome_dic[f"{prefecture_name},{city_name},{town_name},"] = {
                "prefecture_name": prefecture_rome_name,
                "city_name": city_rome_name,
                "town_name": town_rome_name
            }

    # Download the postal-code data
    response = requests.get(
        "https://www.post.japanpost.jp/zipcode/dl/kogaki/zip/ken_all.zip")
    if response.status_code != 200:
        e = Exception(f"HTTP status : {response.status_code}")
        raise e

    # Write the downloaded data to a file
    with open("ken_all.zip", "wb") as file:
        file.write(response.content)

    # Extract the zip file
    with zipfile.ZipFile("./ken_all.zip") as zip:
        zip.extractall("./")

    # Character-encoding conversion (shift-jis -> utf-8)
    src_file_path = "./KEN_ALL.CSV"
    dest_file_path = "./KEN_ALL_UTF8.CSV"

    # Kana lookup with the same key layout as rome_dic; filled while
    # converting KEN_ALL.CSV and consulted by the JIGYOSYO pass below.
    kana_dic = {}
    with codecs.open(src_file_path, "r",
                     encoding="shift-jis") as src, codecs.open(
                         dest_file_path, "w", encoding="utf-8") as dest:
        reader = csv.reader(src)
        # State of the record currently being accumulated. A record is only
        # written once the following row shows that it is complete.
        area_code = ""
        zip_code = ""
        city_kana_name = ""
        town_kana_name = ""
        prefecture_name = ""
        city_name = ""
        town_name = ""
        town_short_name = ""
        town_ext_name = ""
        city_rome_name = ""
        town_rome_name = ""
        town_duplicate_flag = ""
        town_multi_flag = ""  # NOTE(review): assigned but never read in this function
        # Per-postal-code counter used as the branch number in the output.
        zip_code_branch_no = {}
        same_zip_code = False
        rows = []  # NOTE(review): never used in this function
        building_flag = 0
        exclude_building = False
        building_prefix = ""
        building_kana_prefix = ""

        for row in reader:
            # The row continues the pending record when it has the same
            # postal code and the accumulated town name contains a
            # full-width "（" but does not yet end with "）".
            if zip_code.replace("-", "") == row[2] and ("（" in town_name) and (
                    not town_name.endswith("）")):
                same_zip_code = True
            else:
                same_zip_code = False

            # The pending record is complete (and is not the initial empty
            # state): split its names, classify it and write it out.
            if not same_zip_code and zip_code:

                if town_name == "以下に掲載がない場合":
                    # Placeholder town name: keep it as the "ext" part only.
                    town_short_name = ""
                    town_ext_name = town_name
                    town_short_kana_name = ""
                    town_ext_kana_name = town_kana_name
                else:
                    # Short name = town name without the full-width
                    # parenthesised part; ext name = what remains.
                    town_short_name = re.sub(r"（.*）", "", town_name)
                    town_ext_name = town_name.replace(town_short_name, "", 1)
                    town_short_kana_name = re.sub(r"（.*）", "", town_kana_name)
                    town_ext_kana_name = town_kana_name.replace(
                        town_short_kana_name, "", 1)

                if town_ext_name == "（次のビルを除く）":
                    # This record opens a building group: remember its short
                    # name as the prefix for the records that follow.
                    exclude_building = True
                    building_flag = 0
                    town_duplicate_flag = 1
                    building_prefix = town_short_name
                    building_kana_prefix = town_short_kana_name
                elif exclude_building and town_short_name.startswith(
                        building_prefix) and town_ext_name != "":
                    # Member of the open building group: the short name is
                    # reset to the group prefix and the remainder is moved
                    # into the ext name.
                    building_flag = 1
                    town_duplicate_flag = 1
                    town_ext_name = f"{town_short_name}{town_ext_name}".replace(
                        building_prefix, "", 1)
                    town_short_name = building_prefix
                    town_ext_kana_name = f"{town_short_kana_name}{town_ext_kana_name}".replace(
                        building_kana_prefix, "", 1)
                    town_short_kana_name = building_kana_prefix
                else:
                    # Any other record closes the building group.
                    building_flag = 0
                    exclude_building = False

                if zip_code in zip_code_branch_no:
                    zip_code_branch_no[zip_code] += 1
                else:
                    zip_code_branch_no[zip_code] = 1

                # Romanized names are looked up by the short town name; both
                # are left empty when there is no match.
                key = f"{prefecture_name},{city_name},{town_short_name},"
                if key in rome_dic:
                    city_rome_name = rome_dic[key]["city_name"]
                    town_rome_name = rome_dic[key]["town_name"]
                else:
                    city_rome_name = ""
                    town_rome_name = ""

                kana_dic[f"{prefecture_name},{city_name},,"] = {
                    "city_name": city_kana_name,
                    "town_name": ""
                }
                kana_dic[
                    f"{prefecture_name},{city_name},{town_short_name},"] = {
                        "city_name": city_kana_name,
                        "town_name": town_short_kana_name
                    }
                dest.write(
                    f"{zip_code},{zip_code_branch_no[zip_code]},{area_code},{prefecture_name},{city_name},{city_kana_name},{city_rome_name},{town_short_name},{town_short_kana_name},{town_rome_name},{town_ext_name},{town_ext_kana_name},{town_duplicate_flag},{building_flag}\n"
                )

            # Load the current row into the pending-record state. For a
            # continuation row the town names are appended to the values
            # accumulated so far instead of replacing them.
            area_code = row[0]  # nationwide local government code
            # xxx = row[1] # former 5-digit postal code
            zip_code = re.sub(r'([0-9]{3})([0-9]{4})', r'\1-\2',
                              row[2])  # postal code
            # xxx = row[3] # prefecture name (half-width katakana)
            city_kana_name = mojimoji.han_to_zen(row[4])  # municipality name (half-width katakana)
            town_kana_name = row[
                5] if not same_zip_code else f"{town_kana_name}{row[5]}"  # town-area name (half-width katakana)
            town_kana_name = mojimoji.han_to_zen(town_kana_name)
            prefecture_name = row[6]  # prefecture name
            city_name = row[7]  # municipality name
            town_name = row[
                8] if not same_zip_code else f"{town_name}{row[8]}"  # town area
            town_name = town_name.replace("−", "-").replace("〜", "～")
            town_duplicate_flag = row[
                9]  # flag: one town area is covered by two or more postal codes (note 3) ("1" = applies, "0" = does not apply)
            # xxx = row[10] # flag: town area whose lot numbers are assigned per koaza (note 4) ("1" = applies, "0" = does not apply)
            # xxx = row[11] # flag: town area that has chome ("1" = applies, "0" = does not apply)
            town_multi_flag = row[
                12]  # flag: one postal code covers two or more town areas (note 5) ("1" = applies, "0" = does not apply)
            # xxx = row[13] # update indicator (note 6) ("0" = unchanged, "1" = changed, "2" = abolished (used only in abolition data))
            # xxx = row[14] # reason for change ("0" = unchanged, "1" = city/ward/town administration, ward split or designated-city enforcement, "2" = residential addressing implemented, "3" = land readjustment, "4" = postal-district adjustment etc., "5" = correction, "6" = abolished (used only in abolition data))

        # Flush the last pending record; the loop only writes a record when
        # a later row arrives.
        # NOTE(review): unlike the in-loop path, this block has no special
        # case for "以下に掲載がない場合" or for "（次のビルを除く）" -- confirm
        # that is acceptable for the final row of the file.
        town_short_name = re.sub(r"（.*）", "", town_name)
        town_ext_name = town_name.replace(town_short_name, "", 1)
        town_short_kana_name = re.sub(r"（.*）", "", town_kana_name)
        town_ext_kana_name = town_kana_name.replace(town_short_kana_name, "",
                                                    1)

        if exclude_building and town_short_name.startswith(
                building_prefix) and town_ext_name != "":
            building_flag = 1
            town_duplicate_flag = 1
            town_ext_name = f"{town_short_name}{town_ext_name}".replace(
                building_prefix, "", 1)
            town_short_name = building_prefix
            town_ext_kana_name = f"{town_short_kana_name}{town_ext_kana_name}".replace(
                building_kana_prefix, "", 1)
            town_short_kana_name = building_kana_prefix
        else:
            building_flag = 0
            exclude_building = False

        if zip_code in zip_code_branch_no:
            zip_code_branch_no[zip_code] += 1
        else:
            zip_code_branch_no[zip_code] = 1

        key = f"{prefecture_name},{city_name},{town_short_name},"
        if key in rome_dic:
            city_rome_name = rome_dic[key]["city_name"]
            town_rome_name = rome_dic[key]["town_name"]
        else:
            city_rome_name = ""
            town_rome_name = ""

        kana_dic[f"{prefecture_name},{city_name},,"] = {
            "city_name": city_kana_name,
            "town_name": ""
        }
        kana_dic[f"{prefecture_name},{city_name},{town_short_name},"] = {
            "city_name": city_kana_name,
            "town_name": town_short_kana_name
        }
        dest.write(
            f"{zip_code},{zip_code_branch_no[zip_code]},{area_code},{prefecture_name},{city_name},{city_kana_name},{city_rome_name},{town_short_name},{town_short_kana_name},{town_rome_name},{town_ext_name},{town_ext_kana_name},{town_duplicate_flag},{building_flag}\n"
        )

    # Download the postal-code data (large-volume business offices)
    response = requests.get(
        "https://www.post.japanpost.jp/zipcode/dl/jigyosyo/zip/jigyosyo.zip")
    if response.status_code != 200:
        e = Exception(f"HTTP status : {response.status_code}")
        raise e

    # Write the downloaded data to a file
    with open("jigyosyo.zip", "wb") as file:
        file.write(response.content)

    # Extract the zip file
    with zipfile.ZipFile("./jigyosyo.zip") as zip:
        zip.extractall("./")

    # Character-encoding conversion (the source is opened as cp932 here,
    # the output is written as utf-8)
    src_file_path = "./JIGYOSYO.CSV"
    dest_file_path = "./JIGYOSYO_UTF8.CSV"

    with codecs.open(src_file_path, "r", encoding="cp932") as src, codecs.open(
            dest_file_path, "w", encoding="utf-8") as dest:
        reader = csv.reader(src)
        area_code = ""
        office_name = ""
        office_kana_name = ""
        zip_code = ""
        prefecture_name = ""
        city_name = ""
        city_kana_name = ""
        city_rome_name = ""
        town_name = ""
        town_kana_name = ""
        town_ext_name = ""
        town_rome_name = ""
        office_flag = 0
        post_office_box_flag = 0
        # Branch counter restarts for this file.
        zip_code_branch_no = {}
        rows = []  # NOTE(review): never used in this function

        # Unlike the KEN_ALL pass, every input row yields one output row.
        for row in reader:

            area_code = row[0]  # nationwide local government code
            office_kana_name = mojimoji.han_to_zen(row[1])  # business office name (kana)
            office_name = row[2]  # business office name (kanji)
            prefecture_name = row[3]  # prefecture name
            city_name = row[4]  # municipality name
            town_name = row[5]  # town-area name
            town_kana_name = ""
            town_rome_name = ""
            town_ext_name = row[6]  # koaza name, chome, lot number, etc.
            town_ext_kana_name = ""
            zip_code = re.sub(r'([0-9]{3})([0-9]{4})', r'\1-\2',
                              row[7])  # postal code
            # xxx = row[8] # former 5-digit postal code
            # xxx = row[9] # handling post office
            office_flag = 1 if row[10] == "0" else 0  # "0" = large-volume business office, "1" = P.O. box
            post_office_box_flag = 1 if row[10] == "1" else 0  # "0" = large-volume business office, "1" = P.O. box
            # xxx = row[11] # multiple-number indicator
            # xxx = row[12] # revision code

            #city_kana_name = row[4] # municipality name (half-width katakana)
            #town_kana_name = row[5] if not same_zip_code else f"{town_kana_name}{row[5]}" # town-area name (half-width katakana)

            if zip_code in zip_code_branch_no:
                zip_code_branch_no[zip_code] += 1
            else:
                zip_code_branch_no[zip_code] = 1

            # Two hard-coded town-name rewrites applied before any lookup.
            key = f"{prefecture_name},{city_name},{town_name},"
            if key == "東京都,千代田区,猿楽町,":
                town_name = "神田猿楽町"
            elif key == "東京都,千代田区,三崎町,":
                town_name = "神田三崎町"

            # Romanized names: exact key first, then one spelling variant of
            # the town name, then the city-level key, else empty strings.
            key = f"{prefecture_name},{city_name},{town_name},"
            if key in rome_dic:
                city_rome_name = rome_dic[key]["city_name"]
                town_rome_name = rome_dic[key]["town_name"]
            else:
                # Only the first matching rule produces a variant: swap
                # large/small "ケ"/"ヶ", strip "字"/"大字", or (for names
                # containing "通") take the part of town_ext_name before the
                # first full-width digit.
                town_name_exclude_aza = ""
                if "ケ" in town_name:
                    town_name_exclude_aza = town_name.replace("ケ", "ヶ")
                elif "ヶ" in town_name:
                    town_name_exclude_aza = town_name.replace("ヶ", "ケ")
                elif "字" in town_name:
                    town_name_exclude_aza = re.sub(r"(大)?字", "", town_name)
                elif "通" in town_name:
                    town_name_exclude_aza = re.sub(r"(.*?)[０１２３４５６７８９].*",
                                                   r"\1", town_ext_name)

                key = f"{prefecture_name},{city_name},{town_name_exclude_aza},"
                if town_name_exclude_aza and key in rome_dic:
                    city_rome_name = rome_dic[key]["city_name"]
                    if "通" in town_name:
                        # The match came from town_ext_name, so no romanized
                        # town name is emitted.
                        town_rome_name = ""
                    else:
                        town_rome_name = rome_dic[key]["town_name"]
                else:
                    key = f"{prefecture_name},{city_name},,"
                    if key in rome_dic:
                        city_rome_name = rome_dic[key]["city_name"]
                        town_rome_name = rome_dic[key]["town_name"]
                    else:
                        city_rome_name = ""
                        town_rome_name = ""

            # Kana names: same lookup cascade against kana_dic.
            key = f"{prefecture_name},{city_name},{town_name},"
            if key in kana_dic:
                city_kana_name = kana_dic[key]["city_name"]
                town_kana_name = kana_dic[key]["town_name"]
            else:
                town_name_exclude_aza = ""
                if "ケ" in town_name:
                    town_name_exclude_aza = town_name.replace("ケ", "ヶ")
                elif "ヶ" in town_name:
                    town_name_exclude_aza = town_name.replace("ヶ", "ケ")
                elif "字" in town_name:
                    town_name_exclude_aza = re.sub(r"(大)?字", "", town_name)
                elif "通" in town_name:
                    town_name_exclude_aza = re.sub(r"(.*?)[０１２３４５６７８９].*",
                                                   r"\1", town_ext_name)

                key = f"{prefecture_name},{city_name},{town_name_exclude_aza},"
                if town_name_exclude_aza and key in kana_dic:
                    city_kana_name = kana_dic[key]["city_name"]
                    if "通" in town_name:
                        # Here the matched kana goes into the ext-kana
                        # column rather than the town-kana column.
                        town_ext_kana_name = kana_dic[key]["town_name"]
                    else:
                        town_kana_name = kana_dic[key]["town_name"]
                else:
                    key = f"{prefecture_name},{city_name},,"
                    if key in kana_dic:
                        city_kana_name = kana_dic[key]["city_name"]
                        town_kana_name = kana_dic[key]["town_name"]
                    else:
                        city_kana_name = ""
                        town_kana_name = ""

            dest.write(
                f"{zip_code},{zip_code_branch_no[zip_code]},{area_code},{prefecture_name},{city_name},{city_kana_name},{city_rome_name},{town_name},{town_kana_name},{town_rome_name},{town_ext_name},{town_ext_kana_name},{office_name},{office_kana_name},{office_flag},{post_office_box_flag}\n"
            )

예제 #25

0

파일 보기

 def preprocess(self, text):
     """Return ``text`` with ``&...;`` sequences blanked and widened.

     Every run matching ``&[^;]+;`` is replaced by a single space, then the
     result is passed through ``mojimoji.han_to_zen`` with ``digit=False``.
     """
     without_entities = re.sub("&[^;]+;", " ", text)
     return mojimoji.han_to_zen(without_entities, digit=False)

예제 #26

0

파일 보기

파일: convertChinesePartToZenkaku.py 프로젝트: hitochan777/ldc-tool

#!/usr/bin/env python3

import sys
import argparse

import mojimoji

"""
"""

if __name__ == "__main__":
    # Echo the input file line by line (stripped). For lines that start
    # with "zh:", the first space-separated token is dropped and every
    # remaining token is passed through mojimoji.han_to_zen before the line
    # is printed again behind a "zh: " prefix.
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input file")
    args = parser.parse_args()
    with open(args.input, "r") as infile:
        for raw_line in infile:
            line = raw_line.strip()
            if not line.startswith("zh:"):
                print(line)
                continue

            converted = [
                mojimoji.han_to_zen(token) for token in line.split(" ")[1:]
            ]
            print("zh: %s" % (" ".join(converted)))

예제 #27

0

파일 보기

파일: tokenizers.py 프로젝트: laboroai/Laboro-BERT-Japanese

def tokenize_jumandic(text):
    """Parse ``text`` with ``tagger_jumandic`` after width normalization.

    The text is passed through ``mojimoji.han_to_zen``, ideographic spaces
    (U+3000) are replaced by ASCII spaces, and trailing newlines are
    stripped from the tagger output.
    """
    widened = mojimoji.han_to_zen(text)
    normalized = widened.replace('\u3000', ' ')
    parsed = tagger_jumandic.parse(normalized)
    return parsed.rstrip('\n')

예제 #28

0

파일 보기

파일: data_factory.py 프로젝트: yuta-portfolio/portfolio

def make_stopwords():
    u"""コピペ用ストップワードを作成して表示

    """
    import mojimoji
    import cnvk
    stopwords=set()
    hira=u"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもらりるれろやゐゆゑよわをんがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽぁぃぅぇぉゃゅょっゔ"
    kata=[]
    for h in hira:
        kata.append(cnvk.convert(h,cnvk.HIRA2KATA,cnvk.Z_KATA))
    kata.append(u"ヴ")
    hankata=[]
    for k in kata:
        hankata.append(mojimoji.zen_to_han(k))
    kazu=u"0123456789"
    stopwords.add(u"10")
    stopwords.add(u"11")
    stopwords.add(u"12")
    stopwords.add(u"13")
    stopwords.add(u"14")
    stopwords.add(u"15")
    stopwords.add(u"16")
    stopwords.add(u"17")
    stopwords.add(u"18")
    stopwords.add(u"19")
    stopwords.add(u"20")
    stopwords.add(u"１０")
    stopwords.add(u"１１")
    stopwords.add(u"１２")
    stopwords.add(u"１３")
    stopwords.add(u"１４")
    stopwords.add(u"１５")
    stopwords.add(u"１６")
    stopwords.add(u"１７")
    stopwords.add(u"１８")
    stopwords.add(u"１９")
    stopwords.add(u"２０")
    zenkazu=mojimoji.han_to_zen(kazu)
    kazukan=u"一二三四五六七八九十百千万億兆"
    minialpha=u"abcdefghijklmnopqlstuvwxyz"
    bigalpha=u"ABCDEFGHIJKLMNOPQLSTUVWXYZ"
    han_minialpha=mojimoji.han_to_zen(minialpha)
    han_bigalpha=mojimoji.han_to_zen(bigalpha)
    hiramoji=[u"する",u"なる",u"てる",u"れる",u"やる",u"いる",u"さん",u"なん",u"くん",u"それ",u"こと",\
              u"ちゃん",u"ある",u"これ",u"して",u"くれる",u"くださる",u"そう",u"せる",u"した",u"いか",\
              u"ので",u"よう",u"てるん",u"もん",u"られる",u"あそこ",u"あたり",u"あちら",u"あっち",u"あと",\
              u"あな",u"あなた",u"あれ",u"いくつ",u"いつ",u"いま",u"いろいろ",u"うち",u"おおまか",u"おまえ",u"おれ",
              u"がい",u"かく",u"かたちの",u"かやの",u"から",u"がら",u"きた",u"こせ",u"ここ",u"こっち",u"こと",u"ごと",\
              u"こちら",u"これ",u"これら",u"ごろ",u"さまざま",u"さらい",u"しかた",u"しよう",u"すか",u"ずつ",u"すね",\
              u"そう",u"そこ",u"そちら",u"そっち",u"そで",u"それ",u"それぞれ",u"それなり",u"たくさん",u"たち",u"たび",\
              u"ため",u"ちゃ",u"てん",u"とおり",u"とき",u"どこ",u"どこか",u"ところ",u"どちら",u"どれ",u"なか",u"なかば",\
              u"なに",u"など",u"なん",u"はじめ",u"はず",u"はるか",u"ひと",u"ひとつ",u"ふく",u"ぶり",u"べつ",u"へん",u"べん",\
              u"ほう",u"ほか",u"まさ",u"まし",u"まとも",u"まま",u"みたい",u"みつ",u"みなさん",u"みんな",u"もと",u"もの",\
              u"もん",u"やつ",u"よう",u"よそ",u"わけ",u"わたし",u"くる",u"すぎる",u"れる",u"いう",u"くださる",u"ちゃう",\
              u"つく",u"せる",u"てるん",u"すぎ",u"ところ",u"おれ",u"ぼく",u"わたし",u"てる",u"しまう",u"みる",
              ]

    katamoji=[]
    for h in hiramoji:
        katamoji.append(cnvk.convert(h,cnvk.HIRA2KATA,cnvk.Z_KATA))
    han_katamoji=[]
    for k in katamoji:
        han_katamoji.append(mojimoji.zen_to_han(k))

    kanmoji=["笑","今","気","今日","明日","方","人","俺","私","僕","時","思う","行く","言う","見る","出す","年","月","日","分","秒","週","火","水","木","金","土","国","都",\
             "道","府","県","市","区","町","村","各","第","何","的","度","達","誰","者","類","用","別","等","際","系","品","化","所","毎","回","匹","個","席","束","歳","円","毎",\
             "前","後","左","右","次","先","春","夏","秋","冬","下記","上記","時間","今回","前回","場合","自分","ヶ所","ヵ所","カ所","箇所","ヶ月","カ月","箇月","名前","本当","確か","時点",\
             "様々","結局","半ば","以前","以後","以降","未満","以上","以下","毎日","自体","何人","手段","感じ","同じ","点","君"]

    h_kigou=cnvk.H_KIGO
    kigou=[]
    for h in h_kigou:
        for x in h:
            kigou.append(x)
    kigou.append(u"ω")
    kigou.append(u'ー')
    kigou.append(u"д")

    #参考 内容推測に適したキーワード抽出のための日本語ストップワード(https://www.jstage.jst.go.jp/article/jjske/12/4/12_511/_pdf)
    kokubu_words=[u"ない",u"高い",u"多い",u"少ない","強い","大きい","小さい","長い","ながい",
                  u"良い",u"よい",u"いい","悪い",
                  u"ある","いる","なる","行く","いく","来る","とる",
                  "見る","みる","言う","いう","得る","過ぎる","すぎる",
                  "する","やる","行なう","行う","おこなう","出来る","できる",
                  "おもう","思う","考える","かんがえる","わかる","見える",
                  "知る","しれる","いえる","示す","述べる","書く","かく","よる",
                  "異なる","違う","ちがう","くらべる",
                  "入れる","出る","でる","入る","はいる",
                  "使う","用いる","もちいる","持つ","もつ","作る","つくる",
                  "なす","起こる","おこる","つく","つける","聞く","よぶ",
                  "かれる","つまり","上","下","次","つぎ",
                  "わが国","自分","人々","人びと","別","他","間","話","例","形","日","家","手","名","身",
                  "そのもの","一つ","あと",

                  #2016/01/24 更に偏在度の高いものと、忘れてたひらがなを追加
                  "きゃ","きゅ","きょ","しゃ","しゅ","しょ","ちゃ","ちゅ","ちょ","にゃ","にゅ","にょ",
                  "ひゃ","ひゅ","ひょ","みゃ","みゅ","みょ","りゃ","りゅ","りょ","ゎ",
                  "事","目","とこ","中","字","お前","全部","きみ","もらう",
                  ]

    for h in hira:
        stopwords.add(h)
    for k in kata:
        stopwords.add(k)
    for h in hankata:
        stopwords.add(h)
    for k in kazu:
        stopwords.add(k)
    for z in zenkazu:
        stopwords.add(z)
    for k in kazukan:
        stopwords.add(k)
    for m in minialpha:
        stopwords.add(m)
    for b in bigalpha:
        stopwords.add(b)
    for h in han_minialpha:
        stopwords.add(h)
    for h in han_bigalpha:
        stopwords.add(h)
    for h in hiramoji:
        stopwords.add(h)
    for k in katamoji:
        stopwords.add(k)
    for h in han_katamoji:
        stopwords.add(h)
    for k in kanmoji:
        stopwords.add(unicode(k))
    for k in kigou:
        stopwords.add(k)
    for k in kokubu_words:
        stopwords.add(unicode(k))
    print "set([",
    for s in sorted(stopwords):
        print "u\"{0}\",".format(s),
    print "])"

예제 #29

0

파일 보기

파일: japanese.py 프로젝트: vowtk0123/cotoba-agent-oss

 def zenhan_normalize(texts):
     """Normalize character width in two passes.

     ``texts`` is first passed through ``mojimoji.zen_to_han`` with
     ``kana=False`` and the result through ``mojimoji.han_to_zen`` with
     ``digit=False`` and ``ascii=False``.
     """
     return mojimoji.han_to_zen(
         mojimoji.zen_to_han(texts, kana=False),
         digit=False,
         ascii=False,
     )

예제 #30

0

파일 보기

파일: test_textformat_2.py 프로젝트: higenobu/sentiment

import sys
import csv
import mojimoji

args = sys.argv

# Read the tab-separated file named by the first command-line argument,
# pass column 0 of every row through mojimoji.han_to_zen, and print the
# list of [converted column 0, original column 1] pairs.
with open(args[1]) as f:
    ls = list(csv.reader(f, delimiter='\t'))

    result_ls = [mojimoji.han_to_zen(fields[0]) for fields in ls]

    result_rows = [
        [converted, fields[1]] for converted, fields in zip(result_ls, ls)
    ]

    print(result_rows)

예제 #31

0

파일 보기

    def update(self):
        """Advance the frame counter and redraw the menu screen.

        The command window is always drawn; the status, item and magic
        windows, the selection cursors and the item/magic messages are
        drawn only while their respective flags on ``self`` are set.
        While the magic cursor is shown, the player's selected-magic
        attributes are refreshed from the highlighted ``magic_list`` entry.
        """
        self.count += 1

        # Command window.
        pygame.draw.rect(self.screen, BLACK, Rect(50, 20, 140, 140))
        pygame.draw.rect(self.screen, WHITE, Rect(50, 20, 140, 140), 5)
        status_labels = (
            "ＬＶ　　　：", "ＨＰ　　　：", "ＭＰ　　　：", "ちから　　：", "みのまもり：", "ＥＸ　　　："
        )
        status_values = (
            self.player.lv, self.player.hp, self.player.mp, self.player.attack,
            self.player.defence, self.player.exp
        )
        for row, command in enumerate(("つよさ", "どうぐ", "じゅもん")):
            drawChar(self.screen, command, 80, 20 + row * 40)

        self.drawStaGol()

        self.drawTri(60, 39 + self.menu_select_num * 40, self.count)

        # Status window: all labels first, then the values next to them.
        if self.show_statas:
            self.menu_select_num = 0
            pygame.draw.rect(self.screen, BLACK, Rect(200, 20, 255, 250))
            pygame.draw.rect(self.screen, WHITE, Rect(200, 20, 255, 250), 5)
            for row, label in enumerate(status_labels):
                drawChar(self.screen, label, 220, 20 + row * 40)
            for row, value in enumerate(status_values):
                drawChar(self.screen, moji.han_to_zen(str(value)), 340,
                         20 + row * 40)

        # Item window and its cursor / message.
        if self.show_items:
            self.menu_select_num = 1
            self.drawItems()
        if self.item_select_tri:
            self.drawTri(215, 37 + self.item_select_num * 40, self.count)
        if self.use_item_anim:
            used_item = moji.han_to_zen(str(self.use_item[1]))
            drawText(self.screen, "ゆうしゃは　" + used_item + "を　つかった！",
                     "ゆうしゃの　キズが　かいふくした！", "", "")

        # Magic window: at most four entries are visible. Once the list
        # has four or more entries and the cursor index is 4 or higher,
        # the visible window starts at (cursor index - 3).
        if self.show_magics:
            self.menu_select_num = 2
            pygame.draw.rect(self.screen, BLACK, Rect(200, 20, 160, 180))
            pygame.draw.rect(self.screen, WHITE, Rect(200, 20, 160, 180), 5)
            spells = self.player.magic_list
            spell_total = len(spells)
            first_visible = 0
            if spell_total >= 4 and not (self.magic_arrow_num < 4):
                first_visible = self.magic_arrow_num - 3
            for slot in range(spell_total if spell_total < 4 else 4):
                drawChar(self.screen, spells[first_visible + slot][1], 242,
                         28 + 37 * slot)

        # Magic cursor: clamp the drawn cursor row to 3 and copy the
        # highlighted entry's fields onto the player.
        if self.magic_select_tri:
            self.magic_arrow_max_num = len(self.player.magic_list) - 1
            self.magic_arrow_num_correct = (
                3 if self.magic_arrow_num > 3 else self.magic_arrow_num)
            self.drawTri(215, 48 + self.magic_arrow_num_correct * 37,
                         self.count)
            highlighted = self.player.magic_list[self.magic_arrow_num]
            self.player.selected_magic = highlighted[1]
            self.player.selected_magic_mp = int(highlighted[2])
            self.player.selected_magic_damage = int(highlighted[3])
            self.player.selected_magic_heal = int(highlighted[4])
        if self.use_magic_anim:
            cast_name = moji.han_to_zen(str(self.player.selected_magic))
            drawText(self.screen, "ゆうしゃは　" + cast_name + "を　となえた！",
                     "ゆうしゃの　キズが　かいふくした！", "", "")

예제 #32

0

파일 보기

파일: japanese_tokenizer.py 프로젝트: futakw/Twitter_Image_Captioning

 def _normalize_kana(self, text):
     """Return ``text`` passed through ``han_to_zen`` with digit and ascii conversion disabled."""
     widened = han_to_zen(text, digit=False, ascii=False)
     return widened

예제 #33

0

파일 보기

파일: zenhanexchange.py 프로젝트: koichi-ezato/PythonSample

r = mojimoji.zen_to_han(zenAll, digit = False)
print unicode_to_utf8(r)

# 全角アスキー文字以外の全角文字を全て半角に変換
r = mojimoji.zen_to_han(zenAll, ascii = False)
print unicode_to_utf8(r)
print '\r\n----- 全角→半角変換 -----\r\n'

# 半角文字を全て全角文字に変換
print '----- 半角→全角変換 -----\r\n'
print 'target:ｱｲｳabc012\r\n'

hanAll = u'ｱｲｳabc012'

# 半角文字を全て全角文字に変換
r = mojimoji.han_to_zen(hanAll)
print unicode_to_utf8(r)

# 半角カナ以外の半角文字を全て全角に変換
r = mojimoji.han_to_zen(hanAll, kana = False)
print unicode_to_utf8(r)

# 半角数字以外の半角文字を全て全角に変換
r = mojimoji.han_to_zen(hanAll, digit = False)
print unicode_to_utf8(r)

# 半角アスキー文字以外の半角文字を全て全角に変換
r = mojimoji.han_to_zen(hanAll, ascii = False)
print unicode_to_utf8(r)
print '\r\n----- 半角→全角変換 -----\r\n'

예제 #34

0

파일 보기

    def battleAnim(self, screen):
        """Advance the battle animation by one frame and draw the battle UI.

        Args:
            screen: drawing surface; stored on ``self.screen`` and used for
                everything drawn during this call.

        The frame counter ``self.battle_anim_count`` drives the intro:
        frames below 20 alternate a black/white fill every 5 frames, later
        frames show the monster, frames 41-79 show the encounter message,
        frame 80 enables command selection, and frames above 80 show the
        status window. After that, each flag on ``self`` that is set draws
        its window, cursor or message.
        """
        self.battle_anim_count += 1
        self.screen = screen
        if self.battle_anim_count < 20:
            # Floor division: with true division (Python 3) the quotient is
            # a float that is truthy for almost every frame, so the screen
            # would not alternate in 5-frame steps. For int counters ``//``
            # equals Python 2's ``/``.
            if (self.battle_anim_count // 5) % 2:
                self.screen.fill(BLACK)
            else:
                self.screen.fill(WHITE)
        else:
            self.screen.fill(BLACK)
            self.drawMonster()

        if 40 < self.battle_anim_count < 80:
            self.drawText(str(self.monster.name) + "が　あらわれた。", "", "", "")

        if self.battle_anim_count > 80:
            self.drawStatas()

        if self.battle_anim_count == 80:
            self.command_selecting = True
            self.command_select_tri = True
            self.magic_select_tri = True

        if self.command_selecting:
            self.drawComand()
            self.drawMonsterList()

            if self.command_select_tri:
                # The cursor is never drawn below row 3.
                if self.command_arrow_num > 3:
                    self.command_arrow_num_correct = 3
                else:
                    self.command_arrow_num_correct = self.command_arrow_num
                self.drawTri(65, 307 + self.command_arrow_num_correct * 37)

        if self.magic_selecting:
            self.drawMagicList()
            if self.magic_select_tri:
                self.magic_arrow_max_num = len(self.player.magic_list) - 1
                if self.magic_arrow_num > 3:
                    self.magic_arrow_num_correct = 3
                else:
                    self.magic_arrow_num_correct = self.magic_arrow_num
                self.drawTri(205, 307 + self.magic_arrow_num_correct * 37)
                # Copy the highlighted magic_list entry onto the player.
                self.player.selected_magic = self.player.magic_list[
                    self.magic_arrow_num][1]
                self.player.selected_magic_mp = int(
                    self.player.magic_list[self.magic_arrow_num][2])
                self.player.selected_magic_damage = int(
                    self.player.magic_list[self.magic_arrow_num][3])
                self.player.selected_magic_heal = int(
                    self.player.magic_list[self.magic_arrow_num][4])

        if self.monster_selecting:
            self.drawTri(235, 307)

        if self.item_selecting:
            pygame.draw.rect(self.screen, BLACK, Rect(210, 245, 180, 215))
            pygame.draw.rect(self.screen, WHITE, Rect(210, 245, 180, 215), 5)
            for i in range(len(self.player.item_list)):
                drawChar(self.screen,
                         moji.han_to_zen(self.player.item_list[i][1]), 240,
                         243 + i * 40)

        if self.item_select_tri:
            self.drawTri(220, 262 + self.item_select_num * 40)

        if self.use_item_anim:
            drawText(
                self.screen,
                "ゆうしゃは　" + moji.han_to_zen(str(self.use_item[1])) + "を　つかった！",
                "ゆうしゃの　キズが　かいふくした！", "", "")

        if self.guard_anim:
            self.drawText("ゆうしゃは　みをまもっている。", "", "", "")

        if self.escape_success == 1:
            self.drawText("ゆうしゃは　にげだした！", "", "", "")

        if self.escape_success == 2:
            self.drawText("ゆうしゃは　にげだした！", "しかし　まわりこまれてしまった！", "", "")

        if self.attack_player_anim:
            self.drawText("ゆうしゃの　こうげき！", str(self.damage) + "の　ダメージ！", "", "")

        if self.magic_player_anim:
            # NOTE(review): selected_magic is used as an index into
            # magic_list here, but the magic-selection branch above stores
            # element [1] of the entry in it -- confirm that it holds an
            # index by the time this branch runs.
            if self.player.selected_magic_heal == 0:
                self.drawText(
                    "ゆうしゃは　" +
                    self.player.magic_list[self.player.selected_magic][1] +
                    "を　となえた！",
                    str(self.monster.name) + "に　" +
                    str(self.player.selected_magic_damage) + "の　ダメージ！", "", "")
            else:
                self.drawText(
                    "ゆうしゃは　" +
                    self.player.magic_list[self.player.selected_magic][1] +
                    "を　となえた！", "ゆうしゃの　きずが　かいふくした！", "", "")

        if self.attack_monster_anim:
            self.drawText(
                str(self.monster.name) + "の　こうげき！",
                "ゆうしゃに　" + str(self.damage) + "の　ダメージ！", "", "")

        if self.you_defeate:
            self.drawText(
                str(self.monster.name) + "を　やっつけた。",
                str(self.monster.exp) + "ポイントの　けいけんちを　かくとく。",
                str(self.monster.gold) + "ゴールドを　てにいれた。", "")

        # Level-up messages show the difference between the lv_tables row
        # for the current level and the row before it (columns 3, 4, 1 and,
        # in the second message, 2).
        if self.lv_up_anim:
            self.drawText(
                "ゆうしゃは　レベルが　あがった！",
                "ちからが　" + str(self.player.lv_tables[self.player.lv][3] -
                              self.player.lv_tables[self.player.lv - 1][3]) +
                "ポイント　あがった！",
                "みのまもりが　" + str(self.player.lv_tables[self.player.lv][4] -
                                self.player.lv_tables[self.player.lv - 1][4]) +
                "ポイント　あがった！", "さいだいHPが　" +
                str(self.player.lv_tables[self.player.lv][1] -
                    self.player.lv_tables[self.player.lv - 1][1]) +
                "ポイント　あがった！")

        if self.lv_up_anim_2:
            self.drawText(
                "さいだいMPが　" +
                str(self.player.lv_tables[self.player.lv][2] -
                    self.player.lv_tables[self.player.lv - 1][2]) +
                "ポイント　あがった！", "", "", "")

        if self.you_lose:
            self.drawText("ゆうしゃは　しんでしまった！", "しょじきんが　はんぶんになった。", "", "")

예제 #35

0

파일 보기

파일: normalizer.py 프로젝트: sasano8/ja_text_cleaner

 def process(cls, v: str):
     """Widen ``v`` and turn runs of ASCII spaces into one ideographic space.

     ``v`` is passed through ``mojimoji.han_to_zen`` with kana, ascii and
     digit conversion all enabled; afterwards every run of one or more
     ASCII spaces is replaced by a single U+3000.
     """
     widened = mojimoji.han_to_zen(v, kana=True, ascii=True, digit=True)
     return re.sub(" +", "　", widened)

예제 #36

0

파일 보기

파일: calc_user_vec.py 프로젝트: hcpmiyuki/odegather-api-

 def cleansing_unity(self, text):
     """Lower-case ``text`` and unify its character widths.

     The lower-cased text goes through ``mojimoji.zen_to_han`` with
     ``kana=True`` and then through ``mojimoji.han_to_zen`` with
     ``digit=False`` and ``ascii=False``.
     """
     lowered = text.lower()
     narrowed = mojimoji.zen_to_han(lowered, kana=True)
     return mojimoji.han_to_zen(narrowed, digit=False, ascii=False)

예제 #37

0

파일 보기

 def drawText(self, text1, text2, text3, text4):
     """Draw the message frame and up to four lines of text inside it.

     Each argument is passed through ``moji.han_to_zen`` and drawn at
     x=90; the first line is at y=290 and each following line is 33
     pixels lower.
     """
     pygame.draw.rect(self.screen, WHITE, Rect(70, 280, 500, 180), 5)
     for line_no, text in enumerate((text1, text2, text3, text4)):
         self.drawChar(moji.han_to_zen(text), 90, 290 + 33 * line_no)