Exemplo n.º 1
0
def test_cnvk():
    """Time and log a fixed set of cnvk conversions for every test case.

    Each test case returned by ``get_test_cases()`` is read through its
    ``'title'`` and ``'body'`` keys. Every conversion listed below is timed
    with ``calc_time`` and its result is logged at DEBUG level.
    """
    logging.info("=========================================")
    logging.info("=                 cnvk                  =")
    logging.info("=========================================")

    # (label, convert options). Keeping the options in one place guarantees
    # that the timed call and the logged result use the same conversion; the
    # half-width katakana case used to log a KATA2HIRA result by mistake.
    conversions = (
        ("ひらがな(全角) to カタカナ(全角)", (cnvk.HIRA2KATA,)),
        ("カタカナ(全角) to ひらがな(全角)", (cnvk.Z_KATA, cnvk.KATA2HIRA)),
        ("ひらがな(全角) to カタカナ(半角)", (cnvk.HIRA2KATA, cnvk.H_KATA)),
        ("半角 to 全角", (cnvk.Z_ASCII,)),
        ("全角 to 半角", (cnvk.H_ASCII,)),
    )

    for tc in get_test_cases():
        title = tc['title']
        body = tc['body']
        for label, options in conversions:
            logging.info("%s for %s", label, title)
            calc_time(cnvk.convert, body, *options)
            logging.debug("result: %s", cnvk.convert(body, *options))
def sampling(http, cou, number):
    """Fetch a page and append its title and lead paragraphs to the corpus files.

    The page at ``http`` is downloaded, ``<sup>`` elements are removed, and
    the ``firstHeading`` title is normalised with ``cnvk.convert`` and
    ``filt``. When the page has no element with the ``toc`` / ``toclimit-3``
    class nothing is written. Otherwise the title is appended to
    ``Random_Contents.txt`` and ``Random_Lists.txt``, and every ``<p>``
    sibling that precedes that element is appended to ``Random_Contents.txt``.

    Args:
        http: URL passed to ``requests.get``.
        cou: Counter value threaded through ``filt``.
        number: Kept for interface compatibility. The previous implementation
            only added text lengths to its local copy and never returned it,
            so the value is not used.

    Returns:
        ``(cou, False)`` when no table-of-contents element is found, otherwise
        ``(counter returned by the last filt call, response object)``.
    """
    # Character replacements applied on top of the cnvk width conversions.
    replacements = (
        {u"⋯": u"…"}, {u"–": u"―"}, {u"—": u"―"}, {u"-": u"‐"}, {u"-": u"‐"},
        {u"~": u"〜"}, {u"·": u"・"}, {u"⋅": u"・"}, {u" ": u" "}, {u"›": u"〉"},
        {u"‹": u"〈"}, {u"»": u"》"}, {u"«": u"《"}, {u"≥": u"≧"}, {u"≤": u"≦"},
        {u"µ": u"μ"}, {u"〝": u"“"}, {u"〟": u"”"}, {u"⁄": u"/"}, {u"=": u"="},
    )

    html2 = requests.get(http)
    soup2 = BeautifulSoup(html2.content, "html.parser")
    for sup in soup2('sup'):
        sup.extract()
    #[s.replace_with('削除済') for s in soup2(text =re.compile('#'))]

    title_and_trash = soup2.select('[class~=firstHeading]')
    title = title_and_trash[0].get_text()
    title = cnvk.convert(title, cnvk.Z_ASCII, cnvk.Z_KATA, *replacements)
    title, cou2 = filt(title, cou)

    starting_point = soup2.select('[class~=toclimit-3],[class~=toc]')
    if not starting_point:
        # The incoming counter is returned here, not the one filt produced.
        return cou, False

    print(title)
    ti = '\n=' + title + '=\n'
    with codecs.open('Random_Contents' + '.txt', 'a', 'utf-8') as fo:
        fo.write(ti)
    with codecs.open('Random_Lists' + '.txt', 'a', 'utf-8') as fo2:
        fo2.write(ti)

    # find_previous_siblings() yields the nearest paragraph first. Reverse
    # once, after collecting everything, to restore document order; reversing
    # inside the collection loop (as before) scrambled the paragraphs.
    list2 = [k.get_text()
             for k in starting_point[0].find_previous_siblings('p')]
    list2.reverse()

    # One append handle for all paragraphs instead of reopening per line.
    with codecs.open('Random_Contents' + '.txt', 'a', 'utf-8') as fo:
        for line in list2:
            line = cnvk.convert(line, cnvk.ZAC, cnvk.ZK, *replacements)
            line, cou2 = filt(line, cou2)
            fo.write(line)
    return cou2, html2
def TITLEs(url, cou):
    """Return the normalised ``firstHeading`` title of the page at ``url``.

    The page is fetched with ``requests.get``, ``<sup>`` elements are removed,
    and the heading text is passed through ``cnvk.convert`` and then ``filt``.
    Only the text returned by ``filt`` is handed back; its second return value
    is discarded.
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, "html.parser")
    for sup in page('sup'):
        sup.extract()

    heading = page.select('[class~=firstHeading]')[0]

    # Character replacements applied on top of the cnvk width conversions.
    replacements = (
        {u"⋯": u"…"}, {u"–": u"―"}, {u"—": u"―"}, {u"-": u"‐"}, {u"-": u"‐"},
        {u"~": u"〜"}, {u"·": u"・"}, {u"⋅": u"・"}, {u" ": u" "}, {u"›": u"〉"},
        {u"‹": u"〈"}, {u"»": u"》"}, {u"«": u"《"}, {u"≥": u"≧"}, {u"≤": u"≦"},
        {u"µ": u"μ"}, {u"〝": u"“"}, {u"〟": u"”"}, {u"⁄": u"/"}, {u"=": u"="},
    )
    normalised = cnvk.convert(heading.get_text(), cnvk.Z_ASCII, cnvk.Z_KATA,
                              *replacements)
    filtered, _ = filt(normalised, cou)
    return filtered
Exemplo n.º 4
0
 def tf_cnvk(text):
     """Unify half-width characters in ``text`` to full-width.

     Runs ``cnvk.convert`` with the Z_NUM, Z_ALPHA, Z_KIGO and Z_KATA modes so
     the text can be fed into KH Coder.
     """
     # KATA2HIRA was left disabled by the original author.
     full_width_modes = (cnvk.Z_NUM, cnvk.Z_ALPHA, cnvk.Z_KIGO, cnvk.Z_KATA)
     return cnvk.convert(text, *full_width_modes)
Exemplo n.º 5
0
# coding: utf-8
'''
Created on 2014/02/20

@author: nahki
'''

import cnvk

if __name__ == '__main__':
    # Convert every kaomoji dictionary line to full-width and emit one CSV
    # row per entry, skipping an entry equal to the one written just before.
    ex_word = ''
    # Context managers close both files even if a conversion or write fails.
    with open('kaomoji_zenkaku.csv', 'w', 1000) as of, \
            open('kaomoji_jisho.txt', 'r', 1000) as f:
        for line in f:
            word = cnvk.convert(line.upper(), cnvk.Z_NUM, cnvk.Z_ALPHA,
                                cnvk.Z_KIGO, cnvk.Z_KATA).strip()
            if word == ex_word:
                # Consecutive duplicate: report it and write nothing.
                # Single-argument print() behaves the same on Python 2 and 3.
                print(word)
                continue
            # The cost decreases with word length and is floored at -32768.
            cost = int(max(-32768, 6000 - 200 * len(word)**1.3))
            res = u"%s,,,%d,名詞,一般,*,*,*,*,*,*,*,%s\n" % (word, cost,
                                                         u"kaomoji")
            of.write(res)
            ex_word = word
Exemplo n.º 6
0
 def tf_cnvk(text):
     """Return ``text`` with half-width characters unified to full-width.

     The conversion is delegated to ``cnvk.convert`` using the Z_NUM, Z_ALPHA,
     Z_KIGO and Z_KATA modes; the result is meant as input for KH Coder.
     """
     # The KATA2HIRA mode was left disabled by the original author.
     modes = [cnvk.Z_NUM, cnvk.Z_ALPHA, cnvk.Z_KIGO, cnvk.Z_KATA]
     converted = cnvk.convert(text, *modes)
     return converted
Exemplo n.º 7
0
  # Build a dictionary CSV from the Wikipedia title dump: one quoted,
  # full-width entry per accepted title, with a length-based cost.
  fin_name = 'jawiki-latest-all-titles-in-ns0'
  fout_name = 'wikipedia.csv'

  # Titles matching any of these are skipped: only digits/dots/hyphens, a
  # slash followed by four digits and one more character, a leading dot, or
  # any comma (presumably because it would break the CSV row).
  # Compiled once instead of on every input line.
  skip_patterns = [re.compile(p)
                   for p in (r'^[-.0-9]+$', r'/[0-9]{4}.', r'^\.', r',')]

  # Context managers close both files even if a line fails to convert.
  with codecs.open(fin_name, "r", "utf-8") as fin, \
      codecs.open(fout_name, "w", "utf-8") as fout:
    for line in fin:
      word = line.rstrip()
      if len(word) <= 3:
        continue
      if any(p.search(word) for p in skip_patterns):
        continue

      # Single-argument print() behaves the same on Python 2 and 3.
      print(word)

      word = word.upper()
      # Disabled by the original author:
      #     word = word.replace(u'"', u'""')
      #     word = word.replace(u'〜', u'~')

      # Notes on the cost formula (translated from the original author):
      #   score = [-36000.0, -400 * (title.size**1.5)].max.to_i
      # gave good results after a small change. The standard score of nouns
      # in naist-jdic was roughly 6000, so the formula was adjusted to spread
      # the distribution a little further from there towards -32768, the
      # minimum of a signed 16-bit integer:
      #   score = [-32768.0, (6000 - 200 * (title.size**1.3))].max.to_i
      # With this formula, nouns of up to roughly 20 Japanese characters
      # (in utf-8) get distributed.
      cost = int(max(-32768, 6000 - 200*len(word)**1.3))
      word = cnvk.convert(word.replace(u'_', u' '), cnvk.Z_NUM, cnvk.Z_ALPHA, cnvk.Z_KIGO, cnvk.Z_KATA).strip()
      fout.write(u"\"%s\",,,%d,名詞,一般,*,*,*,*,*,*,*,%s\n" % (word, cost, u"Wikipedia"))
Exemplo n.º 8
0
# coding: utf-8
'''
Created on 2014/02/20

@author: nahki
'''

import cnvk

if __name__=='__main__':
    # Read the kaomoji dictionary, unify each line to full-width and write a
    # CSV row for it unless it equals the entry written immediately before.
    ex_word = ''
    # Both files are closed by the context managers, including on errors.
    with open('kaomoji_zenkaku.csv', 'w',1000) as of, \
            open('kaomoji_jisho.txt', 'r', 1000) as f:
        for line in f:
            word = cnvk.convert(line.upper(), cnvk.Z_NUM, cnvk.Z_ALPHA, cnvk.Z_KIGO, cnvk.Z_KATA).strip()
            if word == ex_word:
                # Consecutive duplicate: print it and skip the row.
                # print() with one argument works on Python 2 and 3 alike.
                print(word)
                continue
            # Longer words get a lower cost; the floor is -32768.
            cost = int(max(-32768, 6000 - 200*len(word)**1.3))
            res = u"%s,,,%d,名詞,一般,*,*,*,*,*,*,*,%s\n" % (word, cost, u"kaomoji")
            of.write(res)
            ex_word = word
Exemplo n.º 9
0
def make_stopwords():
    u"""コピペ用ストップワードを作成して表示

    """
    import mojimoji
    import cnvk
    stopwords=set()
    hira=u"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもらりるれろやゐゆゑよわをんがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽぁぃぅぇぉゃゅょっゔ"
    kata=[]
    for h in hira:
        kata.append(cnvk.convert(h,cnvk.HIRA2KATA,cnvk.Z_KATA))
    kata.append(u"ヴ")
    hankata=[]
    for k in kata:
        hankata.append(mojimoji.zen_to_han(k))
    kazu=u"0123456789"
    stopwords.add(u"10")
    stopwords.add(u"11")
    stopwords.add(u"12")
    stopwords.add(u"13")
    stopwords.add(u"14")
    stopwords.add(u"15")
    stopwords.add(u"16")
    stopwords.add(u"17")
    stopwords.add(u"18")
    stopwords.add(u"19")
    stopwords.add(u"20")
    stopwords.add(u"10")
    stopwords.add(u"11")
    stopwords.add(u"12")
    stopwords.add(u"13")
    stopwords.add(u"14")
    stopwords.add(u"15")
    stopwords.add(u"16")
    stopwords.add(u"17")
    stopwords.add(u"18")
    stopwords.add(u"19")
    stopwords.add(u"20")
    zenkazu=mojimoji.han_to_zen(kazu)
    kazukan=u"一二三四五六七八九十百千万億兆"
    minialpha=u"abcdefghijklmnopqlstuvwxyz"
    bigalpha=u"ABCDEFGHIJKLMNOPQLSTUVWXYZ"
    han_minialpha=mojimoji.han_to_zen(minialpha)
    han_bigalpha=mojimoji.han_to_zen(bigalpha)
    hiramoji=[u"する",u"なる",u"てる",u"れる",u"やる",u"いる",u"さん",u"なん",u"くん",u"それ",u"こと",\
              u"ちゃん",u"ある",u"これ",u"して",u"くれる",u"くださる",u"そう",u"せる",u"した",u"いか",\
              u"ので",u"よう",u"てるん",u"もん",u"られる",u"あそこ",u"あたり",u"あちら",u"あっち",u"あと",\
              u"あな",u"あなた",u"あれ",u"いくつ",u"いつ",u"いま",u"いろいろ",u"うち",u"おおまか",u"おまえ",u"おれ",
              u"がい",u"かく",u"かたちの",u"かやの",u"から",u"がら",u"きた",u"こせ",u"ここ",u"こっち",u"こと",u"ごと",\
              u"こちら",u"これ",u"これら",u"ごろ",u"さまざま",u"さらい",u"しかた",u"しよう",u"すか",u"ずつ",u"すね",\
              u"そう",u"そこ",u"そちら",u"そっち",u"そで",u"それ",u"それぞれ",u"それなり",u"たくさん",u"たち",u"たび",\
              u"ため",u"ちゃ",u"てん",u"とおり",u"とき",u"どこ",u"どこか",u"ところ",u"どちら",u"どれ",u"なか",u"なかば",\
              u"なに",u"など",u"なん",u"はじめ",u"はず",u"はるか",u"ひと",u"ひとつ",u"ふく",u"ぶり",u"べつ",u"へん",u"べん",\
              u"ほう",u"ほか",u"まさ",u"まし",u"まとも",u"まま",u"みたい",u"みつ",u"みなさん",u"みんな",u"もと",u"もの",\
              u"もん",u"やつ",u"よう",u"よそ",u"わけ",u"わたし",u"くる",u"すぎる",u"れる",u"いう",u"くださる",u"ちゃう",\
              u"つく",u"せる",u"てるん",u"すぎ",u"ところ",u"おれ",u"ぼく",u"わたし",u"てる",u"しまう",u"みる",
              ]

    katamoji=[]
    for h in hiramoji:
        katamoji.append(cnvk.convert(h,cnvk.HIRA2KATA,cnvk.Z_KATA))
    han_katamoji=[]
    for k in katamoji:
        han_katamoji.append(mojimoji.zen_to_han(k))

    kanmoji=["笑","今","気","今日","明日","方","人","俺","私","僕","時","思う","行く","言う","見る","出す","年","月","日","分","秒","週","火","水","木","金","土","国","都",\
             "道","府","県","市","区","町","村","各","第","何","的","度","達","誰","者","類","用","別","等","際","系","品","化","所","毎","回","匹","個","席","束","歳","円","毎",\
             "前","後","左","右","次","先","春","夏","秋","冬","下記","上記","時間","今回","前回","場合","自分","ヶ所","ヵ所","カ所","箇所","ヶ月","カ月","箇月","名前","本当","確か","時点",\
             "様々","結局","半ば","以前","以後","以降","未満","以上","以下","毎日","自体","何人","手段","感じ","同じ","点","君"]

    h_kigou=cnvk.H_KIGO
    kigou=[]
    for h in h_kigou:
        for x in h:
            kigou.append(x)
    kigou.append(u"ω")
    kigou.append(u'ー')
    kigou.append(u"д")

    #参考 内容推測に適したキーワード抽出のための日本語ストップワード(https://www.jstage.jst.go.jp/article/jjske/12/4/12_511/_pdf)
    kokubu_words=[u"ない",u"高い",u"多い",u"少ない","強い","大きい","小さい","長い","ながい",
                  u"良い",u"よい",u"いい","悪い",
                  u"ある","いる","なる","行く","いく","来る","とる",
                  "見る","みる","言う","いう","得る","過ぎる","すぎる",
                  "する","やる","行なう","行う","おこなう","出来る","できる",
                  "おもう","思う","考える","かんがえる","わかる","見える",
                  "知る","しれる","いえる","示す","述べる","書く","かく","よる",
                  "異なる","違う","ちがう","くらべる",
                  "入れる","出る","でる","入る","はいる",
                  "使う","用いる","もちいる","持つ","もつ","作る","つくる",
                  "なす","起こる","おこる","つく","つける","聞く","よぶ",
                  "かれる","つまり","上","下","次","つぎ",
                  "わが国","自分","人々","人びと","別","他","間","話","例","形","日","家","手","名","身",
                  "そのもの","一つ","あと",

                  #2016/01/24 更に偏在度の高いものと、忘れてたひらがなを追加
                  "きゃ","きゅ","きょ","しゃ","しゅ","しょ","ちゃ","ちゅ","ちょ","にゃ","にゅ","にょ",
                  "ひゃ","ひゅ","ひょ","みゃ","みゅ","みょ","りゃ","りゅ","りょ","ゎ",
                  "事","目","とこ","中","字","お前","全部","きみ","もらう",
                  ]

    for h in hira:
        stopwords.add(h)
    for k in kata:
        stopwords.add(k)
    for h in hankata:
        stopwords.add(h)
    for k in kazu:
        stopwords.add(k)
    for z in zenkazu:
        stopwords.add(z)
    for k in kazukan:
        stopwords.add(k)
    for m in minialpha:
        stopwords.add(m)
    for b in bigalpha:
        stopwords.add(b)
    for h in han_minialpha:
        stopwords.add(h)
    for h in han_bigalpha:
        stopwords.add(h)
    for h in hiramoji:
        stopwords.add(h)
    for k in katamoji:
        stopwords.add(k)
    for h in han_katamoji:
        stopwords.add(h)
    for k in kanmoji:
        stopwords.add(unicode(k))
    for k in kigou:
        stopwords.add(k)
    for k in kokubu_words:
        stopwords.add(unicode(k))
    print "set([",
    for s in sorted(stopwords):
        print "u\"{0}\",".format(s),
    print "])"
Exemplo n.º 10
0
import re
import cnvk
# Load the character list (one entry per line, BOM-tolerant) and expose it
# both as a list and as a set; a newline and a space are added explicitly.
with open("unicode.txt", "r", encoding="utf-8_sig") as f:
    jisx0208 = [line.strip() for line in f]
jisx0208.append('\n')
jisx0208.append(" ")
Jisx0208 = set(jisx0208)

# An earlier, disabled pass applied the same digit masking (without the cnvk
# conversion) from "Wikitexttrueb.full" to "Wikitexttruea.full".

# The original class was '[0-90-9]', i.e. the ASCII range written twice.
# cnvk.Z_ASCII is the half-width -> full-width mode, so digits presumably
# reach the substitution as U+FF10-U+FF19 and were never masked.
# NOTE(review): confirm that Z_ASCII covers digits. Matching both ranges is
# correct either way. Compiled once instead of on every line.
digit_run = re.compile('[0-9\uff10-\uff19]+')

with open("Random_Contentsadjusttrueb.full",
          "r") as f, open("Random_Contentsadjusttrueaver2.full", "a") as g:
    for line in f:
        line = cnvk.convert(line, cnvk.Z_ASCII, cnvk.Z_KATA)
        # Replace every run of digits with a single '*'.
        line = digit_run.sub('*', line)
        # Collapsing repeated '*' was disabled by the original author:
        #line=re.sub("\*[\*]+","*",line)
        g.write(line)
Exemplo n.º 11
0
# coding: utf-8
'''
Created on 2014/02/18

@author: nahki
'''

import MeCab
import cnvk
import unicodedata
from separatewords import MecabTokenize

tagger = MeCab.Tagger("-Ochasen")

# 明日のジェル検。めっちゃ緊張する…。お腹痛い_(´;ω;‘_)⌒)_。ハンドモデルママンにお願いするけど。ママンの爪大丈夫かしら…?。いまさら不安…
text = cnvk.convert(u'明日ママ を 芦田愛菜 に空目。', cnvk.Z_NUM, cnvk.Z_ALPHA, cnvk.Z_KIGO,
                    cnvk.Z_KATA)
res = MecabTokenize.tokenize(text)
print res
for r in res:
    print "result", r
"""
node = tagger.parseToNode(text.encode('utf-8')) #(´・ω・`)
while node:
    #print "%s %s" % (node.surface, node.feature)
    print node.surface, node.feature
    node = node.next
"""
Exemplo n.º 12
0
    # Titles matching any of these are skipped: only digits/dots/hyphens, a
    # slash followed by four digits and one more character, a leading dot,
    # or any comma (presumably because it would break the CSV row).
    # Compiled once instead of on every input line.
    skip_patterns = [re.compile(p)
                     for p in (r'^[-.0-9]+$', r'/[0-9]{4}.', r'^\.', r',')]

    # Context managers close both files even if a line fails to convert.
    with codecs.open(fin_name, "r", "utf-8") as fin, \
            codecs.open(fout_name, "w", "utf-8") as fout:
        for line in fin:
            word = line.rstrip()
            if len(word) <= 3:
                continue
            if any(p.search(word) for p in skip_patterns):
                continue

            # Single-argument print() behaves the same on Python 2 and 3.
            print(word)

            word = word.upper()
            # Disabled by the original author:
            #     word = word.replace(u'"', u'""')
            #     word = word.replace(u'〜', u'~')

            # Notes on the cost formula (translated from the original
            # author):
            #   score = [-36000.0, -400 * (title.size**1.5)].max.to_i
            # gave good results after a small change. The standard score of
            # nouns in naist-jdic was roughly 6000, so the formula was
            # adjusted to spread the distribution a little further from
            # there towards -32768, the minimum of a signed 16-bit integer:
            #   score = [-32768.0, (6000 - 200 * (title.size**1.3))].max.to_i
            # With this formula, nouns of up to roughly 20 Japanese
            # characters (in utf-8) get distributed.
            cost = int(max(-32768, 6000 - 200 * len(word)**1.3))
            word = cnvk.convert(word.replace(u'_', u' '), cnvk.Z_NUM, cnvk.Z_ALPHA,
                                cnvk.Z_KIGO, cnvk.Z_KATA).strip()
            fout.write(u"\"%s\",,,%d,名詞,一般,*,*,*,*,*,*,*,%s\n" %
                       (word, cost, u"Wikipedia"))
def sampling_detail(http, cou, number):
    """Append the section headings and body text of a fetched page to the corpus.

    The response body is parsed, ``<sup>`` and ``<annotation>`` tags and
    elements with the classes ``mw-editsection``, ``gallerybox``,
    ``mbox-text``, ``geo-multi-punct``, ``geo-nondefault``, ``geo-default``
    and ``plainlist`` are removed, and every ``h2`` headline is emitted as
    ``==title==`` followed by its sub-headings, block quotes and text.
    Processing stops at the first notes / references / external-links
    section. The collected lines are printed and appended to
    ``Random_Contents.txt``, followed by one blank line.

    Args:
        http: Response object; only ``http.content`` is read.
        cou: Counter value threaded through every ``filt`` call.
        number: Kept for interface compatibility. The previous implementation
            only added text lengths to its local copy and never returned it,
            so the value is not used.

    Returns:
        The counter as returned by the last ``filt`` call (or ``cou``
        unchanged when nothing was filtered).
    """
    # Character replacements applied on top of the cnvk width conversions.
    replacements = (
        {u"⋯": u"…"}, {u"–": u"―"}, {u"—": u"―"}, {u"-": u"‐"}, {u"-": u"‐"},
        {u"~": u"〜"}, {u"·": u"・"}, {u"⋅": u"・"}, {u" ": u" "}, {u"›": u"〉"},
        {u"‹": u"〈"}, {u"»": u"》"}, {u"«": u"《"}, {u"≥": u"≧"}, {u"≤": u"≦"},
        {u"µ": u"μ"}, {u"〝": u"“"}, {u"〟": u"”"}, {u"⁄": u"/"}, {u"=": u"="},
    )

    def clean(raw, counter):
        # Width-normalise with cnvk, then run the project filter; returns
        # whatever filt returns (unpacked by callers as text and counter).
        return filt(cnvk.convert(raw, cnvk.ZAC, cnvk.ZK, *replacements),
                    counter)

    List = []
    soup2 = BeautifulSoup(http.content, "html.parser")
    for tag_name in ('sup', 'annotation'):
        for s in soup2(tag_name):
            s.extract()
    # '.mw-editsection' is a CSS selector, so it has to go through select();
    # it used to be passed to soup2(...), which matches by tag name and
    # therefore removed nothing.
    for selector in ('.mw-editsection', '.gallerybox', '.mbox-text',
                     '.geo-multi-punct', '.geo-nondefault', '.geo-default',
                     '.plainlist'):
        for s in soup2.select(selector):
            s.extract()

    # Section ids / titles that mark the end of the article body.
    end_ids = ('id="出典"', 'id="注釈"', 'id="脚注"', 'id="註釈"',
               'id="外部リンク"')
    end_titles = ('注釈', '脚注', '註釈')

    # NOTE(review): the element kind is detected below by substring tests on
    # the prettified markup (e.g. 'h2' anywhere in it), not by tag name. That
    # is preserved as-is; confirm whether matching on the tag was intended.
    block = soup2.select('h2 > span[class~=mw-headline]')
    for i in block:
        over = i.prettify()
        if any(marker in over for marker in end_ids):
            break
        item1, cou = clean(i.get_text(), cou)
        if any(marker in item1 for marker in end_titles):
            break
        List.append('\n==' + item1 + '==\n')
        texts = i.find_all_next(
            ['h2', 'h3', 'h4', 'p', 'li', 'dd', 'dt', 'blockquote'])
        overlap = []  # texts already emitted for this section
        # A heading with nothing after it used to raise IndexError here.
        temp2_prev = texts[0].prettify() if texts else ''
        temp2_tx_prev = ''  # text of the most recent block quote
        for j in texts:
            temp2 = j.prettify()
            if 'h2' in temp2:
                # Next top-level section reached.
                break
            elif 'h3' in temp2 and 'mw-headline' in temp2:
                item2, cou = clean(j.select('.mw-headline')[0].get_text(),
                                   cou)
                if item2 not in overlap:
                    List.append('\n===' + item2 + '===\n')
                    overlap.append(item2)
                temp2_prev = temp2
            elif 'h4' in temp2 and 'mw-headline' in temp2:
                item3, cou = clean(j.select('.mw-headline')[0].get_text(),
                                   cou)
                if item3 not in overlap:
                    # One level deeper when the previous element was an h3.
                    marks = '====' if 'h3' in temp2_prev else '==='
                    List.append('\n' + marks + item3 + marks + '\n')
                    overlap.append(item3)
                temp2_prev = temp2
            elif 'blockquote' in temp2:
                text0, cou = clean(j.get_text(), cou)
                if text0 not in overlap:
                    List.append('<block>' + text0 + '<block>\n')
                    overlap.append(text0)
                temp2_tx_prev = text0
                temp2_prev = temp2
            elif '<dt>' in temp2:
                item4, cou = clean(j.get_text(), cou)
                # Skip terms already contained in the last block quote.
                if item4 not in temp2_tx_prev:
                    if item4 not in overlap:
                        marks = '====' if 'h3' in temp2_prev else '==='
                        List.append('\n' + marks + item4 + marks + '\n')
                        overlap.append(item4)
                    temp2_prev = temp2
            elif '<p>' in temp2 or '<li>' in temp2 or '<dd>' in temp2:
                if 'mwe-math-element' not in temp2:
                    text, cou = clean(j.get_text(), cou)
                    # Skip text repeated from the last block quote or
                    # already emitted in this section.
                    if text not in temp2_tx_prev and text not in overlap:
                        List.append(text)
                        overlap.append(text)
                else:
                    # Formulas are replaced by a placeholder line.
                    text = '<math-element>\n'
                    if text not in temp2_tx_prev:
                        List.append(text)
                temp2_prev = temp2

    # The context manager closes the file even if printing or writing fails.
    with codecs.open('Random_Contents' + '.txt', 'a', 'utf-8') as fo:
        for line in List:
            print(line)
            fo.write(line)
        fo.write('\n')
    return cou
Exemplo n.º 14
0
# coding: utf-8
'''
Created on 2014/02/18

@author: nahki
'''

import MeCab
import cnvk
import unicodedata
from separatewords import MecabTokenize

tagger = MeCab.Tagger("-Ochasen")


# 明日のジェル検。めっちゃ緊張する…。お腹痛い_(´;ω;‘_)⌒)_。ハンドモデルママンにお願いするけど。ママンの爪大丈夫かしら…?。いまさら不安…
text = cnvk.convert(u'明日ママ を 芦田愛菜 に空目。',
                     cnvk.Z_NUM, cnvk.Z_ALPHA, cnvk.Z_KIGO, cnvk.Z_KATA)
res = MecabTokenize.tokenize(text)
print res
for r in res:
    print "result",r
"""
node = tagger.parseToNode(text.encode('utf-8')) #(´・ω・`)
while node:
    #print "%s %s" % (node.surface, node.feature)
    print node.surface, node.feature
    node = node.next
"""