Example #1
def tokenize(text):
    t = Tokenizer()
    # テキストの先頭にあるヘッダとフッタを削除
    text = re.split(r'\-{5,}',text)[2]
    text = re.split(r'底本:', text)[0]
    text = text.strip()
    # ルビを削除
    text = text.replace('|', '')
    text = re.sub(r'《.+?》', '', text)
    # テキスト内の脚注を削除
    text = re.sub(r'［＃.+?］', '', text)
    # 一行ずつ処理
    lines = text.split("\r\n")
    results = []
    for line in lines:
        res = []
        tokens = t.tokenize(line)
        for tok in tokens:
            bf = tok.base_form # 基本形
            if bf == "*": bf = tok.surface
            ps = tok.part_of_speech # 品詞情報
            hinsi = ps.split(',')[0]
            if hinsi in ['名詞', '動詞', '形容詞', '記号']:
                res.append(bf)
        l = " ".join(res)
        results.append(l)
    return results
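# A minimal usage sketch for tokenize() above (an addition, not part of the
# original listing). It assumes `import re`, `from janome.tokenizer import Tokenizer`
# and an Aozora Bunko text saved as Shift_JIS, e.g. 'kokoro.txt.sjis' as in Example #32.
if __name__ == '__main__':
    with open('kokoro.txt.sjis', 'rb') as f:
        text = f.read().decode('shift_jis')
    for line in tokenize(text)[:5]:
        print(line)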
Example #2
def separatewords(text):
    separatedWord=[]
    t=Tokenizer()
    tokens=t.tokenize(unicode(text, "utf-8"))
    
    for token in tokens:
        posList=token.part_of_speech.split(",")

        pos1=posList[0]
        if isinstance(pos1, unicode):
          pos1=pos1.encode("utf-8")

        pos2=posList[1]
        if isinstance(pos2, unicode):
          pos2=pos2.encode("utf-8")

        ruby=token.reading
        if isinstance(ruby, unicode):
          ruby=ruby.encode("utf-8")

        if pos1=="名詞":
            if pos2!="接尾" and pos2!="代名詞" and pos2!="非自立" and pos2!="数" and pos2!="形容動詞語幹":
                if ruby!="*":
                    separatedWord.append(token.surface.lower())
                    print token.surface.lower()
                elif pos2!="サ変接続" and len(token.surface)>3:
                    # 英単語に関しては4文字以上の単語を扱う
                    separatedWord.append(token.surface.lower())
                    print token.surface.lower()

    return separatedWord
Example #3
def chunk_with_kanji(istr):
    t = Tokenizer()
    tokens = t.tokenize(istr)

    # give each element flags (jiritsu or fuzoku)
    flags = [judge_jifu(x.part_of_speech) for x in tokens]
    
    surface = [x.surface for x in tokens]

    # split to chunks, delimited by KUGIRI flag
    # very ugly. should be rewritten using tree structure etc.
    cflags = insert_chunkflg(flags)
    rstr = u""
    i = 0
    for j, f in enumerate(flags):
        if i >= len(cflags): break
        if cflags[i] == KUGIRI:
            if f == KUTOU: 
                rstr += surface[j]
                i += 1
            else:
                rstr += u" "
                rstr += surface[j]
                i += 2
        else:
            rstr += surface[j]
            i += 1

    # don't know why this is necessary
    if flags != [] and j == 0 and len(surface) != 1: 
        while j  < len(surface):
            rstr += surface[j]    
            j += 1

    return rstr
Example #4
def text_to_array_ja(textdata, wordtypes):
    textdata = filter(textdata)
    t = Tokenizer()
    tokens = t.tokenize(textdata)
    words = sorted([token.surface
                    for token in tokens
                    if token.part_of_speech.split(',')[0] in wordtypes])
    return words
Example #5
def output_ja_text(data, wordtypes):
    textdata = filter(data)
    t = Tokenizer()
    tokens = t.tokenize(textdata)
    words = sorted([token.surface
                    for token in tokens
                    if token.part_of_speech.split(',')[0] in wordtypes])
    dictionary = count_words(words)
    return pyaml.dump(dictionary, sys.stdout, vspacing=[0, 1])
Example #6
def _tokenize(text):
    from collections import namedtuple
    Token = namedtuple("Token", ["t", "surface", "pos"])

    t = Tokenizer()
    tokens = t.tokenize(text)
    for t in tokens:
        nt = Token(t, t.surface, t.part_of_speech.split(","))
        yield nt
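# A short usage sketch for the _tokenize() generator above (an added example; it
# assumes `from janome.tokenizer import Tokenizer` is available at module level):
for tok in _tokenize(u'すもももももももものうち'):
    print(tok.surface, tok.pos[0])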
Example #7
def makekeywords(text):
    from janome.tokenizer import Tokenizer
    t = Tokenizer()
    tokens = t.tokenize(text)
    keywords = []
    for token in tokens:
        if token.part_of_speech.find("名詞") >= 0 and token.part_of_speech.find("数") == -1 and token.part_of_speech.find("非自立") == -1 and token.part_of_speech.find("接尾") == -1:
            keywords.append(token.surface)
    return keywords
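# A quick check of makekeywords() (an added example; the exact output depends on
# the dictionary bundled with janome):
print(makekeywords(u'東京スカイツリーは東武スカイツリーラインのとうきょうスカイツリー駅が最寄りです。'))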
Example #8
def test_func():
    t = Tokenizer()
    temp = ""
    for token in t.tokenize(u'この腫瘍は間葉系組織から生ずると考えられ、ビメンチンを発現する。'):
        if (not re.search('^(助詞|助動詞|記号)', token.part_of_speech)):
            temp = temp + token.surface
        else:
            temp = temp + token.surface
            print(temp)
            temp = ""
Example #9
 def split(self, text):
     result = []
     t = Tokenizer()
     malist = t.tokenize(text)
     for w in malist:
         sf = w.surface   # 区切られた単語そのまま 
         bf = w.base_form # 単語の基本形
         if bf == '' or bf == "*": bf = sf
         result.append(bf)
     return result
Example #10
    def run(self, force=None):
        print('start')
        # 全サイト取得と重複排除
        sites = {}
        for site in Site.get_all():
            sites[site.url] = site

        # リストに対してignoreとkeywordマッチを排除
        sure = []
        for key in sites:
            site = sites[key]
            response = requests.get(site.subjects_url)
            assert (response.status_code == 200), response.text

            # parse
            data = list(response.text.split('\n'))
            for line in data:
                try:
                    _ = Subject(site, line)
                    sure.append(_)
                except:
                    pass

        print(sure)

        # リスト出力
        t = Tokenizer()
        r = defaultdict(int)
        r2 = defaultdict(list)
        r3 = defaultdict(int)
        for _sure in sure:
            try:
                for token in t.tokenize(_sure.title):
                    if not token_filter(token):
                        r[token.surface] += 1
                        r2[token.surface] += [_sure]
                        r3[token] += 0
            except:
                pass

        # sort
        sure = sorted(sure, key=lambda x: x.title)

        for _sure in sure:
            try:
                point = 0
                for token in t.tokenize(_sure.title):
                    if not token_filter(token):
                        point += r[token.surface]
                if not filter_title(point, _sure):
                    print(_sure.title, _sure.count_res)

            except:
                pass
Example #11
    def understand_move(self, text):
        generator = Tokenizer()
        tokens = []

        for t in generator.tokenize(text):
            tokens.append(t)

        direction = self._understand_direction(tokens)
        distance = self._understand_distance(tokens)

        return direction, distance
Example #12
def main():
    """
    >>> main()
    すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ
    も	助詞,係助詞,*,*,*,*,も,モ,モ
    もも	名詞,一般,*,*,*,*,もも,モモ,モモ
    も	助詞,係助詞,*,*,*,*,も,モ,モ
    もも	名詞,一般,*,*,*,*,もも,モモ,モモ
    の	助詞,連体化,*,*,*,*,の,ノ,ノ
    うち	名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
    """
    t = Tokenizer()
    for token in t.tokenize(u'すもももももももものうち'):
        print(token)
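# The docstring of main() above is a doctest; one way to exercise it
# (an added note, not part of the original listing):
if __name__ == '__main__':
    import doctest
    doctest.testmod()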
Example #13
def get_morphs(string):
    t = Tokenizer()
    dicts=[]
    for token in t.tokenize(unicode(string, 'utf-8')):
        dic = {}
        token_list = str(token).replace("\t", ",").split(",")
        dic["surface"] = token_list[0]
        dic["base"] = token_list[7]
        dic["pos"] = token_list[1]
        dic["pos1"] = token_list[2]

        dicts.append(dic)

    return dicts
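# A Python 3 re-write of get_morphs() (an added sketch, not the original author's
# code): it reads the attributes janome exposes instead of re-parsing str(token).
from janome.tokenizer import Tokenizer

def get_morphs_py3(string):
    t = Tokenizer()
    dicts = []
    for token in t.tokenize(string):
        pos = token.part_of_speech.split(",")
        dicts.append({"surface": token.surface,
                      "base": token.base_form,
                      "pos": pos[0],
                      "pos1": pos[1]})
    return dicts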
Example #14
class JapaneseTokenizer(object):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome "
                              "library: https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))

    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557)
    def to_bytes(self, **exclude):
        return b''

    def from_bytes(self, bytes_data, **exclude):
        return self

    def to_disk(self, path, **exclude):
        return None

    def from_disk(self, path, **exclude):
        return self
Example #15
class MainTranslator(object):
	def __init__ (self):
		self.janome= Tokenizer()
		
	def get_gobi(self, n):
		f = n.part_of_speech.split(',')	
		if n.surface in ['だ','です','た','だろ','ある']:
			if f[0] == '助動詞': 
				return 'ハゲ'
		
		if n.surface in ['無い','ない','ぬ']:
			if f[0] == '助動詞':
				return 'ぬハゲ' 
			if f[0] == '形容詞':
				return 'なしハゲ'
			
	
	def Translator(self, text):
		tokens = self.janome.tokenize(text)
		text = ''
		for n in tokens:
			f = n.part_of_speech.split(',')
			if n.surface in converter:
				text += converter[n.surface]
			elif len(f) > 3:
				gobi = self.get_gobi(n)
				if gobi is not None:
					text += gobi
				else:
					text += n.surface
			else:
				text += n.surface
		
		return text
Example #16
def janome_tokenizer(sentence):
    t = Janome_Tokenizer()
    sentence = sentence.decode("utf-8")
    try:
        tokens = t.tokenize(sentence)
    except:
        try:
            tokens = t.tokenize(sentence.replace(u"\xa0", u"、"))
        except:
            try:
                tokens = t.tokenize(sentence.replace(u"\xa0", u""))
            except:
                print ("Tokenization error at sentence: "+sentence.encode("utf-8"))
                return  [sentence]

    return [dic.surface.encode("utf-8") for dic in tokens]
Example #17
 def __init__(self, cls, nlp=None):
     self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
     try:
         from janome.tokenizer import Tokenizer
     except ImportError:
         raise ImportError("The Japanese tokenizer requires the Janome library: "
                           "https://github.com/mocobeta/janome")
     self.tokenizer = Tokenizer()
Example #18
def do_analysis(analyzed_file):
    result_file_name = "result/" + str(analyzed_file)

    p = re.compile(r'\s(.*)')

    t = Tokenizer()
    
    with open(analyzed_file,mode='r', encoding='utf-8') as read_file:
        texts = read_file.read()
    
    with open(result_file_name, mode='a', encoding='utf-8') as result_file:
        for token in t.tokenize(str(texts)):
            check_word = p.sub('',str(token))
            if not word_match.word_check(check_word):
                result_file.write(str(token) + "\n")
        
    return result_file_name
Example #19
    def __init__(self):
        QWidget.__init__(self)
        self.setupUi(self)
        self.setWindowTitle(
            QApplication.translate("Widget", "%s %s" % (__app_name__, __version__), None, QApplication.UnicodeUTF8))

        self.jacome_token = Tokenizer()
        self.words_container = []
Example #20
def analyze_keyword(posts):
    """
    投稿を形態素解析して頻出ワードで重み付けして
    キーワードから出現数と投稿の逆索引を生成する。
    :param posts: dict{int: Posted}
    :rtype: list(KeywordReverseIndex)
    """
    t = Tokenizer()
    tfidf2 = defaultdict(int)
    tfidf2_post = defaultdict(list)

    # 単語毎の重み付け
    for key in posts:
        post = posts[key]
        for message in post.parse_post_message:
            # Aタグ排除
            soup = BeautifulSoup(message, "lxml")

            # janome
            _prev_token = None
            try:
                for token in t.tokenize(soup.text):
                    # tokenが助詞なら相手しない
                    if final_filter(_prev_token, token):
                        tfidf2[_prev_token.surface + token.surface] += 1
                        if post not in tfidf2_post[_prev_token.surface + token.surface]:
                            tfidf2_post[_prev_token.surface + token.surface] += [post]

                    _prev_token = token

                    # tokenが助詞ならtfidf2の先頭文字から除外
                    if token_is_sub(token):
                        _prev_token = None
            except:
                pass

    # 逆索引の生成
    r_indexes = []
    for key in tfidf2:
        _index = KeywordReverseIndex(key, tfidf2[key], tfidf2_post[key])

        # 出現数が一定以上のキーワードのみindexを生成する
        if _index.is_enable:
            r_indexes.append(_index)
    return r_indexes
Example #21
def callback():
    messages = request.json['result']

    for message in messages:
        text = message['content']['text']
        for matcher, action in commands:
            if matcher.search(text):
                response = action(text)
                break
        else:
            post_text(message['content']['from'], '解析中...')
            # 形態素解析
            response = ''
            t = Tokenizer()
            for token in t.tokenize(message['content']['text']):
                response += str(token) + '\n'
        post_text(message['content']['from'], response)
    return ''
Example #22
def add_yomi(string):
    t = Tokenizer()
    tokens = t.tokenize(string)

    rstring = ''
    for token in tokens:
        s = token.surface
        r = token.reading

        while True:
            res = split_at_hiragana(s, r)
            if len(res) > 2:
                rstring += create_yomi(res[0], res[1])
                s, r = res[2], res[3]
            else:
                break

        rstring += create_yomi(res[0], res[1])

    return rstring
Example #23
class JapaneseTokenizer(object):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome library: "
                              "https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
Example #24
class MyWindow(QWidget, Ui_window):
    loaded_words = Signal(str)
    refresh_words = Signal()

    def __init__(self):
        QWidget.__init__(self)
        self.setupUi(self)
        self.setWindowTitle(
            QApplication.translate("Widget", "%s %s" % (__app_name__, __version__), None, QApplication.UnicodeUTF8))

        self.jacome_token = Tokenizer()
        self.words_container = []

    @Slot()
    def analysis_janome(self, s):
        # デバッグ用に解析文章をstdout
        print s

        # 解析結果の表示要素を初期化(クリア)
        self.refresh_words.emit()
        # 解析結果格納配列を初期化(csv保存用)
        self.words_container = []
        # 形態素解析を実行
        tokens = self.jacome_token.tokenize(s)

        for token in tokens:
            # 解析結果をstring(UNICODE)型へキャスト
            print_str = str(token).decode('utf8')
            # csv保存用に解析結果を格納
            self.words_container.append(print_str)

        # 解析結果を出力
        self.loaded_words.emit('\n'.join(self.words_container))

    @Slot()
    def save_csv(self):
        filename = 'result.csv'
        filename = os.path.normpath(filename)

        # OSを判定してエンコードを設定
        if os.name == 'nt':
            code = 'cp932'
        else:
            code = 'utf-8'
        print 'save_csv: code = %s' % code

        with open(filename, 'wb') as f:  # 'wb'じゃないと変な改行入る。
            writer = csv.writer(f, delimiter=',')
            for words in self.words_container:
                out_word = words.encode(code)
                writer.writerow([out_word])
Example #25
    def __init__(self, text):
        # テキストをいい感じにする
        if isinstance(text, str):
            text = text.decode("utf-8")
        # 半角記号を削除
        half_symbol = re.compile("[!-/:-@[-`{-~]")
        text = half_symbol.sub("", text)

        self.text = text

        self.t = Tokenizer(wakati=True)

        # 生成する文章の数を指定
        self.sentence_num = 5
        # 生成する文章の文字数の大ざっぱな上限
        self.stop_length = 110
Example #26
class Mave(object):
    def __init__(self, name=u'メイ'):
        self.name = name
        self.msg_que = Queue()
        self.tokenizer = Tokenizer()

        self.markov = Markov(ngram=2)

    def wakeUp(self):
        try:
            self.markov.load(u'mave_%s.json' % self.name)
        except Exception as e:
            print 'markov load failure'
            print e
            self.markov = Markov(ngram=2)

    def goToBed(self):
        self.markov.save(u'mave_%s.json' % self.name)


    def listenTo(self, message, talker):
        tokens = self.tokenizer.tokenize(message.decode('utf-8'))
        for tok in tokens:
            print '%10s (%10s) ... %s' % (tok.surface, tok.reading, tok.part_of_speech)

        self.markov.learn(tokens)

        meishi_list = [tok.surface for tok in tokens 
                       if u'名詞' in tok.part_of_speech.split(',') and
                          ((u'一般' in tok.part_of_speech.split(',') and  not u'あ' <= tok.surface[0] <= u'ん')
                           or u'固有名詞' in tok.part_of_speech.split(','))]

        key = random.choice(meishi_list) if len(meishi_list) != 0 else None
        rsp = self.markov.generate(key)
        if rsp != None:
            self.msg_que.put(rsp)
        else:
            self.msg_que.put('はいはい > %s' % talker)

    def speak(self):
        if self.msg_que.empty():
            return None

        return self.msg_que.get()
Example #27
 def __init__(self):
     self.tokenizer = Tokenizer(wakati=True)
     self.excludes = ["。", "、", "(", ")"]
     self.exclude_nodes = ["cite", "script", "style"]
Example #28
import os, sys
from pathlib import Path
import csv, json
import pandas as pd
import glob
from janome.tokenizer import Tokenizer

fo_names = ['test_txt', 'csv']

for i in fo_names:
    try:
        os.makedirs('../{}'.format(i))
    except:
        continue


file_number = 0
t = Tokenizer()
path = Path(sys.argv[1] if len(sys.argv) >= 2 else '.')
for path_in in [x for x in path.glob('*.txt') if x.is_file()]:
    path_out = path_in.with_suffix('.txt')
    path_csv = path_in.with_suffix('.csv')
    file = open(path_in, 'r')
    file_number += 1
    bunsyou = file.readlines()
    syori_bunsyou = []
    moji = []
    mojisuu = 0

    for i in range(len(bunsyou)):
        bunsyou[i] = bunsyou[i].strip()
        syori_bunsyou.append(bunsyou[i].strip())
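    # (The listing breaks off here. A plausible continuation, judging from the
    #  Tokenizer created above and the csv/ output folder prepared earlier —
    #  this part is an assumption, not the original code:)
    for sentence in syori_bunsyou:
        for token in t.tokenize(sentence):
            moji.append(token.surface)
            mojisuu += 1
    with open('../csv/{}.csv'.format(file_number), 'w', encoding='utf-8', newline='') as f_csv:
        csv.writer(f_csv).writerow(moji)
    file.close()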
Example #29
class Dictionary:
    """
    思考エンジンのクラス。

    クラス変数:
    DICT_RANDOM -- ランダム辞書のファイル名。
    DICT_PATTERN -- パターン辞書のファイル名。
    TOKENIZER -- 形態素解析ツールjanomeの分析オブジェクト

    プロパティ:
    random -- ランダム辞書
    pattern -- パターン辞書
    """

    DICT_RANDOM = 'dics/random.txt'
    DICT_PATTERN = 'dics/pattern.txt'

    TOKENIZER = Tokenizer()

    def __init__(self):
        """
        ファイルからの辞書の読み込みを行う。
        """
        with open(Dictionary.DICT_RANDOM, encoding='utf-8') as f:
            self._random = [x for x in f.read().splitlines() if x]

        with open(Dictionary.DICT_PATTERN, encoding='utf-8') as f:
            self._pattern = [
                Dictionary.make_pattern(l) for l in f.read().splitlines() if l
            ]

    def study(self, text):
        """
        ランダム辞書、パターン辞書をメモリに保存する。
        """
        self.study_random(text)
        self.study_pattern(text, Dictionary.analyze(text))

    def study_random(self, text):
        """
        ユーザの発言textをメモリに保存する。
        すでに同じ発言があった場合は何もしない。
        """
        if not text in self._random:
            self._random.append(text)

    def study_pattern(self, text, parts):
        """
        ユーザの発言textを形態素partsに基づいてパターン辞書に保存する。
        """
        for word, part in parts:
            if self.is_keyword(part):  # 品詞が名詞であれば学習。
                # 単語の重複チェック
                # 同じ単語で登録されていれば、パターンを追加する
                # 無ければ新しいパターンを作成する
                duplicated = next(
                    (p for p in self._pattern if p['pattern'] == word), None)
                if duplicated:
                    if not text in duplicated['phrases']:
                        duplicated['phrases'].append(text)
                else:
                    self._pattern.append({'pattern': word, 'phrases': [text]})

    def save(self):
        """
        メモリ上の辞書をファイルに保存する。
        """
        with open(Dictionary.DICT_RANDOM, mode='w', encoding='utf-8') as f:
            f.write('\n'.join(self.random))

    @staticmethod
    def make_pattern(line):
        """
        文字列lineを\tで分割し、{'pattern':[0], 'phrases':[1]}の形式で返す。
        """
        pattern, phrases = line.split('\t')
        if pattern and phrases:
            return {'pattern': pattern, 'phrases': phrases.split('|')}

    @staticmethod
    def analyze(text):
        """
        文字列を形態素解析し、[(surface, parts)]の形にして返す。
        """
        return [(t.surface, t.part_of_speech)
                for t in Dictionary.TOKENIZER.tokenize(text)]

    @staticmethod
    def pattern_to_line(pattern):
        """
        パターンのハッシュを文字列に変換する。
        """
        return '{}\t{}'.format(pattern['pattern'],
                               '|'.join(pattern['phrases']))

    @staticmethod
    def is_keyword(part):
        """
        品詞partが学習すべきキーワードであるかどうか真偽値で返す。
        """
        return bool(re.match(r'名詞,(一般|代名詞|固有名詞|サ変接続|形容動詞語幹)', part))

    @property
    def random(self):
        """
        ランダム辞書
        """
        return self._random

    @property
    def pattern(self):
        """
        パターン辞書
        """
        return self._pattern
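# A minimal usage sketch for the Dictionary class above (an addition; it assumes
# the files dics/random.txt and dics/pattern.txt exist, as the class itself requires):
if __name__ == '__main__':
    d = Dictionary()
    d.study('週末は富士山に登りました')
    print(d.pattern[-1] if d.pattern else 'no pattern learned')
    d.save()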
Example #30
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from janome.tokenizer import Tokenizer


#参考にしたページ・ソースコード
#https://blog.amedama.jp/entry/tf-idf


#tokenizerの初期化
janome_tokenizer  = Tokenizer()

def text_morpheme( text, part = "", part2 = ""):
    """janomeで形態素に分ける

    Arguments:
        text {[type]} -- 形態素に分ける文字列

    Keyword Arguments:
        part {str} -- 取得する品詞を指定(品詞の設定がない場合はすべて取得)
        part2 {str} -- サ変名詞などの2つ目の品詞

    Returns:
        [type] -- 形態素に分けた結果(リストで返す)
    """
    text_list = []
    for token in janome_tokenizer.tokenize(text):
        #print(token.part_of_speech)
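        # (The listing is cut off here. A plausible completion based on the
        #  docstring and the part / part2 parameters — an assumption, not the
        #  original author's code:)
        pos = token.part_of_speech.split(',')
        if part == "" or (pos[0] == part and (part2 == "" or pos[1] == part2)):
            text_list.append(token.surface)
    return text_list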
Example #31
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
import sys
from io import open

PY3 = sys.version_info[0] == 3

print(u'Tokenize (stream mode)')
t = Tokenizer(mmap=True)

with open('text_lemon.txt', encoding='utf-8') as f:
    text = f.read()
    if not PY3:
        text = unicode(text)
    for token in t.tokenize(text, stream=True):
        print(token)
Example #32
from janome.tokenizer import Tokenizer
from gensim.models import word2vec
import re

# テキストファイルの読み込み
bindata = open('kokoro.txt.sjis', 'rb').read()
text = bindata.decode('shift_jis')

# テキストの先頭にあるヘッダーとフッターを削除
text = re.split(r'\-{5,}', text)[2]
text = re.split(r'底本:', text)[0]
text = text.strip()

# 形態素解析
t = Tokenizer()
results = []
# テキストを一行ずつ処理する
lines = text.split("\r\n")
for line in lines:
    s = line
    s = s.replace('|', '')
    s = re.sub(r'《.+?》', '', s)  # ルビを削除
    s = re.sub(r'［＃.+?］', '', s)  # 入力注を削除
    tokens = t.tokenize(s)  # 形態素解析
    # 必要な語句だけを対象とする
    r = []
    for tok in tokens:
        if tok.base_form == "*":  # 単語の基本形を採用
            w = tok.surface
        else:
            w = tok.base_form
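        # (The listing breaks off here. A plausible continuation, mirroring the
        #  POS filter of Example #1 and the word2vec training that this script's
        #  imports point to — an assumption, not the original code:)
        if tok.part_of_speech.split(',')[0] in ['名詞', '動詞', '形容詞', '記号']:
            r.append(w)
    results.append(' '.join(r))

wakati_file = 'kokoro.wakati'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(results))

data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
model.save('kokoro.model')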
Example #33
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from janome.tokenizer import Tokenizer

docs = ["これはペンです", "私はあほです", "俺は男です", "あなたはサルです"]

token = []  # 各docsの分かち書きした結果を格納するリストです
training_docs = []  # TaggedDocumentを格納するリストです

for i in range(4):

    # docs[i] を分かち書きして、tokenに格納します
    t = Tokenizer()
    token.append(t.tokenize(docs[i], wakati=True))

    # TaggedDocument クラスのインスタンスを作成して、結果をtraining_docsに格納します
    # タグは "d番号"とします
    training_docs.append(TaggedDocument(words=token[i], tags=["d" + str(i)]))

# 以下に回答を作成してください
#-------------------------------------------------------
model = Doc2Vec(documents=training_docs, min_count=1)

#-------------------------------------------------------

for i in range(4):
    print(model.docvecs.most_similar("d" + str(i)))
Example #34
 def __init__(self):
     self._tokenizer = Tokenizer()
Example #35
 def __init__(self, text):
     self.text = text
     self.t = Tokenizer()
     self.tokens = self.t.tokenize(self.text)
     for i in self.tokens:
         print(i)
Example #36
from janome.tokenizer import Tokenizer
from gensim.models import word2vec
import re

with open('sample.txt', 'r') as file:
    text = file.read()

t = Tokenizer()
results = []

lines = text.split("\n")
for line in lines:
    s = line
    s = re.sub(r'[a-zA-Z\d ]+', '', s)
    s = re.sub(r'[年月]*日', '', s)
    s = re.sub(r'[\(\):;]', '', s)
    tokens = t.tokenize(s)

    # collect this line's surface forms and keep the line as one wakati string,
    # so the join into `results` below actually has data to write out
    r = []
    for tok in tokens:
        w = tok.surface
        r.append(w)
    results.append(" ".join(r))

print(results)

wakati_file = 'kokoro.wakati'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write("\n".join(results))

data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
Example #37
class word:
    def __init__(self):
        self.t = Tokenizer()
        self.routine()

    # db/db.txtからjsonデータを取得し、ディクショナリ化する。
    def get_data(self):
        data = from_txt('./db', 'db.txt')
        data = json.loads(data)
        self.text_db[
            'cookpad_search'] = data['food_name'] + data['ingredients']

    def get_db(self):  # もっと簡単なものに。
        self.text_db = {
            'cookpad':
            'パスタを作りたい.パスタを食べたい.お腹が空いた.おいしいものが食べたい.今日のご飯何にしよう.料理を作りたい.弁当を作りたい.ピザを作りたい.ハンバーグを作りたい.辛い物が食べたい.甘いものが食べたい.塩分を取りたい.デザートを食べたい.簡単に作れるランチを知りたい.簡単なディナーを知りたい.お勧めの料理を知りたい.卵を使った料理を知りたい',
            'youtube':
            '動画を観たい.youtube を使いたい.ドラマを観たい.面白い映像を観たい.映画を観たい.音楽が聞きたい.怖い動画を見たい.急上昇1位の動画を見たい.犬の動画を見たい.猫の動画を見たい.お勧めの動画.急上昇中の動画を教えてほしい.簡単に作れるご飯の動画が見たい.眠れる動画を見たい.面接の動画.簡単な料理の動画を見たい.美味しい料理の動画を見たい.コロナについて知りたい',
            'study':
            'CCNAの勉強頑張ります.学校の勉強の仕方を知りたい.仕事で活躍する知識を身に付けたい.資格の勉強をする.基本情報試験の勉強を頑張る.音楽の勉強をしたい.免許を取りたい.英語を学びたい.ネットワークの知識を身に付けたい.セキュリティについて学びたい.アンドロイドアプリを作りたい.pythonを身に付けたい.Linuxの勉強をしたい.課題を終わらせたい'
        }  #listのlist

    def get_tokenized_db(self):
        self.texts = []  # t.tokenize(text)の結果はlist
        self.theme = []
        for theme, text in self.text_db.items():
            self.texts.append(get_words_list(text))
            self.theme.append(theme)

    def get_dictionary(self):
        self.dictionary = corpora.Dictionary(self.texts)

    def get_feature_count(self):  # tokenに何回ループしたかをカウントする。
        self.feature_cnt = len(self.dictionary.token2id)

    def get_corpus(self):  # ベクトルを作る。
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]

    def create_tfidf(self):
        self.tfidf = models.TfidfModel(self.corpus)

    def get_index(self):
        self.index = similarities.SparseMatrixSimilarity(
            self.tfidf[self.corpus], num_features=self.feature_cnt)

    def get_keyword_vector(self, user_input):
        self.kw_vector = self.dictionary.doc2bow(
            [token.surface for token in self.t.tokenize(user_input)])

    def get_similarity(self):
        return self.index[self.tfidf[self.kw_vector]]

    def routine(self):
        self.get_db()
        self.get_data()
        self.get_tokenized_db()
        self.get_dictionary()
        self.get_feature_count()
        self.get_corpus()
        self.create_tfidf()
        self.get_index()
Example #38
import pandas as pd
from janome.tokenizer import Tokenizer
import re
import math

query = '吾輩は猫である'
query_words = [
    token.surface for token in Tokenizer().tokenize(query)
    if not re.fullmatch(r"[あ-ん]|、|。| ", token.surface)
]
query_file = 'query'

arr = [line.strip().split("\t") for line in open('../index/index2.txt', 'r')]

idf_scores = {a[0]: float(a[3]) for a in arr}
tfidf_scores = {w: {} for w in idf_scores}
for a in arr:
    tfidf_scores[a[0]][a[1]] = float(a[4])
tfidf_table = pd.DataFrame(tfidf_scores).fillna(0)

query_tf = {w: 0 for w in idf_scores}
for w in query_tf:
    for q in query_words:
        if w == q: query_tf[w] += 1
query_tfidf = {
    w: {
        query_file: query_tf[w] * idf_scores[w]
    }
    for w in idf_scores
}
query_table = pd.DataFrame(query_tfidf)
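# (An added sketch of the step this script is presumably building towards:
#  ranking the indexed documents by cosine similarity against the query vector.
#  The helper below is an assumption, not part of the original listing.)
import numpy as np

def cosine(u, v):
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0

q = query_table.loc[query_file].reindex(tfidf_table.columns).fillna(0)
scores = {doc: cosine(q.values, tfidf_table.loc[doc].values)
          for doc in tfidf_table.index}
for doc, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(doc, round(score, 4))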
Example #39
#AI-TECHGYM-1-10-A-3
#自然言語処理

#インポート
from gensim.models import Word2Vec
from janome.tokenizer import Tokenizer

#形態素解析のオブジェクト
text = Tokenizer()

#txtファイルからデータの読み込み
text_file = open("techgym-AI.txt")
txt = text_file.read()
 
#読み込んだデータを形態素解析
results = []
lines = txt.split("\r\n")
for i in lines:
    text_c = text.tokenize(i,wakati=True)
    results.append(text_c)

#モデル
model = Word2Vec(results, min_count=1)
vector = model.wv['プログラミング']

#ベクトル表現
print(vector)

pro = model.wv.most_similar(positive=['プログラミング'], topn=5)
#pro = model.wv.similar_by_vector('プログラミング') 
#pro = model.wv.similar_by_word('プログラミング') 
Example #40
# 対象ファイルをダウンロード
url   = 'http://www.aozora.gr.jp/cards/000081/files/456_ruby_145.zip'
local = '456_ruby_145.zip'

if not os.path.exists(local):
    print('zipファイルをダウンロード')
    req.urlretrieve(url, local)

# zipファイルの内のテキストファイルを取得
zf      = zipfile.ZipFile(local, 'r')
fp      = zf.open('gingatetsudono_yoru.txt', 'r')
bindata = fp.read()
txt     = bindata.decode('shift_jis')

# 形態素解析
t        = Tokenizer()
word_dic = {}
lines    = txt.split('\r\n')

for line in lines:
    malist = t.tokenize(line)
    for w in malist:
        word = w.surface
        ps   = w.part_of_speech # 品詞
        if ps.find('名詞') < 0: continue # 名詞だけほしいのでそれ以外はスキップ
        if not word in word_dic:
            word_dic[word] = 0
        word_dic[word] += 1

keys = sorted(word_dic.items(), key=lambda x:x[1], reverse=True)
for word, cnt in keys[:50]:
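    # (The loop body is cut off in the listing; presumably it prints the 50 most
    #  frequent nouns, e.g.:)
    print(word, cnt)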
Example #41
class TextTokenizer:
    def __init__(self):
        self._tokenizer = Tokenizer()

    def __call__(self, text: str) -> List[str]:
        # wrap in list() so the annotated List[str] holds even when tokenize() yields a generator
        return list(self._tokenizer.tokenize(text, wakati=True))
Example #42
 def __init__(self):
     self.janome_tokenizer = Tokenizer()
     self.exc_part_of_speech = {"名詞": ["非自立", "代名詞", "数"]}
     self.inc_part_of_speech = {"名詞": ["サ変接続", "一般", "固有名詞"]}
Example #43
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer

print(u'Tokenize (system dictionary)')
t = Tokenizer()
for token in t.tokenize(u'すもももももももものうち'):
  print(token)

print('')
print(u'Tokenize (mmap system dictionary)')
t = Tokenizer(mmap=True)
for token in t.tokenize(u'すもももももももものうち'):
  print(token)

print('')
print(u'Tokenize (wakati mode)')
for token in t.tokenize(u'すもももももももものうち', wakati = True):
  print(token)

print('')
print(u'Tokenize with user dictionary')
t = Tokenizer("user_ipadic.csv", udic_enc="utf8")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
  print(token)

print('')
print(u'Tokenize with user dictionary (wakati mode)')
t = Tokenizer("user_ipadic.csv", udic_enc="utf8")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。', wakati = True):
  print(token)
Example #44
 def __init__(self, user_dic_path='', user_dic_enc='utf8'):
     self._t = Tokenizer(udic=user_dic_path, udic_enc=user_dic_enc)
Example #45
 def __init__(self):
     self.t = Tokenizer()
     self.routine()
Example #46
# coding: utf-8
from janome.tokenizer import Tokenizer
import os, re, json, random

dict_file = "./static/js/chatbot_data.json"
dic = {}
tokenizer = Tokenizer()

def make_sentence(head):
    if not head in dic: return ""
    ret = []
    if head != "@": ret.append(head)
    top = dic[head]
    w1 = word_choice(top)
    w2 = word_choice(top[w1])
    ret.append(w1)
    ret.append(w2)
    while True:
        if w1 in dic and w2 in dic[w1]:
            w3 = word_choice(dic[w1][w2])
        else:
            w3 = ""
        ret.append(w3)
        if w3 == "。" or w3 == "?" or w3 == "": break
        w1, w2 = w2, w3
    return "".join(ret)

def word_choice(sel):
    keys = sel.keys()
    return random.choice(list(keys))
Example #47
# 解析するテキストファイルを読み込む
sjis_file = 'kokoro.txt.sjis'
dic_file = 'markov-kokoro.json'

if not os.path.exists(dic_file):
    # 辞書ファイル(単語単位のngram情報)を作成する
    sjis = open(sjis_file, 'rb').read()
    text = sjis.decode('shift_jis')
    # 不要な部分を削除する
    text = re.split(r'\-{5,}', text)[2]  # ヘッダを削除
    text = re.split(r'底本:', text)[0]  # フッタを削除
    text = text.strip()
    text = text.replace('|', '')  # ルビの開始記号を削除
    text = re.sub(r'《.+?》', '', text)  # ルビを削除
    text = re.sub(r'［＃.+?］', '', text)  # 入力注を削除

    # 形態素解析して辞書ファイルを作成
    t = Tokenizer()
    words = t.tokenize(text)
    dic = make_dic(words)
    json.dump(dic, open(dic_file, 'w', encoding='utf-8'))
else:
    dic = json.load(open(dic_file, 'r'))

# 作文する
for i in range(3):
    s = make_sentence(dic)
    print(s)
    print('---')
Example #48
""" utils/text_tools.py

テキスト処理のユーティリティ
"""
import re
import unicodedata

from django.core.validators import validate_email

from janome.tokenizer import Tokenizer

DEFAULT_TOKENIZER = Tokenizer()


def shortnate(string, length):
    """ 文字列が既定の長さ以上だった場合に規定の長さまでで残りを省略とする """
    return string if len(string) <= length else string[:length - 4] + '...'


def get_words(text, customdict=None):
    """ 与えられたテキストを形態素解析して、含まれる名詞のリストを返す """

    def _filter(s):
        """ 名詞だけにフィルタリングする """
        reg = re.compile(r'名詞')
        ignore_reg = re.compile(r'非自立')
        if (reg.search(s.part_of_speech) and
                not ignore_reg.search(s.part_of_speech)):
            return True

    if customdict:
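        # (The listing breaks off here; a plausible completion, assuming
        #  `customdict` is a path to a user dictionary in MeCab IPADIC format —
        #  this part is an assumption, not the original code:)
        tokenizer = Tokenizer(customdict, udic_enc='utf8')
    else:
        tokenizer = DEFAULT_TOKENIZER
    return [t.surface for t in tokenizer.tokenize(text) if _filter(t)]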
Example #49
from janome.tokenizer import Tokenizer

t = Tokenizer()
tokens = t.tokenize("Hello World きょうはいい天気ですね。")
for tok in tokens:
    print(tok)

print(tok.part_of_speech)
Example #50
def doushi(honorific):
    t = Tokenizer()
    tokens = t.tokenize(honorific)
    for token in tokens:
            # 品詞を取り出し
        partOfSpeech = token.part_of_speech.split(',')[0]
        if partOfSpeech == "動詞":
            df = pd.read_csv('doushi.csv', encoding='utf_8', names=["見出し語","尊敬語","謙譲語","丁寧語"], usecols=[0,1,2,3],skiprows=[0], skipfooter=0, engine='python')
            df= df.replace({'\n': '<br>'}, regex=True)
            df= df.replace({'\r': ''}, regex=True)
            df = df[df['見出し語']==token.surface]
            #.emptyでCSVに入力されてない見出し語の場合に以下を出力
            if df.empty:
                response_empty='<font color="red">ご指定の語句には対応しておりません</font>'
                return response_empty
            
            # if honorific==token.surface:
                
                

            #尊敬語配列
            son=df["尊敬語"].to_string(index=False).replace("\n","").replace("NaN","").replace("'","")
            s=[son]
            #謙譲語配列
            ken=df["謙譲語"].to_string(index=False).replace("\n","").replace("NaN","")
            k=[ken]
            #丁寧語配列
            tei=df["丁寧語"].to_string(index=False).replace("\n","").replace("NaN","")
            t=[tei]
            return s,k,t

            # response_string=df.drop("見出し語",axis=1).to_string(index=False)
            # response_string={df.drop("見出し語",axis=1).to_string(index=False).replace("尊敬語","").replace("謙譲語","").replace("丁寧語","")}
            # return response_string
            #pprint.pprint(df.drop("見出し語",axis=1).to_string(index=False).replace("尊敬語","").replace("謙譲語","").replace("丁寧語",""))
        
        elif partOfSpeech =='名詞':
            ds = pd.read_csv('meishi.csv', encoding='utf_8', names=["見出し語","尊敬語","謙譲語","丁寧語"], usecols=[0,1,2,3], skiprows=[0], skipfooter=0, engine='python')
            ds= ds.replace({'\n': '<br>'}, regex=True)
            ds= ds.replace({'\r': ''}, regex=True)
            ds=ds[ds['見出し語']==(token.surface)]
            if ds.empty:
                response_empty='<font color="red">ご指定の語句には対応しておりません</font>'
                return response_empty
            #尊敬語配列
            son=ds["尊敬語"].to_string(index=False).replace("\n","").replace("NaN","").replace("'","")
            s=[son]
            #謙譲語配列
            ken=ds["謙譲語"].to_string(index=False).replace("\n","").replace("NaN","")
            k=[ken]
            #丁寧語配列
            tei=ds["丁寧語"].to_string(index=False).replace("\n","").replace("NaN","")
            t=[tei]
            return s,k,t
            # response_string=ds.drop("見出し語", axis=1).to_string(index=False).replace("尊敬語","").replace("謙譲語","").replace("丁寧語","")
            # return response_string
        
        elif partOfSpeech =='助詞':
            ds = pd.read_csv('zyoshi.csv', encoding='utf_8', names=["見出し語","尊敬語","謙譲語","丁寧語"], usecols=[0,1,2,3], skiprows=[0], skipfooter=0, engine='python')
            ds= ds.replace({'\n': '<br>'}, regex=True)
            ds= ds.replace({'\r': ''}, regex=True)
            ds=ds[ds['見出し語']==(token.surface)]
            if ds.empty:
                response_empty='<font color="red">ご指定の語句には対応しておりません</font>'
                return response_empty

            #尊敬語配列
            son=ds["尊敬語"].to_string(index=False).replace("\n","").replace("NaN","").replace("'","")
            s=[son]
            #謙譲語配列
            ken=ds["謙譲語"].to_string(index=False).replace("\n","").replace("NaN","")
            k=[ken]
            #丁寧語配列
            tei=ds["丁寧語"].to_string(index=False).replace("\n","").replace("NaN","")
            t=[tei]
            return s,k,t
            # response_string=ds.drop("見出し語", axis=1).to_string(index=False).replace("尊敬語","").replace("謙譲語","").replace("丁寧語","")
            # return response_string

        else:
            if honorific:
                df = pd.read_csv('doushi.csv', encoding='utf_8', names=["見出し語","尊敬語","謙譲語","丁寧語"], usecols=[0,1,2,3],skiprows=[0], skipfooter=0, engine='python')
                df= df.replace({'\n': '<br>'}, regex=True)
                df= df.replace({'\r': ''}, regex=True)
                #janomeで解析せず、見出し語と入力された語句が一致した場合に尊敬語・謙譲語・丁寧語を出力
                df = df[df['見出し語']==honorific]
                if df.empty:
                    response_empty='<font color="red">ご指定の語句には対応しておりません</font>'
                    return response_empty
                
                son=df["尊敬語"].to_string(index=False).replace("\n","").replace("NaN","").replace("'","")
                s=[son]
                #謙譲語配列
                ken=df["謙譲語"].to_string(index=False).replace("\n","").replace("NaN","")
                k=[ken]
                #丁寧語配列
                tei=df["丁寧語"].to_string(index=False).replace("\n","").replace("NaN","")
                t=[tei]
                return s,k,t

            else:
                response_error='<font color="red">ご指定の語句には対応しておりません</font>'
                return response_error
Example #51
# -*- coding: utf-8 -*-

# Copyright [2015] [moco_beta]
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from janome.tokenizer import Tokenizer

if __name__ == '__main__':
    import sys
    text = ''.join(sys.argv[1:])

    t = Tokenizer()
    tokens = t.tokenize(text)
    for token in tokens:
        print(token)

Example #52
from janome.tokenizer import Tokenizer
# 形態素解析をしてください
tokenizer = Tokenizer()
tokens = tokenizer.tokenize("明日は晴れるだろうか。")
for token in tokens:
    print(token)
    print()
Example #53
class Mor_analysis:
    def __init__(self, text):
        self.text = text
        self.t = Tokenizer()
        self.tokens = self.t.tokenize(self.text)
        for i in self.tokens:
            print(i)

    def divide_pos(self):
        """
        入力された文を品詞で分けて,名詞,動詞,形容詞を取り出しリストで返す
        => return (List, List, dict)
        """
        accept = []
        accept_attr = {}  # 品詞
        accept_index = []  # index

        for i, token in enumerate(self.tokens):
            part_of_speech = token.part_of_speech.split(",")[0]
            base_form = token.base_form
            if part_of_speech in ok_words:
                accept.append(base_form)
                accept_attr[base_form] = part_of_speech
                accept_index.append(i)
            else:
                self.tokens[i] = self.tokens[i].base_form

        print(accept, accept_index)
        return accept, accept_index, accept_attr

    def divide_attribute(self, attr_list, target):
        """ 同じ属性(品詞)を持つものに分ける """
        res = []
        s = " ".join(attr_list)

        tokens = self.t.tokenize(s)
        for j in tokens:
            if target == j.part_of_speech.split(",")[0]:
                res.append(j.base_form)
        return res

    def make_text(self):
        """ textを生成して返す """
        accept, accept_index, accept_attr = self.divide_pos()
        print(accept, accept_index)
        f = False  # 変更されたかを確認するためのフラグ
        for i, j in zip(accept, accept_index):
            words = self.divide_attribute(rhymes(i), accept_attr[i])
            if len(words):
                f = True
                self.tokens[j] = random.choice(words)  # ランダムで選んで入れる

        try:
            res = "".join(self.tokens)
            if not f:
                return ""
            elif res == self.text:
                return ""
            else:
                print(self.tokens)
                return res
        except TypeError:
            return ""
Example #54
# -*- coding: utf-8 -*-

from janome.tokenizer import Tokenizer
import normalize_neologd as nn
from range_date import range_date
import sys
import re

# import os

reload(sys)
sys.setdefaultencoding('utf-8')

t = Tokenizer()


class newsWords:
    '''
    与えられたニュース記事を正規化し、単語に分割する
    noun: 名詞の一覧
    meta: 日付に関する単語
    '''
    def __init__(self, text):
        self.text = nn.normalize_neologd(text)
        self.words = _getWords(self.text)

    def __str__(self):
        return '[%s] noun: %s / meta: %s' % \
            (self.text,\
            ','.join(self.words[0]),\
            ','.join([str(k) + ':' + self.words[1][k] for k in self.words[1]]))
Example #55
    print (line)

    try:
        tmp = line.split("screen_name")
        # print tmp[1]

        tmpstr = str(tmp[1])
        tmp = tmpstr.split(" ")
        print tmp[0]
        sc = tmp[0]

    except:
        pass

    try:
        t = Tokenizer()
        tokens = t.tokenize(line.decode("utf-8"))

        for token in tokens:
            tmpstr = str(token)
            print "tmpstr:" + tmpstr
            tmp = tmpstr.split("  ")

            print sc + ":" + tmp[0]

            # print tmp[0] + ":" + tmp2[0]
            # print token

    except:
        pass
Example #56
class NaiveBayes(object):
    """
        NaiveBayes Classifier.
        use sklearn.naive_bayes.MultinomialNB
        << preprocessor >>
            corpus => NaiveBayes#tokenizer
            NaiveBayes#tokenizer := word => {Word segmentation}token
        □model training
            token => vectorizer#fit_transform => model#fit
        □predict
            token => vectorizer#transform => model#predict
    """
    def __init__(self, skip_tokenize=5):
        self.skip_tokenize = skip_tokenize
        self.skip_count = 0
        self.t = Tokenizer()
        # self.pipeline = Pipeline([
        #         ('vectorizer', TfidfVectorizer(tokenizer=self.tokenizer)),
        #         ('classifier', MultinomialNB(0.3))])
        self.pipeline = Pipeline([('vectorizer',
                                   TfidfVectorizer(tokenizer=self.tokenizer)),
                                  ('classifier', self.create_Model())])

        corpus = Serializer.load_csv('../resource/corpus.tsv')
        self.data, target = zip(*corpus)
        self.labels = np.array(target, dtype=np.uint8, ndmin=1)
        self.human_labels = None
        self.pipeline.fit(self.data, self.labels)
        #logger.debug(self.vectorizer.get_feature_names())

    def loadData(self):
        pass

    def tokenizer(self, word):
        """
            caller fit_transform / transform
            @param {string} word
            @yield {list.<string>}
                    result := token | space | token
        """
        if self.skip_count < self.skip_tokenize:
            self.skip_count += 1
            yield word
            return
        tokens = []
        for token in self.t.tokenize(word):
            if not str(token.part_of_speech).startswith('名詞'):
                pass
            tokens.append(token.surface)
        yield " ".join(tokens)

    @property
    def model(self):
        """
            @return {Classifier}
        """
        return self.pipeline.named_steps['classifier']

    @property
    def vectorizer(self):
        """
            @return {Vectorizer}
        """
        return self.pipeline.named_steps['vectorizer']

    def predict(self, x):
        """
            predict params x
            @param {string},{np.array} x
            @return predicted
        """
        if isinstance(x, str):
            x = self.vectorizer.transform([x])

        #logger.debug(self.model.predict_proba(x))
        return self.model.predict(x)

    def predict_all(self, x_list):
        """
            mapping
            @param {list} x_list
                   {dict} pair
            @return {list} value
        """
        result = []
        for x in x_list:
            predicted = self.predict(x)[0]
            value = self.human_labels[str(predicted)]
            result.append(value)
            logger.debug('%s -> 推定: %s', x, value)
        assert len(result) == len(x_list)
        return result

    def model_validation(self):
        x_train = self.vectorizer.fit_transform(self.data)
        validator = ModelValidator(x_train, self.labels)
        validator.cross_validation(self.model)
        test_params = self.model.get_params
        params = {}
        params['alpha'] = np.arange(0.01, 3., step=0.01, dtype=np.float64)
        #params['alpha'] = np.logspace(-1, 2, 30)
        params['fit_prior'] = [True, False]
        best_params = validator.search_BestParameter(self.create_Model(),
                                                     params)
        validator.cross_validation(self.create_Model(best_params))

    def create_Model(self, params=None):
        model = MultinomialNB(1)
        #model = LinearSVC(C=0.1)
        if params is not None:
            model.set_params(**params)
        return model
Example #57
# -*- coding: utf-8 -*-

from janome.tokenizer import Tokenizer
from janome.dic import UserDictionary
import sysdic

print('Compile user dictionary (MeCab IPADIC format)')
user_dict = UserDictionary("user_ipadic.csv", "utf8", "ipadic", sysdic.connections)
user_dict.save("/tmp/userdic")

t = Tokenizer("/tmp/userdic")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
  print(token)


print('')
print('Compile user dictionary (simplified format)')
user_dict = UserDictionary("user_simpledic.csv", "utf8", "simpledic", sysdic.connections)
user_dict.save("/tmp/userdic_simple")

t = Tokenizer("/tmp/userdic_simple")
for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
  print(token)
Example #58
# Python初心者に送る「人工知能の作り方」
# http://sandmark.hateblo.jp/entry/2017/10/07/141339
# に掲載されたコードを写経して勉強する。

import re
from janome.tokenizer import Tokenizer

TOKENIZER = Tokenizer()

def analyze(text):
    """
    文字列を形態素解析し、[(surface, parts)]の形にして返す。
    """
    return [(t.surface, t.part_of_speech) for t in TOKENIZER.tokenize(text)]

def is_keyword(part):
    """
    品詞partが学習すべきキーワードであるかどうか真偽値で返す。
    """
    return bool(re.match(r'名詞,(一般|代名詞|固有名詞|サ変接続|形容動詞語幹)', part))
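
# A quick check of the two helpers above (an added example; the exact tokens
# depend on the dictionary bundled with janome):
if __name__ == '__main__':
    for surface, part in analyze('今日は横浜でラーメンを食べた'):
        print(surface, part, is_keyword(part))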