def tokenize(text):
    """Turn an Aozora Bunko style text into space-joined content words.

    The text between the second and third run of five or more hyphens is
    taken as the body, everything from the colophon marker onwards is cut,
    ruby and editorial annotations are removed, and each CRLF-separated line
    is tokenized.  Only nouns, verbs, adjectives and symbols are kept, using
    the base form when the tokenizer provides one.

    :param text: full text of the work, including header and footer.
    :return: list with one space-joined string of kept words per input line.
    :raises IndexError: if the text has fewer than two hyphen separators.
    """
    t = Tokenizer()
    # Strip the header and footer around the body.
    text = re.split(r'\-{5,}', text)[2]
    text = re.split(r'底本:', text)[0]
    text = text.strip()
    # Remove ruby: the start marker (ASCII or full-width bar) and 《reading》.
    text = text.replace('|', '').replace('|', '')
    text = re.sub(r'《.+?》', '', text)
    # Remove in-text annotations.  The previous pattern r'[#.+?]' was a
    # character class and deleted every '#', '.', '+' and '?' instead of the
    # bracketed notes; match the full-width and ASCII bracket forms explicitly.
    text = re.sub(r'[#.+?]|\[#.+?\]', '', text)
    # Process line by line.
    results = []
    for line in text.split("\r\n"):
        res = []
        for tok in t.tokenize(line):
            bf = tok.base_form  # base (dictionary) form
            if bf == "*":
                bf = tok.surface
            hinsi = tok.part_of_speech.split(',')[0]  # top-level part of speech
            if hinsi in ['名詞', '動詞', '形容詞', '記号']:
                res.append(bf)
        results.append(" ".join(res))
    return results
def separatewords(text):
    """Collect lower-cased noun surfaces from a UTF-8 encoded byte string.

    Python 2 code: ``text`` is decoded with ``unicode(text, "utf-8")`` and the
    part-of-speech fields are compared as UTF-8 byte strings.  Nouns whose
    sub-category is suffix, pronoun, dependent, number or adjectival-noun stem
    are ignored.  A remaining noun is kept when it has a reading; without a
    reading it is kept only when it is not a suru-verb noun and its surface is
    longer than three characters.  Every kept word is also printed.
    """
    skipped_subtypes = ("接尾", "代名詞", "非自立", "数", "形容動詞語幹")

    def to_utf8(value):
        # Normalise unicode fields to UTF-8 bytes so they compare with literals.
        if isinstance(value, unicode):
            return value.encode("utf-8")
        return value

    collected = []
    for token in Tokenizer().tokenize(unicode(text, "utf-8")):
        fields = token.part_of_speech.split(",")
        major = to_utf8(fields[0])
        minor = to_utf8(fields[1])
        reading = to_utf8(token.reading)
        if major != "名詞" or minor in skipped_subtypes:
            continue
        if reading != "*":
            keep = True
        else:
            # Words without a reading: only handle those of 4+ characters.
            keep = minor != "サ変接続" and len(token.surface) > 3
        if keep:
            collected.append(token.surface.lower())
            print(token.surface.lower())
    return collected
def chunk_with_kanji(istr):
    """Return ``istr`` with a space inserted in front of each chunk boundary.

    Each token is flagged by ``judge_jifu`` and ``insert_chunkflg`` turns the
    flag list into a list containing ``KUGIRI`` delimiters.  The loop walks
    both lists in step: at a delimiter a space is emitted before the surface,
    unless the token is flagged ``KUTOU``.

    :param istr: text to chunk.
    :return: the re-assembled unicode string.
    """
    t = Tokenizer()
    # Materialize the tokens: they are traversed twice below, and a
    # generator-returning tokenizer would leave ``surface`` empty.
    tokens = list(t.tokenize(istr))
    # give each element flags (jiritsu or fuzoku)
    flags = [judge_jifu(x.part_of_speech) for x in tokens]
    surface = [x.surface for x in tokens]
    # split to chunks, delimited by KUGIRI flag
    # very ugly. should be rewritten using tree structure etc.
    cflags = insert_chunkflg(flags)
    rstr = u""
    i = 0  # cursor into cflags; j (below) is the cursor into flags/surface
    for j, f in enumerate(flags):
        if i >= len(cflags):
            break
        if cflags[i] == KUGIRI:
            if f == KUTOU:
                rstr += surface[j]
                i += 1
            else:
                rstr += u" "
                rstr += surface[j]
                # skip the delimiter and the flag it precedes
                i += 2
        else:
            rstr += surface[j]
            i += 1
    # don't know why this is necessary
    # NOTE(review): ``flags != []`` is tested first because ``j`` is unbound
    # when the loop never ran.
    if flags != [] and j == 0 and len(surface) != 1:
        while j < len(surface):
            rstr += surface[j]
            j += 1
    return rstr
def text_to_array_ja(textdata, wordtypes):
    """Return the sorted surfaces of tokens whose top-level POS is in ``wordtypes``.

    ``textdata`` is first passed through ``filter`` -- presumably a
    project-level helper that shadows the builtin, since it is called with a
    single argument (TODO confirm).
    """
    cleaned = filter(textdata)
    surfaces = []
    for token in Tokenizer().tokenize(cleaned):
        top_level_pos = token.part_of_speech.split(',')[0]
        if top_level_pos in wordtypes:
            surfaces.append(token.surface)
    surfaces.sort()
    return surfaces
def output_ja_text(data, wordtypes):
    """Count words of the requested parts of speech and dump the counts as YAML.

    ``data`` goes through ``filter`` (presumably a project helper shadowing
    the builtin -- TODO confirm).  The surfaces whose top-level part of speech
    is in ``wordtypes`` are sorted, counted with ``count_words`` and written
    to standard output through ``pyaml.dump``, whose result is returned.
    """
    cleaned = filter(data)
    selected = []
    for token in Tokenizer().tokenize(cleaned):
        if token.part_of_speech.split(',')[0] in wordtypes:
            selected.append(token.surface)
    selected.sort()
    counts = count_words(selected)
    return pyaml.dump(counts, sys.stdout, vspacing=[0, 1])
def _tokenize(text):
    """Yield a ``Token(t, surface, pos)`` named tuple for every token of ``text``.

    ``t`` is the raw tokenizer token, ``surface`` its surface form and ``pos``
    the part-of-speech string split on commas.
    """
    from collections import namedtuple

    Token = namedtuple("Token", ["t", "surface", "pos"])
    raw_tokens = Tokenizer().tokenize(text)
    for raw in raw_tokens:
        yield Token(raw, raw.surface, raw.part_of_speech.split(","))
def makekeywords(text):
    """Return the surfaces of the nouns in ``text`` that can serve as keywords.

    A token qualifies when its part-of-speech string mentions a noun and does
    not mention number, dependent or suffix.
    """
    from janome.tokenizer import Tokenizer

    rejected_tags = ("数", "非自立", "接尾")

    def is_keyword(pos):
        if "名詞" not in pos:
            return False
        return not any(tag in pos for tag in rejected_tags)

    return [
        token.surface
        for token in Tokenizer().tokenize(text)
        if is_keyword(token.part_of_speech)
    ]
def test_func():
    """Print a sample sentence in chunks.

    Surfaces are accumulated, and the accumulated text is printed and reset
    after every particle, auxiliary verb or symbol.
    """
    chunk_end = re.compile('^(助詞|助動詞|記号)')
    tokenizer = Tokenizer()
    pending = ""
    for token in tokenizer.tokenize(u'この腫瘍は間葉系組織から生ずると考えられ、ビメンチンを発現する。'):
        pending += token.surface
        if chunk_end.search(token.part_of_speech):
            print(pending)
            pending = ""
def split(self, text):
    """Return the base form of every token in ``text``.

    The surface form is used instead when the base form is empty or ``"*"``.
    """
    morphemes = Tokenizer().tokenize(text)
    return [
        m.surface if m.base_form in ('', "*") else m.base_form
        for m in morphemes
    ]
def run(self, force=None):
    """Fetch every site's subject list and print the subjects that pass the filter.

    Steps:
      1. Load all sites, de-duplicated by URL.
      2. Download each site's subject list and parse every line into a
         ``Subject``; lines that fail to parse are skipped.
      3. Count how often each token surface (not rejected by ``token_filter``)
         occurs across all subject titles.
      4. For each subject, in title order, sum the counts of its tokens and
         print the title unless ``filter_title`` rejects it.

    :param force: accepted but not used by this method.
    :raises AssertionError: when a subject list request does not return 200.
    """
    print('start')
    # Fetch all sites; keying by URL removes duplicates.
    sites = {}
    for site in Site.get_all():
        sites[site.url] = site

    # Parse every line of every subject list.
    sure = []
    for key in sites:
        site = sites[key]
        response = requests.get(site.subjects_url)
        assert (response.status_code == 200), response.text

        for line in response.text.split('\n'):
            try:
                sure.append(Subject(site, line))
            except Exception:
                # Best effort: lines that are not valid subjects are skipped.
                pass
    print(sure)

    # Frequency of each token surface over all titles.
    t = Tokenizer()
    r = defaultdict(int)
    for _sure in sure:
        try:
            for token in t.tokenize(_sure.title):
                if not token_filter(token):
                    r[token.surface] += 1
        except Exception:
            # Best effort: a title that cannot be tokenized is ignored.
            pass

    # Score each subject and print the ones that survive the title filter.
    sure = sorted(sure, key=lambda x: x.title)
    for _sure in sure:
        try:
            point = 0
            for token in t.tokenize(_sure.title):
                if not token_filter(token):
                    point += r[token.surface]
            if not filter_title(point, _sure):
                print(_sure.title, _sure.count_res)
        except Exception:
            pass
def understand_move(self, text):
    """Tokenize ``text`` and return the ``(direction, distance)`` it expresses.

    The token list is handed to ``_understand_direction`` and then to
    ``_understand_distance``.
    """
    tokens = list(Tokenizer().tokenize(text))
    direction = self._understand_direction(tokens)
    distance = self._understand_distance(tokens)
    return direction, distance
def main():
    """Print the morphological analysis of a sample sentence.

    >>> main()
    すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
    も 助詞,係助詞,*,*,*,*,も,モ,モ
    もも 名詞,一般,*,*,*,*,もも,モモ,モモ
    も 助詞,係助詞,*,*,*,*,も,モ,モ
    もも 名詞,一般,*,*,*,*,もも,モモ,モモ
    の 助詞,連体化,*,*,*,*,の,ノ,ノ
    うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
    """
    tokenizer = Tokenizer()
    sample = u'すもももももももものうち'
    for token in tokenizer.tokenize(sample):
        print(token)
def get_morphs(string):
    """Return one dict per morpheme of ``string``.

    Each dict has the keys ``surface``, ``base`` (base form), ``pos``
    (top-level part of speech) and ``pos1`` (first sub-category).

    The values are read from the token attributes.  The previous version
    re-parsed ``str(token)`` after replacing spaces with commas, which picks
    the wrong fields when the printed form separates the surface with a tab,
    or when a surface itself contains a space or comma.

    :param string: UTF-8 encoded bytes; already-decoded text is accepted too.
    :return: list of dicts in token order.
    """
    if isinstance(string, bytes):
        string = string.decode('utf-8')
    t = Tokenizer()
    dicts = []
    for token in t.tokenize(string):
        pos_fields = token.part_of_speech.split(",")
        dicts.append({
            "surface": token.surface,
            "base": token.base_form,
            "pos": pos_fields[0],
            "pos1": pos_fields[1],
        })
    return dicts
class JapaneseTokenizer(object):
    """Callable that splits Japanese text with Janome and wraps it in a ``Doc``."""

    def __init__(self, cls, nlp=None):
        """Take the vocab from ``nlp`` if given, else build one via ``cls``.

        :raises ImportError: when Janome is not installed.
        """
        if nlp is not None:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome "
                              "library: https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        """Return a ``Doc`` of the token surfaces, none followed by a space."""
        surfaces = []
        for morpheme in self.tokenizer.tokenize(text):
            surfaces.append(morpheme.surface)
        no_trailing_space = [False] * len(surfaces)
        return Doc(self.vocab, words=surfaces, spaces=no_trailing_space)

    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557)
    def to_bytes(self, **exclude):
        """Serialize to an empty byte string."""
        return b''

    def from_bytes(self, bytes_data, **exclude):
        """Ignore ``bytes_data`` and return this tokenizer."""
        return self

    def to_disk(self, path, **exclude):
        """Write nothing and return ``None``."""
        return None

    def from_disk(self, path, **exclude):
        """Read nothing and return this tokenizer."""
        return self
class MainTranslator(object):
    """Rewrites Japanese text by swapping words and sentence endings."""

    def __init__ (self):
        self.janome = Tokenizer()

    def get_gobi(self, n):
        """Return the replacement ending for token ``n``, or ``None`` if none applies."""
        copulas = ('だ', 'です', 'た', 'だろ', 'ある')
        negations = ('無い', 'ない', 'ぬ')
        head = n.part_of_speech.split(',')[0]
        if head == '助動詞':
            if n.surface in copulas:
                return 'ハゲ'
            if n.surface in negations:
                return 'ぬハゲ'
        elif head == '形容詞' and n.surface in negations:
            return 'なしハゲ'
        return None

    def Translator(self, text):
        """Return ``text`` with every token converted where a rule matches.

        A token found in ``converter`` is replaced by its mapped value.
        Otherwise, when its part-of-speech string has more than three fields,
        the ending from ``get_gobi`` is used if there is one.  In all other
        cases the surface is kept unchanged.
        """
        pieces = []
        for n in self.janome.tokenize(text):
            if n.surface in converter:
                pieces.append(converter[n.surface])
                continue
            replacement = None
            if len(n.part_of_speech.split(',')) > 3:
                replacement = self.get_gobi(n)
            pieces.append(n.surface if replacement is None else replacement)
        return ''.join(pieces)
def janome_tokenizer(sentence):
    """Split a UTF-8 encoded sentence into UTF-8 encoded token surfaces.

    Tokenization is attempted on the sentence as given, then with
    non-breaking spaces replaced by a Japanese comma, then with them removed.
    If every attempt raises, a message is printed and the decoded sentence is
    returned as the only element.

    The attempts used to be three nested bare ``except:`` blocks, which also
    swallowed ``KeyboardInterrupt`` and ``SystemExit``.

    :param sentence: UTF-8 encoded byte string.
    :return: list of UTF-8 encoded surfaces, or ``[decoded sentence]`` on failure.
    """
    t = Janome_Tokenizer()
    sentence = sentence.decode("utf-8")
    attempts = (
        sentence,
        sentence.replace(u"\xa0", u"、"),
        sentence.replace(u"\xa0", u""),
    )
    for attempt in attempts:
        try:
            tokens = t.tokenize(attempt)
        except Exception:
            continue
        return [dic.surface.encode("utf-8") for dic in tokens]
    print ("Tokenization error at sentence: "+sentence.encode("utf-8"))
    return [sentence]
def __init__(self, cls, nlp=None):
    """Set up the vocab and the Janome tokenizer.

    The vocab comes from ``nlp`` when one is given, otherwise from
    ``cls.create_vocab``.

    :raises ImportError: when Janome is not installed.
    """
    if nlp is not None:
        self.vocab = nlp.vocab
    else:
        self.vocab = cls.create_vocab(nlp)
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        raise ImportError("The Japanese tokenizer requires the Janome library: "
                          "https://github.com/mocobeta/janome")
    self.tokenizer = Tokenizer()
def do_analysis(analyzed_file):
    """Tokenize a UTF-8 file and append the tokens not rejected by ``word_match``.

    For each token, everything from the first whitespace character of its
    printed form onwards is stripped and the remainder is passed to
    ``word_match.word_check``.  Tokens for which the check is falsy are
    appended, one per line, to ``result/<analyzed_file>``.

    :return: the path of the result file.
    """
    result_file_name = "result/" + str(analyzed_file)
    after_first_space = re.compile(r'\s(.*)')
    tokenizer = Tokenizer()

    with open(analyzed_file, mode='r', encoding='utf-8') as source:
        contents = source.read()

    with open(result_file_name, mode='a', encoding='utf-8') as sink:
        for token in tokenizer.tokenize(str(contents)):
            rendered = str(token)
            if word_match.word_check(after_first_space.sub('', rendered)):
                continue
            sink.write(rendered + "\n")
    return result_file_name
def __init__(self):
    """Build the widget UI, set the window title and create the tokenizer."""
    QWidget.__init__(self)
    self.setupUi(self)
    # Window title is "<app name> <version>", passed through Qt's translation.
    self.setWindowTitle(
        QApplication.translate("Widget", "%s %s" % (__app_name__, __version__), None, QApplication.UnicodeUTF8))
    # Tokenizer instance kept on the widget (attribute name is spelled "jacome").
    self.jacome_token = Tokenizer()
    # Starts empty; nothing in this constructor fills it.
    self.words_container = []
def analyze_keyword(posts):
    """
    Build a reverse index from two-token keywords to the posts containing them.

    Every message of every post is reduced to its text with BeautifulSoup and
    tokenized.  Whenever ``final_filter`` accepts a (previous token, token)
    pair, the concatenation of the two surfaces is counted as a keyword and
    the post is recorded for it.  A token for which ``token_is_sub`` is true
    cannot start a keyword.

    Messages whose processing raises are skipped (best effort).

    :param posts: dict{int: Posted}
    :rtype: list(KeywordReverseIndex)
    """
    t = Tokenizer()
    tfidf2 = defaultdict(int)
    tfidf2_post = defaultdict(list)

    # Count keywords and remember which posts contain them.
    for key in posts:
        post = posts[key]
        for message in post.parse_post_message:
            # Keep only the text content of the markup.
            soup = BeautifulSoup(message, "lxml")

            _prev_token = None
            try:
                for token in t.tokenize(soup.text):
                    if final_filter(_prev_token, token):
                        keyword = _prev_token.surface + token.surface
                        tfidf2[keyword] += 1
                        if post not in tfidf2_post[keyword]:
                            tfidf2_post[keyword].append(post)

                    _prev_token = token

                    # Such a token must not become the first half of a keyword.
                    if token_is_sub(token):
                        _prev_token = None
            except Exception:
                # Best effort: give up on this message, keep what was counted.
                pass

    # Build the reverse index, keeping only the entries that report is_enable.
    r_indexes = []
    for key in tfidf2:
        _index = KeywordReverseIndex(key, tfidf2[key], tfidf2_post[key])
        if _index.is_enable:
            r_indexes.append(_index)
    return r_indexes
def callback():
    """Answer every incoming message and return an empty body.

    The first command whose matcher finds the message text produces the
    response.  When no command matches, an "analyzing" notice is sent first
    and the response is the morphological analysis of the text, one token per
    line.  The response is then posted back to the sender.
    """
    for message in request.json['result']:
        content = message['content']
        text = content['text']

        for matcher, action in commands:
            if matcher.search(text):
                response = action(text)
                break
        else:
            post_text(content['from'], '解析中...')

            # Morphological analysis.
            tokenizer = Tokenizer()
            response = ''.join(
                str(token) + '\n' for token in tokenizer.tokenize(content['text'])
            )

        post_text(content['from'], response)

    return ''
def add_yomi(string):
    """Return ``string`` with a reading attached to each of its segments.

    Each token's surface and reading are cut repeatedly by
    ``split_at_hiragana``.  While the split returns more than two items, the
    first pair is rendered with ``create_yomi`` and the remaining pair is
    split again.  The final pair is rendered last.
    """
    rendered = []
    for token in Tokenizer().tokenize(string):
        parts = split_at_hiragana(token.surface, token.reading)
        while len(parts) > 2:
            rendered.append(create_yomi(parts[0], parts[1]))
            parts = split_at_hiragana(parts[2], parts[3])
        rendered.append(create_yomi(parts[0], parts[1]))
    return ''.join(rendered)
class JapaneseTokenizer(object):
    """Callable that splits Japanese text with Janome and wraps it in a ``Doc``."""

    def __init__(self, cls, nlp=None):
        """Take the vocab from ``nlp`` if given, else build one via ``cls``.

        :raises ImportError: when Janome is not installed.
        """
        if nlp is not None:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome library: "
                              "https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        """Return a ``Doc`` of the token surfaces, none followed by a space."""
        surfaces = []
        for morpheme in self.tokenizer.tokenize(text):
            surfaces.append(morpheme.surface)
        no_trailing_space = [False] * len(surfaces)
        return Doc(self.vocab, words=surfaces, spaces=no_trailing_space)
class MyWindow(QWidget, Ui_window):
    """Main window: runs Janome on submitted text and can save the result.

    Signals:
        loaded_words -- emitted with the newline-joined analysis result.
        refresh_words -- emitted before each analysis to clear the display.

    Python 2 code: token strings are decoded from UTF-8 bytes and the CSV
    file is written in binary mode.
    """

    loaded_words = Signal(str)
    refresh_words = Signal()

    def __init__(self):
        QWidget.__init__(self)
        self.setupUi(self)
        self.setWindowTitle(
            QApplication.translate("Widget", "%s %s" % (__app_name__, __version__), None, QApplication.UnicodeUTF8))
        self.jacome_token = Tokenizer()
        self.words_container = []

    @Slot()
    def analysis_janome(self, s):
        """Analyze ``s`` and publish the result through ``loaded_words``."""
        # Echo the analyzed text to stdout for debugging.
        print(s)
        # Clear the widgets showing the previous result.
        self.refresh_words.emit()
        # Reset the result buffer (also used when saving to CSV).
        self.words_container = []
        # Run the morphological analysis.
        tokens = self.jacome_token.tokenize(s)
        for token in tokens:
            # Convert the printed token to a unicode string.
            print_str = str(token).decode('utf8')
            # Keep it for the CSV export.
            self.words_container.append(print_str)
        # Publish the result.
        self.loaded_words.emit('\n'.join(self.words_container))

    @Slot()
    def save_csv(self):
        """Write the last analysis result to ``result.csv``, one token per row."""
        filename = 'result.csv'
        filename = os.path.normpath(filename)
        # Choose the output encoding from the OS.  This must be an equality
        # test: ``is`` compares object identity and only worked by accident
        # of string interning.
        if os.name == 'nt':
            code = 'cp932'
        else:
            code = 'utf-8'
        print('save_csv: code = %s' % code)
        with open(filename, 'wb') as f:  # without 'wb' stray newlines appear
            writer = csv.writer(f, delimiter=',')
            for words in self.words_container:
                out_word = words.encode(code)
                writer.writerow([out_word])
def __init__(self, text):
    """Store the cleaned source text and the generation settings.

    Byte strings are decoded from UTF-8 (Python 2 ``str``) and half-width
    ASCII symbols are removed before the text is stored.
    """
    # Tidy up the text.
    if isinstance(text, str):
        text = text.decode("utf-8")
    # Strip half-width symbols.
    self.text = re.sub("[!-/:-@[-`{-~]", "", text)
    self.t = Tokenizer(wakati=True)

    # Number of sentences to generate.
    self.sentence_num = 5
    # Rough upper bound on the number of characters per generated sentence.
    self.stop_length = 110
class Mave(object):
    """Chat bot that learns a Markov model from what it hears and replies from it."""

    def __init__(self, name=u'メイ'):
        self.name = name
        self.msg_que = Queue()
        self.tokenizer = Tokenizer()
        self.markov = Markov(ngram=2)

    def wakeUp(self):
        """Load the saved model for this bot; start from an empty one on failure."""
        try:
            self.markov.load(u'mave_%s.json' % self.name)
        except Exception as e:
            print('markov load failure')
            print(e)
            self.markov = Markov(ngram=2)

    def goToBed(self):
        """Save the model to ``mave_<name>.json``."""
        self.markov.save(u'mave_%s.json' % self.name)

    def listenTo(self, message, talker):
        """Learn from ``message`` and queue a reply.

        The reply is generated from a randomly chosen noun of the message:
        a general noun not starting with hiragana, or any proper noun.  When
        there is no such noun, ``None`` is passed as the key.  If nothing can
        be generated, a stock reply addressed to ``talker`` is queued.

        :param message: UTF-8 encoded bytes.
        :param talker: name used in the fallback reply.
        """
        # Materialize the tokens: they are printed, learned from and filtered,
        # and a generator would be exhausted after the first pass.
        tokens = list(self.tokenizer.tokenize(message.decode('utf-8')))
        for tok in tokens:
            print('%10s (%10s) ... %s' % (tok.surface, tok.reading, tok.part_of_speech))
        self.markov.learn(tokens)

        def is_topic_noun(tok):
            pos = tok.part_of_speech.split(',')
            if u'名詞' not in pos:
                return False
            starts_with_hiragana = u'あ' <= tok.surface[0] <= u'ん'
            return (u'一般' in pos and not starts_with_hiragana) or u'固有名詞' in pos

        meishi_list = [tok.surface for tok in tokens if is_topic_noun(tok)]
        key = random.choice(meishi_list) if meishi_list else None
        rsp = self.markov.generate(key)
        if rsp is not None:
            self.msg_que.put(rsp)
        else:
            self.msg_que.put('はいはい > %s' % talker)

    def speak(self):
        """Return the next queued reply, or ``None`` when the queue is empty."""
        if self.msg_que.empty():
            return None
        return self.msg_que.get()
def __init__(self):
    """Create the wakati-mode tokenizer and the exclusion lists."""
    # wakati=True is passed to the Tokenizer constructor.
    self.tokenizer = Tokenizer(wakati=True)
    # Punctuation strings; presumably dropped from the output -- TODO confirm against the callers.
    self.excludes = ["。", "、", "(", ")"]
    # Tag names; presumably skipped when walking markup -- TODO confirm against the callers.
    self.exclude_nodes = ["cite", "script", "style"]
from pathlib import Path import csv, json import pandas as pd import glob fo_names = ['test_txt', 'csv'] for i in fo_names: try: os.makedirs('../{}'.format(i)) except: continue file_number = 0 t = Tokenizer() path = Path(sys.argv[1] if len(sys.argv) >= 2 else '.') for path_in in [x for x in path.glob('*.txt') if x.is_file()]: path_out = path_in.with_suffix('.txt') path_csv = path_in.with_suffix('.csv') file = open(path_in, 'r') file_number += 1 bunsyou = file.readlines() syori_bunsyou = [] moji = [] mojisuu = 0 for i in range(len(bunsyou)): bunsyou[i] = bunsyou[i].strip() syori_bunsyou.append(bunsyou[i].strip())
class Dictionary:
    """
    Dictionaries used by the response engine.

    Class variables:
    DICT_RANDOM -- file name of the random dictionary.
    DICT_PATTERN -- file name of the pattern dictionary.
    TOKENIZER -- shared janome tokenizer instance.

    Properties:
    random -- the random dictionary (list of phrases).
    pattern -- the pattern dictionary (list of {'pattern', 'phrases'} dicts).
    """

    DICT_RANDOM = 'dics/random.txt'
    DICT_PATTERN = 'dics/pattern.txt'
    TOKENIZER = Tokenizer()

    def __init__(self):
        """Load both dictionaries from their files, skipping empty lines."""
        with open(Dictionary.DICT_RANDOM, encoding='utf-8') as f:
            self._random = [x for x in f.read().splitlines() if x]

        with open(Dictionary.DICT_PATTERN, encoding='utf-8') as f:
            parsed = (
                Dictionary.make_pattern(l) for l in f.read().splitlines() if l
            )
            # make_pattern returns None for a line with an empty field; such
            # an entry would crash study_pattern later, so drop it here.
            self._pattern = [p for p in parsed if p is not None]

    def study(self, text):
        """Learn ``text`` into the in-memory random and pattern dictionaries."""
        self.study_random(text)
        self.study_pattern(text, Dictionary.analyze(text))

    def study_random(self, text):
        """Remember the user's utterance ``text`` unless it is already known."""
        if text not in self._random:
            self._random.append(text)

    def study_pattern(self, text, parts):
        """Record ``text`` under every keyword found in ``parts``.

        ``parts`` is the ``[(surface, part_of_speech)]`` list from ``analyze``.
        A keyword that already has an entry gets ``text`` added to its phrases
        (once); otherwise a new entry is created.
        """
        for word, part in parts:
            if self.is_keyword(part):
                duplicated = next(
                    (p for p in self._pattern if p['pattern'] == word), None)
                if duplicated:
                    if text not in duplicated['phrases']:
                        duplicated['phrases'].append(text)
                else:
                    self._pattern.append({'pattern': word, 'phrases': [text]})

    def save(self):
        """Write the in-memory dictionaries back to their files.

        The pattern dictionary is written as well; previously only the random
        dictionary was saved, so learned patterns were lost on exit.
        """
        with open(Dictionary.DICT_RANDOM, mode='w', encoding='utf-8') as f:
            f.write('\n'.join(self.random))

        with open(Dictionary.DICT_PATTERN, mode='w', encoding='utf-8') as f:
            f.write('\n'.join(
                Dictionary.pattern_to_line(p) for p in self._pattern))

    @staticmethod
    def make_pattern(line):
        """Split ``line`` on a tab into ``{'pattern': ..., 'phrases': [...]}``.

        Phrases are separated by ``|``.  Returns ``None`` when either field is
        empty.  Raises ``ValueError`` when the line does not contain exactly
        one tab.
        """
        pattern, phrases = line.split('\t')
        if pattern and phrases:
            return {'pattern': pattern, 'phrases': phrases.split('|')}

    @staticmethod
    def analyze(text):
        """Tokenize ``text`` and return ``[(surface, part_of_speech)]``."""
        return [(t.surface, t.part_of_speech)
                for t in Dictionary.TOKENIZER.tokenize(text)]

    @staticmethod
    def pattern_to_line(pattern):
        """Serialize a pattern entry to its ``pattern<TAB>phrase|phrase`` line."""
        return '{}\t{}'.format(pattern['pattern'], '|'.join(pattern['phrases']))

    @staticmethod
    def is_keyword(part):
        """Return whether the part-of-speech string ``part`` marks a keyword to learn."""
        return bool(re.match(r'名詞,(一般|代名詞|固有名詞|サ変接続|形容動詞語幹)', part))

    @property
    def random(self):
        """The random dictionary."""
        return self._random

    @property
    def pattern(self):
        """The pattern dictionary."""
        return self._pattern
import pandas as pd from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer from janome.tokenizer import Tokenizer #参考にしたページ・ソースコード #https://blog.amedama.jp/entry/tf-idf #tokenizerの初期化 janome_tokenizer = Tokenizer() def text_morpheme( text, part = "", part2 = ""): """janomeで形態素に分ける Arguments: text {[type]} -- 形態素に分ける文字列 Keyword Arguments: part {str} -- 取得する品詞を指定(品詞の設定がない場合はすべて取得) part2 {str} -- サ変名詞などの2つ目の品詞 Returns: [type] -- 形態素に分けた結果(リストで返す) """ text_list = [] for token in janome_tokenizer.tokenize(text): #print(token.part_of_speech)
# -*- coding: utf-8 -*- from janome.tokenizer import Tokenizer import sys from io import open PY3 = sys.version_info[0] == 3 print(u'Tokenize (stream mode)') t = Tokenizer(mmap=True) with open('text_lemon.txt', encoding='utf-8') as f: text = f.read() if not PY3: text = unicode(text) for token in t.tokenize(text, stream=True): print(token)
from janome.tokenizer import Tokenizer from gensim.models import word2vec import re # テキストファイルの読み込み bindata = open('kokoro.txt.sjis', 'rb').read() text = bindata.decode('shift_jis') # テキストの先頭にあるヘッダーとフッターを削除 text = re.split(r'\-{5,}', text)[2] text = re.split(r'底本:', text)[0] text = text.strip() # 形態素解析 t = Tokenizer() results = [] # テキストを一行ずつ処理する lines = text.split("\r\n") for line in lines: s = line s = s.replace('|', '') s = re.sub(r'《.+?》', '', s) # ルビを削除 s = re.sub(r'[#.+?]', '', s) # 入力注を削除 tokens = t.tokenize(s) # 形態素解析 # 必要な語句だけを対象とする r = [] for tok in tokens: if tok.base_form == "*": # 単語の基本形を採用 w = tok.surface else: w = tok.base_form
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from janome.tokenizer import Tokenizer

docs = ["これはペンです", "私はあほです", "俺は男です", "あなたはサルです"]

token = []  # word-segmented form of each document
training_docs = []  # TaggedDocument for each document

# Build the tokenizer once; constructing it per document repeats the
# dictionary load for no benefit.
t = Tokenizer()
for i, doc in enumerate(docs):
    # Segment the document.  The result is materialized as a list because the
    # model needs to read the words more than once.
    token.append(list(t.tokenize(doc, wakati=True)))
    # Wrap it in a TaggedDocument tagged "d<index>".
    training_docs.append(TaggedDocument(words=token[i], tags=["d" + str(i)]))

# Answer section of the exercise
#-------------------------------------------------------
model = Doc2Vec(documents=training_docs, min_count=1)
#-------------------------------------------------------

# Print the most similar documents for every document tag.
for i in range(len(docs)):
    print(model.docvecs.most_similar("d" + str(i)))
def __init__(self):
    """Create the tokenizer instance held by this object."""
    self._tokenizer = Tokenizer()
def __init__(self, text):
    """Tokenize ``text``, store the tokens and print each of them.

    :param text: the sentence to analyze.
    """
    self.text = text
    self.t = Tokenizer()
    # Store a list, not the raw tokenizer result: the print loop below would
    # exhaust a generator and leave ``self.tokens`` empty for later use.
    self.tokens = list(self.t.tokenize(self.text))
    for i in self.tokens:
        print(i)
from janome.tokenizer import Tokenizer
from gensim.models import word2vec
import re

with open('sample.txt', 'r') as file:
    text = file.read()

t = Tokenizer()
results = []  # one space-joined line of words per input line
r = []  # every word seen, in order (printed for inspection)
lines = text.split("\n")
for line in lines:
    s = line
    # Drop ASCII letters, digits and spaces, date suffixes and some punctuation.
    s = re.sub(r'[a-zA-Z\d ]+', '', s)
    s = re.sub(r'[年月]*日', '', s)
    s = re.sub(r'[\(\):;]', '', s)
    words = [tok.surface for tok in t.tokenize(s)]
    r.extend(words)
    # Previously nothing was ever appended to ``results``, so the wakati file
    # was written empty and the model had no data to train on.
    results.append(" ".join(words))
print(r)

wakati_file = 'kokoro.wakati'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write("\n".join(results))

data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
class word:
    """Scores a user input against a few themed text collections with TF-IDF.

    ``routine()`` (run from the constructor) builds the text database, the
    gensim dictionary, the bag-of-words corpus, the TF-IDF model and the
    similarity index.  ``get_keyword_vector`` followed by ``get_similarity``
    then scores an input against every theme.
    """

    def __init__(self):
        self.t = Tokenizer()
        self.routine()

    # Read JSON data from db/db.txt and add it to the text database.
    def get_data(self):
        data = from_txt('./db', 'db.txt')
        data = json.loads(data)
        # The 'cookpad_search' theme is the concatenation of two JSON fields.
        self.text_db[
            'cookpad_search'] = data['food_name'] + data['ingredients']

    def get_db(self):
        # TODO (from the original author): make this simpler.
        # Built-in themes; each value is a run of sample sentences joined by '.'.
        self.text_db = {
            'cookpad':
            'パスタを作りたい.パスタを食べたい.お腹が空いた.おいしいものが食べたい.今日のご飯何にしよう.料理を作りたい.弁当を作りたい.ピザを作りたい.ハンバーグを作りたい.辛い物が食べたい.甘いものが食べたい.塩分を取りたい.デザートを食べたい.簡単に作れるランチを知りたい.簡単なディナーを知りたい.お勧めの料理を知りたい.卵を使った料理を知りたい',
            'youtube':
            '動画を観たい.youtube を使いたい.ドラマを観たい.面白い映像を観たい.映画を観たい.音楽が聞きたい.怖い動画を見たい.急上昇1位の動画を見たい.犬の動画を見たい.猫の動画を見たい.お勧めの動画.急上昇中の動画を教えてほしい.簡単に作れるご飯の動画が見たい.眠れる動画を見たい.面接の動画.簡単な料理の動画を見たい.美味しい料理の動画を見たい.コロナについて知りたい',
            'study':
            'CCNAの勉強頑張ります.学校の勉強の仕方を知りたい.仕事で活躍する知識を身に付けたい.資格の勉強をする.基本情報試験の勉強を頑張る.音楽の勉強をしたい.免許を取りたい.英語を学びたい.ネットワークの知識を身に付けたい.セキュリティについて学びたい.アンドロイドアプリを作りたい.pythonを身に付けたい.Linuxの勉強をしたい.課題を終わらせたい'
        }

    # Builds a list of lists: one word list per theme.
    def get_tokenized_db(self):
        self.texts = []  # original note: the result of t.tokenize(text) is a list
        self.theme = []  # theme names, in the same order as self.texts
        for theme, text in self.text_db.items():
            self.texts.append(get_words_list(text))
            self.theme.append(theme)

    def get_dictionary(self):
        self.dictionary = corpora.Dictionary(self.texts)

    def get_feature_count(self):
        # Number of entries in the dictionary's token-to-id mapping.
        self.feature_cnt = len(self.dictionary.token2id)

    def get_corpus(self):
        # Build the bag-of-words vectors, one per theme.
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]

    def create_tfidf(self):
        self.tfidf = models.TfidfModel(self.corpus)

    def get_index(self):
        self.index = similarities.SparseMatrixSimilarity(
            self.tfidf[self.corpus], num_features=self.feature_cnt)

    def get_keyword_vector(self, user_input):
        # Bag-of-words vector of the token surfaces of the user's input.
        self.kw_vector = self.dictionary.doc2bow(
            [token.surface for token in self.t.tokenize(user_input)])

    def get_similarity(self):
        # Requires get_keyword_vector() to have been called first.
        return self.index[self.tfidf[self.kw_vector]]

    def routine(self):
        # Order matters: get_data() adds to the dict created by get_db().
        self.get_db()
        self.get_data()
        self.get_tokenized_db()
        self.get_dictionary()
        self.get_feature_count()
        self.get_corpus()
        self.create_tfidf()
        self.get_index()
import pandas as pd
from janome.tokenizer import Tokenizer
import re
import math

query = '吾輩は猫である'
# Query words: token surfaces, minus single hiragana, punctuation and spaces.
query_words = [
    token.surface for token in Tokenizer().tokenize(query)
    if not re.fullmatch(r"[あ-ん]|、|。| ", token.surface)
]
query_file = 'query'

# Each index row is tab-separated.  Column 0 is used as the word, column 1 as
# the document key, column 3 as the IDF and column 4 as the TF-IDF score.
# The file is now closed deterministically instead of being left to the
# garbage collector.
with open('../index/index2.txt', 'r') as index_file:
    arr = [line.strip().split("\t") for line in index_file]

idf_scores = {a[0]: float(a[3]) for a in arr}
tfidf_scores = {w: {} for w in idf_scores}
for a in arr:
    tfidf_scores[a[0]][a[1]] = float(a[4])
tfidf_table = pd.DataFrame(tfidf_scores).fillna(0)

# Term frequency of every indexed word within the query.
query_tf = {w: query_words.count(w) for w in idf_scores}

query_tfidf = {
    w: {
        query_file: query_tf[w] * idf_scores[w]
    }
    for w in idf_scores
}
query_table = pd.DataFrame(query_tfidf)
#AI-TECHGYM-1-10-A-3
# Natural language processing

# Imports
from gensim.models import Word2Vec
from janome.tokenizer import Tokenizer

# Morphological analyzer
text = Tokenizer()

# Read the data from the text file (closed as soon as it has been read).
with open("techgym-AI.txt") as text_file:
    txt = text_file.read()

# Tokenize the data line by line.
# The file is read in text mode, where "\r\n" has already been translated to
# "\n", so splitting on "\r\n" left the whole file as a single "line".
# splitlines() handles every newline convention.  Each sentence is
# materialized as a list because Word2Vec reads the corpus more than once.
results = []
lines = txt.splitlines()
for i in lines:
    text_c = list(text.tokenize(i, wakati=True))
    results.append(text_c)

# Model
model = Word2Vec(results, min_count=1)
vector = model.wv['プログラミング']

# Vector representation
print(vector)
pro = model.wv.most_similar(positive=['プログラミング'], topn=5)
#pro = model.wv.similar_by_vector('プログラミング')
#pro = model.wv.similar_by_word('プログラミング')
# 対象ファイルをダウンロード url = 'http://www.aozora.gr.jp/cards/000081/files/456_ruby_145.zip' local = '456_ruby_145.zip' if not os.path.exists(local): print('zipファイルをダウンロード') req.urlretrieve(url, local) # zipファイルの内のテキストファイルを取得 zf = zipfile.ZipFile(local, 'r') fp = zf.open('gingatetsudono_yoru.txt', 'r') bindata = fp.read() txt = bindata.decode('shift_jis') # 形態素解析 t = Tokenizer() word_dic = {} lines = txt.split('\r\n') for line in lines: malist = t.tokenize(line) for w in malist: word = w.surface ps = w.part_of_speech # 品詞 if ps.find('名詞') < 0: continue # 名詞だけほしいのでそれ以外はスキップ if not word in word_dic: word_dic[word] = 0 word_dic[word] += 1 keys = sorted(word_dic.items(), key=lambda x:x[1], reverse=True) for word, cnt in keys[:50]:
class TextTokenizer:
    """Callable that splits text into a list of word strings."""

    def __init__(self):
        self._tokenizer = Tokenizer()

    def __call__(self, text: str) -> List[str]:
        """Return the words of ``text`` in order.

        The result is materialized so that the annotated ``List[str]`` holds
        even when the tokenizer yields its output lazily.
        """
        return list(self._tokenizer.tokenize(text, wakati=True))
def __init__(self):
    """Create the tokenizer and the part-of-speech filter tables."""
    self.janome_tokenizer = Tokenizer()
    # Top-level POS -> sub-categories; presumably the ones to exclude -- TODO confirm against the code that reads it.
    self.exc_part_of_speech = {"名詞": ["非自立", "代名詞", "数"]}
    # Top-level POS -> sub-categories; presumably the ones to include -- TODO confirm against the code that reads it.
    self.inc_part_of_speech = {"名詞": ["サ変接続", "一般", "固有名詞"]}
# -*- coding: utf-8 -*- from janome.tokenizer import Tokenizer print(u'Tokenize (system dictionary)') t = Tokenizer() for token in t.tokenize(u'すもももももももものうち'): print(token) print('') print(u'Tokenize (mmap system dictionary)') t = Tokenizer(mmap=True) for token in t.tokenize(u'すもももももももものうち'): print(token) print('') print(u'Tokenize (wakati mode)') for token in t.tokenize(u'すもももももももものうち', wakati = True): print(token) print('') print(u'Tokenize with user dictionary') t = Tokenizer("user_ipadic.csv", udic_enc="utf8") for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'): print(token) print('') print(u'Tokenize with user dictionary (wakati mode)') t = Tokenizer("user_ipadic.csv", udic_enc="utf8") for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。', wakati = True): print(token)
def __init__(self, user_dic_path='', user_dic_enc='utf8'):
    """Create the tokenizer, forwarding the user-dictionary settings.

    :param user_dic_path: passed to ``Tokenizer`` as ``udic`` (default ``''``).
    :param user_dic_enc: passed to ``Tokenizer`` as ``udic_enc`` (default ``'utf8'``).
    """
    self._t = Tokenizer(udic=user_dic_path, udic_enc=user_dic_enc)
def __init__(self):
    """Create the tokenizer, then run ``routine()``."""
    self.t = Tokenizer()
    # routine() is defined elsewhere on the class; it runs during construction.
    self.routine()
# coding: utf-8
from janome.tokenizer import Tokenizer
import os, re, json, random

dict_file = "./static/js/chatbot_data.json"
dic = {}
tokenizer = Tokenizer()


def make_sentence(head):
    """Generate a sentence from the word-chain dictionary, starting at ``head``.

    Returns an empty string when ``head`` is not in the dictionary.  The head
    itself is emitted unless it is the ``"@"`` marker.  Words are then chained
    until a sentence terminator or a dead end (empty word) is reached.
    """
    if head not in dic:
        return ""
    words = [] if head == "@" else [head]
    top = dic[head]
    w1 = word_choice(top)
    w2 = word_choice(top[w1])
    words.extend((w1, w2))
    while True:
        has_successor = w1 in dic and w2 in dic[w1]
        w3 = word_choice(dic[w1][w2]) if has_successor else ""
        words.append(w3)
        if w3 in ("。", "?", ""):
            break
        w1, w2 = w2, w3
    return "".join(words)


def word_choice(sel):
    """Return a random key of the mapping ``sel``."""
    return random.choice(list(sel.keys()))
# Input text (Shift_JIS) and the cached dictionary built from it.
sjis_file = 'kokoro.txt.sjis'
dic_file = 'markov-kokoro.json'

if not os.path.exists(dic_file):
    # Build the dictionary file (word-level n-gram information).
    with open(sjis_file, 'rb') as f:
        sjis = f.read()
    text = sjis.decode('shift_jis')
    # Remove the parts that are not body text.
    text = re.split(r'\-{5,}', text)[2]  # drop the header
    text = re.split(r'底本:', text)[0]  # drop the footer
    text = text.strip()
    text = text.replace('|', '').replace('|', '')  # drop the ruby start marker
    text = re.sub(r'《.+?》', '', text)  # drop the ruby
    # Drop editorial annotations.  The previous pattern r'[#.+?]' was a
    # character class that deleted every '#', '.', '+' and '?' instead of the
    # bracketed notes.
    text = re.sub(r'[#.+?]|\[#.+?\]', '', text)
    # Tokenize and build the dictionary file.
    t = Tokenizer()
    words = t.tokenize(text)
    dic = make_dic(words)
    with open(dic_file, 'w', encoding='utf-8') as f:
        json.dump(dic, f)
else:
    with open(dic_file, 'r', encoding='utf-8') as f:
        dic = json.load(f)

# Compose sentences.
for i in range(3):
    s = make_sentence(dic)
    print(s)
    print('---')
""" utils/text_tools.py テキスト処理のユーティリティ """ import re import unicodedata from django.core.validators import validate_email from janome.tokenizer import Tokenizer DEFAULT_TOKENIZER = Tokenizer() def shortnate(string, length): """ 文字列が既定の長さ以上だった場合に規定の長さまでで残りを省略とする """ return string if len(string) <= length else string[:length - 4] + '...' def get_words(text, customdict=None): """ 与えられたテキストを形態素解析して、含まれる名詞のリストを返す """ def _filter(s): """ 名詞だけにフィルタリングする """ reg = re.compile(r'名詞') ignore_reg = re.compile(r'非自立') if (reg.search(s.part_of_speech) and not ignore_reg.search(s.part_of_speech)): return True if customdict:
from janome.tokenizer import Tokenizer

# Tokenize a mixed English/Japanese sentence and print every token followed
# by its part-of-speech string.
t = Tokenizer()
tokens = t.tokenize("Hello World きょうはいい天気ですね。")
for tok in tokens:
    print(tok)
    # The attribute is ``part_of_speech``; the misspelt ``part_of_speach``
    # raised AttributeError on the first token.
    print(tok.part_of_speech)
# Message returned (as HTML) when the word has no entry.
_HONORIFIC_NOT_FOUND = '<font color="red">ご指定の語句には対応しておりません</font>'


def _lookup_honorifics(csv_path, headword):
    """Look ``headword`` up in an honorific table and return its three forms.

    The CSV has a header row and four columns: headword, respectful form,
    humble form and polite form.  Newlines inside cells become ``<br>``.

    :return: ``([respectful], [humble], [polite])`` or the not-found message.
    """
    table = pd.read_csv(csv_path, encoding='utf_8',
                        names=["見出し語", "尊敬語", "謙譲語", "丁寧語"],
                        usecols=[0, 1, 2, 3], skiprows=[0], skipfooter=0,
                        engine='python')
    table = table.replace({'\n': '<br>'}, regex=True)
    table = table.replace({'\r': ''}, regex=True)
    table = table[table['見出し語'] == headword]
    if table.empty:
        return _HONORIFIC_NOT_FOUND

    def cell(column):
        return (table[column].to_string(index=False)
                .replace("\n", "").replace("NaN", ""))

    # Only the respectful form additionally has apostrophes stripped.
    return [cell("尊敬語").replace("'", "")], [cell("謙譲語")], [cell("丁寧語")]


def doushi(honorific):
    """Return the respectful, humble and polite forms for ``honorific``.

    Only the first token of the input decides the lookup: a verb, noun or
    particle is looked up by its surface in the matching table.  For any
    other part of speech the whole input is looked up in the verb table
    without relying on the tokenizer.

    :param honorific: the word entered by the user.
    :return: ``([respectful], [humble], [polite])`` on success, otherwise an
        HTML error message string.  Empty input, which used to fall through
        and return ``None``, now yields the error message as well.
    """
    table_for_pos = {
        "動詞": 'doushi.csv',
        '名詞': 'meishi.csv',
        '助詞': 'zyoshi.csv',
    }
    for token in Tokenizer().tokenize(honorific):
        # Top-level part of speech of the first token.
        part_of_speech = token.part_of_speech.split(',')[0]
        if part_of_speech in table_for_pos:
            return _lookup_honorifics(table_for_pos[part_of_speech],
                                      token.surface)
        if honorific:
            return _lookup_honorifics('doushi.csv', honorific)
        return _HONORIFIC_NOT_FOUND
    return _HONORIFIC_NOT_FOUND
# -*- coding: utf-8 -*- # Copyright [2015] [moco_beta] # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from janome.tokenizer import Tokenizer if __name__ == '__main__': import sys text = ''.join(sys.argv[1:]) t = Tokenizer() tokens = t.tokenize(text) for token in tokens: print(token)
from janome.tokenizer import Tokenizer # 形態素解析をしてください tokenizer = Tokenizer() tokens = tokenizer.tokenize("明日は晴れるだろうか。") for token in tokens: print(token) print()
class Mor_analysis:
    """Tokenize a sentence with janome and rebuild it with substitute words.

    The class relies on names defined elsewhere in this module:
    ``Tokenizer``, ``ok_words``, ``rhymes`` and ``random``.
    """

    def __init__(self, text):
        """Tokenize ``text`` and print every resulting token.

        Args:
            text: The sentence to analyse.
        """
        self.text = text
        self.t = Tokenizer()
        # Materialise the tokens as a list. Recent janome releases return a
        # generator from tokenize(); the debug loop below would exhaust it and
        # the index assignments in divide_pos()/make_text() would fail.
        self.tokens = list(self.t.tokenize(self.text))
        for token in self.tokens:
            print(token)

    def divide_pos(self):
        """Split the tokens by part of speech.

        Tokens whose top-level part of speech is listed in ``ok_words`` are
        collected. Every other slot of ``self.tokens`` is overwritten in
        place with that token's base form (a plain string).

        Returns:
            A tuple ``(accept, accept_index, accept_attr)``: the accepted base
            forms, their positions in ``self.tokens``, and a dict mapping each
            accepted base form to its part of speech.
        """
        accept = []
        accept_attr = {}   # base form -> part of speech
        accept_index = []  # positions inside self.tokens
        for i, token in enumerate(self.tokens):
            part_of_speech = token.part_of_speech.split(",")[0]
            base_form = token.base_form
            if part_of_speech in ok_words:
                accept.append(base_form)
                accept_attr[base_form] = part_of_speech
                accept_index.append(i)
            else:
                self.tokens[i] = base_form
        print(accept, accept_index)
        return accept, accept_index, accept_attr

    def divide_attribute(self, attr_list, target):
        """Keep only the words that share the given part of speech.

        The words are joined with spaces, tokenized again, and filtered.

        Args:
            attr_list: Iterable of candidate words (strings).
            target: Top-level part of speech to keep.

        Returns:
            List of base forms whose top-level part of speech equals
            ``target``.
        """
        joined = " ".join(attr_list)
        return [
            token.base_form
            for token in self.t.tokenize(joined)
            if token.part_of_speech.split(",")[0] == target
        ]

    def make_text(self):
        """Generate and return the rewritten text.

        Each accepted word is replaced by a random candidate taken from
        ``rhymes(word)`` that has the same part of speech.

        Returns:
            The rewritten sentence, or ``""`` when nothing was replaced, when
            the result equals the input text, or when the tokens cannot be
            joined.
        """
        accept, accept_index, accept_attr = self.divide_pos()
        print(accept, accept_index)
        changed = False  # becomes True once at least one word is replaced
        for word, position in zip(accept, accept_index):
            candidates = self.divide_attribute(rhymes(word), accept_attr[word])
            if candidates:
                changed = True
                self.tokens[position] = random.choice(candidates)
        try:
            res = "".join(self.tokens)
        except TypeError:
            # A slot still holding a token object (an accepted word for which
            # no candidate was found) cannot be joined as a string.
            return ""
        if not changed or res == self.text:
            return ""
        print(self.tokens)
        return res
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
import normalize_neologd as nn
from range_date import range_date
import sys
import re
# import os

# Python 2 idiom: re-expose setdefaultencoding and make UTF-8 the default.
reload(sys)
sys.setdefaultencoding('utf-8')

t = Tokenizer()


class newsWords:
    '''
    Normalize the given news article and split it into words.
    noun: list of nouns
    meta: words related to dates
    '''

    def __init__(self, text):
        # Normalize first, then split into words with the module-level helper
        # _getWords (defined outside this excerpt).
        self.text = nn.normalize_neologd(text)
        self.words = _getWords(self.text)

    def __str__(self):
        # self.words[0] is joined as the noun list; self.words[1] is iterated
        # as a mapping and rendered as "key:value" pairs.
        nouns = ','.join(self.words[0])
        meta = self.words[1]
        meta_pairs = [str(key) + ':' + meta[key] for key in meta]
        return '[%s] noun: %s / meta: %s' % (self.text, nouns, ','.join(meta_pairs))
# Script fragment (Python 2 syntax): echoes `line`, extracts the text that
# follows "screen_name", then tokenizes the line with janome and prints the
# first space-separated field of every token next to that extracted text.
# NOTE(review): `line` comes from surrounding code that is not visible here,
# and the original indentation was lost -- confirm the nesting below.
print (line)
try:
    # Take whatever follows the first "screen_name" marker, up to the next
    # space. tmp[1] raises IndexError when the marker is absent.
    tmp = line.split("screen_name")
    # print tmp[1]
    tmpstr = str(tmp[1])
    tmp = tmpstr.split(" ")
    print tmp[0]
    sc = tmp[0]
except:
    # NOTE(review): bare except silently ignores every error; in that case
    # `sc` is not (re)assigned by this fragment.
    pass
try:
    t = Tokenizer()
    # The line is decoded from UTF-8 before being tokenized.
    tokens = t.tokenize(line.decode("utf-8"))
    for token in tokens:
        # Keep only the first space-separated field of the token's string form.
        tmpstr = str(token)
        print "tmpstr:" + tmpstr
        tmp = tmpstr.split(" ")
        print sc + ":" + tmp[0]
        # print tmp[0] + ":" + tmp2[0]
        # print token
except:
    # NOTE(review): bare except also hides a NameError raised when `sc` was
    # never assigned above.
    pass
class NaiveBayes(object):
    """
    NaiveBayes Classifier.
    use sklearn.naive_bayes.MultinomialNB

    << preprocessor >>
    corpus => NaiveBayes#tokenizer
    NaiveBayes#tokenizer := word => {Word segmentation}token

    □model training
    token => vectorizer#fit_transform => model#fit

    □predict
    token => vectorizer#transform => model#predict
    """

    def __init__(self, skip_tokenize=5):
        """Build the TF-IDF + classifier pipeline and fit it on the corpus.

        @param {int} skip_tokenize number of initial tokenizer calls that
            return their input without morphological analysis
        """
        self.skip_tokenize = skip_tokenize
        self.skip_count = 0
        self.t = Tokenizer()
        self.pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(tokenizer=self.tokenizer)),
            ('classifier', self.create_Model())])

        # Each corpus row is unpacked into (text, label).
        corpus = Serializer.load_csv('../resource/corpus.tsv')
        self.data, target = zip(*corpus)
        self.labels = np.array(target, dtype=np.uint8, ndmin=1)
        # Not populated here; predict_all() indexes it, so it has to be
        # assigned before predict_all() is called.
        self.human_labels = None
        self.pipeline.fit(self.data, self.labels)
        #logger.debug(self.vectorizer.get_feature_names())

    def loadData(self):
        """Placeholder; does nothing."""
        pass

    def tokenizer(self, word):
        """
        caller fit_transform / transform

        The first ``skip_tokenize`` calls yield ``word`` unchanged. Later
        calls yield a single string: the surfaces of the noun tokens joined
        with spaces.

        @param {string} word
        @yield {string} result := token | space | token
        """
        if self.skip_count < self.skip_tokenize:
            self.skip_count += 1
            yield word
            return

        # Keep nouns only. The previous code used `pass` where `continue` was
        # intended, so tokens of every part of speech were kept.
        nouns = [
            token.surface
            for token in self.t.tokenize(word)
            if str(token.part_of_speech).startswith('名詞')
        ]
        yield " ".join(nouns)

    @property
    def model(self):
        """
        @return {Classifier} the 'classifier' step of the pipeline
        """
        return self.pipeline.named_steps['classifier']

    @property
    def vectorizer(self):
        """
        @return {Vectorizer} the 'vectorizer' step of the pipeline
        """
        return self.pipeline.named_steps['vectorizer']

    def predict(self, x):
        """
        predict params x

        A plain string is vectorized first; any other value is handed to the
        model unchanged.

        @param {string},{np.array} x
        @return predicted
        """
        if isinstance(x, str):
            x = self.vectorizer.transform([x])
        #logger.debug(self.model.predict_proba(x))
        return self.model.predict(x)

    def predict_all(self, x_list):
        """
        mapping

        Predicts every item and maps the predicted label to its entry in
        ``self.human_labels`` (looked up by ``str(label)``).

        @param {list} x_list
        @return {list} value
        """
        result = []
        for x in x_list:
            predicted = self.predict(x)[0]
            value = self.human_labels[str(predicted)]
            result.append(value)
            logger.debug('%s -> 推定: %s', x, value)
        assert len(result) == len(x_list)
        return result

    def model_validation(self):
        """Cross-validate the current model, grid-search alpha / fit_prior,
        then cross-validate a fresh model built with the best parameters."""
        x_train = self.vectorizer.fit_transform(self.data)
        validator = ModelValidator(x_train, self.labels)
        validator.cross_validation(self.model)

        params = {}
        params['alpha'] = np.arange(0.01, 3., step=0.01, dtype=np.float64)
        #params['alpha'] = np.logspace(-1, 2, 30)
        params['fit_prior'] = [True, False]
        best_params = validator.search_BestParameter(self.create_Model(), params)
        validator.cross_validation(self.create_Model(best_params))

    def create_Model(self, params=None):
        """Create a MultinomialNB (alpha=1), optionally overriding parameters.

        @param {dict} params keyword parameters forwarded to set_params
        @return {Classifier}
        """
        # alpha is passed by keyword: newer scikit-learn releases only accept
        # estimator parameters as keyword arguments.
        model = MultinomialNB(alpha=1)
        #model = LinearSVC(C=0.1)
        if params is not None:
            model.set_params(**params)
        return model
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
from janome.dic import UserDictionary
import sysdic

# Each entry: (banner, source CSV, dictionary type, directory to save into).
_DEMOS = (
    ('Compile user dictionary (MeCab IPADIC format)',
     "user_ipadic.csv", "ipadic", "/tmp/userdic"),
    ('Compile user dictionary (simplified format)',
     "user_simpledic.csv", "simpledic", "/tmp/userdic_simple"),
)

for position, (banner, csv_path, dict_type, save_dir) in enumerate(_DEMOS):
    if position:
        # Blank line between the two demonstrations.
        print('')
    print(banner)
    # Compile the CSV into a user dictionary, save it, then tokenize the
    # sample sentence with a tokenizer that loads the saved dictionary.
    user_dict = UserDictionary(csv_path, "utf8", dict_type, sysdic.connections)
    user_dict.save(save_dir)
    t = Tokenizer(save_dir)
    for token in t.tokenize(u'東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便 利です。'):
        print(token)
# "How to build an artificial intelligence", written for Python beginners
# http://sandmark.hateblo.jp/entry/2017/10/07/141339
# Studying by transcribing the code published in that article.
import re
from janome.tokenizer import Tokenizer

TOKENIZER = Tokenizer()


def analyze(text):
    """Run morphological analysis on text and return [(surface, parts)]."""
    pairs = []
    for token in TOKENIZER.tokenize(text):
        pairs.append((token.surface, token.part_of_speech))
    return pairs


def is_keyword(part):
    """Return a boolean: is the part of speech `part` a keyword to learn?"""
    # Keywords are nouns of one of the sub-categories listed in the pattern.
    matched = re.match(r'名詞,(一般|代名詞|固有名詞|サ変接続|形容動詞語幹)', part)
    return matched is not None