print("おそらく書き込みは完了") """ print("データを解析中だと考える。") #加工データを変数に入れて、その変数を単語頻度のjanomeに投げるやつ。 #変数はsとする #makefile2は任意に入力したtextデータタイトルをテキストデータとしての文字列に変更したもの makefile = "パリ" makefile2 = ""+ makefile + ".txt" f = open(makefile2) s = f.read() f.close() a = Analyzer(token_filters=[POSKeepFilter(['名詞']), TokenCountFilter()]) g_count = a.analyze(s) #リスト化させる。 l_count = list(a.analyze(s)) #print(type(g_count)) #print(type(l_count)) # <class 'generator'> #全て表記させる。 #for i in g_count: # print(i) """ print("sort前") print(l_count)
if __name__ == '__main__':
    f_name = 'full_conv.txt'
    path = 'data/' + f_name
    texts = [l.strip() for l in open(path, 'r', encoding='utf8') if l != '\n']
    flag = sys.argv[1]
    print(flag)
    if flag == 'janome':
        # tokenizer = Tokenizer(mmap=True)
        tokenizer = Tokenizer()
        char_filters = [UnicodeNormalizeCharFilter()]
        token_filters = [LowerCaseFilter(), ExtractAttributeFilter(att='surface')]
        analyzer = Analyzer(char_filters, tokenizer, token_filters)
        comp = []
        for l in texts:
            sn = [token for token in analyzer.analyze(l)]
            sn = ['<start>'] + sn + ['<end>']
            comp.append(sn)
        del analyzer
        del token_filters
        del char_filters
        del tokenizer
    else:
        mpath = 'models/sentensepice'
        template = '--input=%s --model_prefix=%s --vocab_size=8000'
        spm.SentencePieceTrainer.train(template % (path, mpath))
        sp = spm.SentencePieceProcessor()
        sp.load(mpath + '.model')
        comp = []
        for l in texts:
text = re.sub(r'([.*?])|(《.*?》)', '', text)
# 神の存在、及び人間の霊魂と肉体との区別を論証する、第一哲学についての省察 ...

# pip install Janome
from janome.analyzer import Analyzer
from janome.charfilter import UnicodeNormalizeCharFilter
from janome.tokenizer import Tokenizer
from janome.tokenfilter import POSKeepFilter, ExtractAttributeFilter

analyzer = Analyzer(
    [UnicodeNormalizeCharFilter()],
    Tokenizer(),
    [POSKeepFilter(['名詞', '動詞', '形容詞', '副詞']),
     ExtractAttributeFilter('base_form')]
)
tokens = [token for token in analyzer.analyze(text)]
# ['神', '存在', '人間', '霊魂', '肉体', ... ]

with open('./input.txt', 'w') as f:
    f.write(' '.join(tokens))

with open('./vectors.txt', 'r') as original, open('./gensim_vectors.txt', 'w') as transformed:
    vocab_count = vectors.shape[0]  # number of words
    size = vectors.shape[1]         # number of dimensions
    transformed.write(f'{vocab_count} {size}\n')
    transformed.write(original.read())  # copy the remaining lines unchanged

from gensim.models import KeyedVectors

glove_vectors = KeyedVectors.load_word2vec_format('./gensim_vectors.txt', binary=False)
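# A short usage sketch for the vectors loaded above, using gensim's KeyedVectors API;
# the query word '神' is just an illustration taken from the token list.
print(glove_vectors.most_similar('神', topn=5))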
###############################ETL#################################
text = tweet.copy()

char_filters = [UnicodeNormalizeCharFilter()]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), LowerCaseFilter()]  # POSKeepFilter(['名詞','形容詞']),
a = Analyzer(char_filters, tokenizer, token_filters)

tdesc = []
for i in range(len(text)):
    newsen = ''
    #mySent = re.sub('[?•()()_→【】|...”「、>:」!,."...%*-]', ' ', text[i])
    mySent = text[i]
    #mySent = mySent.replace('?',' ')
    try:
        sen = mySent.strip()
        tokens = a.analyze(sen)  # ,wakati=True)
        for j in tokens:
            if ('\\' not in j.surface) and ('/' not in j.surface) and ('@' not in j.surface):
                newsen = newsen + cutwords(j.surface) + ' '
        newsen = re.sub('[@=#¥~^<。$;+⇒•()()_→【】{}|...”「、>:」!,."...%*-]', '', newsen)
        newsen = newsen.replace('?', ' ').replace('[', ' ').replace(']', ' ')
        newsen = re.sub(r'[0-9]+', ' ', newsen)
        tdesc.append(newsen)
        print(newsen)
    except:
        print('abnormal!')

text = tweet1.copy()
char_filters = [UnicodeNormalizeCharFilter()]
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import *

f = open("AbeShinzo.csv", "r", encoding='utf-8')  # wadamasamune.csv AbeShinzo.text
tweet = f.read()

token_filters = [POSKeepFilter('名詞'), TokenCountFilter()]
a = Analyzer(token_filters=token_filters)
for k, v in a.analyze(tweet):
    if v >= 1:
        print("%s: %d" % (k, v))
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *
import logging

logging.basicConfig(level='INFO')

print(u'Analyzer example:')
text = u'蛇の目はPure Pythonな形態素解析器です。'
char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(u'蛇の目', u'janome')]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), LowerCaseFilter()]
a = Analyzer(char_filters, tokenizer, token_filters)
for token in a.analyze(text):
    print(token)

print('')
print(u'Analyzer example: Count nouns with POSKeepFilter and TokenCountFilter')
text = u'すもももももももものうち'
token_filters = [POSKeepFilter(u'名詞'), TokenCountFilter()]
a = Analyzer(token_filters=token_filters)
for k, v in a.analyze(text):
    print('%s: %d' % (k, v))
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import *
from janome.charfilter import *
import pickle

with open("the_night_of_the_milky_way_train.pickle", mode="rb") as f:
    milky = pickle.load(f)

t = Tokenizer()
for token in t.tokenize(milky):
    print(token)

part_of_speech = '名詞'
char_filters = [
    UnicodeNormalizeCharFilter(),
    RegexReplaceCharFilter(r"[IiⅠi?.*/~=()〝 <>::《°!!!?()-]+", "")
]
token_filters = [
    POSKeepFilter([part_of_speech]),
    POSStopFilter([]),
    LowerCaseFilter()
]
analyzer = Analyzer(char_filters, t, token_filters)

noun_list = [token.surface for token in analyzer.analyze(milky)]
print(noun_list)
print(len(noun_list))
# ['みなさん', 'ふう', '川', '乳', 'あと', 'ぼんやり', 'もの', 'ほんとう', '何', '承知', '先生', '黒板', '星座', '図', '上', '下', '銀河', '帯', 'よう', 'ところ', 'みんな', '問', 'カムパネルラ', '手', 'それ', '四', '五', '人', '手', 'ジョバンニ', '手', 'あれ', 'みんな', '星', 'いつか', '雑誌', 'の', 'このごろ', 'ジョバンニ', '毎日', '教室', '本', 'ひま', '本' ...]
# 5895
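# An optional follow-up sketch: count how often each noun appears, using the standard
# library's Counter on the noun_list built above.
from collections import Counter

noun_counts = Counter(noun_list)
print(noun_counts.most_common(10))  # the ten most frequent nouns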
print(sentences[i])
# 転職 Advent Calendar 2016 - Qiitaの14日目となります。 少しポエムも含みます。
# 今年11月にSIerからWebサービスの会社へ転職しました。

# Build the morphological analyzer
analyzer = Analyzer(
    [UnicodeNormalizeCharFilter(),
     RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],  # replace ()「」、。 with spaces
    JanomeTokenizer(),
    [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
     ExtractAttributeFilter('base_form')]  # keep only base forms of nouns, adjectives, adverbs and verbs
)

# Join the extracted words with spaces.
# The trailing '。' is added so tinysegmenter can split the text back into sentences later.
corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]
for i in range(2):
    print(corpus[i])
# 転職 Advent Calendar 2016 - Qiita 14 日 目 なる 少し ポエム 含む。
# 今年 11 月 SIer Web サービス 会社 転職 する。

# Tokenize the joined corpus again with tinysegmenter
parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))

# Extract a two-sentence summary with LexRank
summarizer = LexRankSummarizer()
summarizer.stop_words = [' ']  # a space is recognized as a word, so exclude it via the stop words
summary = summarizer(document=parser.document, sentences_count=2)
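# A minimal sketch of printing the two sentences selected above; sumy returns Sentence
# objects, so str() is used to get their text.
for sentence in summary:
    print(str(sentence))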
# Token-level processing
token_filters = [
    NumericReplaceFilter(),                            # replace numbers (incl. kanji numerals) in nouns with 0
    CompoundNounFilter(),                              # merge consecutive nouns into compound nouns
    POSKeepFilter(['名詞', '動詞', '形容詞', '副詞']),  # keep only nouns, verbs, adjectives and adverbs
    LowerCaseFilter(),                                 # lower-case alphabetic characters
    OneCharacterReplaceFilter()                        # drop single-character hiragana/katakana/alphanumerics
]

analyzer = Analyzer(char_filters, tokenizer, token_filters)

tokens_list = []
raw_texts = []
for text in texts:
    # split the sentence and normalize each word
    text_ = [token.base_form for token in analyzer.analyze(text)]
    if len(text_) > 0:
        tokens_list.append([token.base_form for token in analyzer.analyze(text)])
        raw_texts.append(text)

# original texts kept after dropping sentences that normalize to nothing
raw_texts = [text_ + '\n' for text_ in raw_texts]
with open(data_dir_path.joinpath(file_name.replace('.txt', '_cut.txt')), 'w', encoding='utf-8') as file:
    file.writelines(raw_texts)

# build the word list
words = []
for text in tokens_list:
# と     助詞,並立助詞,*,*,*,*,と,ト,ト
# パイソン 名詞,一般,*,*,*,*,パイソン,*,*
# </     名詞,サ変接続,*,*,*,*,</,*,*
# div    名詞,一般,*,*,*,*,div,*,*
# >      名詞,サ変接続,*,*,*,*,>,*,*

char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter('<.*?>', '')]
token_filters = [POSKeepFilter(['名詞']), LowerCaseFilter(), ExtractAttributeFilter('surface')]
a = Analyzer(char_filters=char_filters, token_filters=token_filters)
for token in a.analyze(s):
    print(token)
# python
# python
# パイソン
# パイソン

s = '自然言語処理による日本国憲法の形態素解析'
for token in t.tokenize(s):
    print(token)
# 自然    名詞,形容動詞語幹,*,*,*,*,自然,シゼン,シゼン
# 言語    名詞,一般,*,*,*,*,言語,ゲンゴ,ゲンゴ
# 処理    名詞,サ変接続,*,*,*,*,処理,ショリ,ショリ
# による  助詞,格助詞,連語,*,*,*,による,ニヨル,ニヨル
# 日本国  名詞,固有名詞,地域,国,*,*,日本国,ニッポンコク,ニッポンコク
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

# initialize janome and its filters
tokenizer = Tokenizer()
char_filters = [UnicodeNormalizeCharFilter()]
token_filters = [CompoundNounFilter(), POSStopFilter("助詞"), LowerCaseFilter()]
a = Analyzer(char_filters, tokenizer, token_filters)

# process the Twitter data
for token in a.analyze("今日はとても暑いけれども虹がとても綺麗である."):
    print(token)
    if token.surface == "虹":
        print("found a rainbow")
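# An alternative sketch: appending ExtractAttributeFilter('surface') makes analyze()
# yield plain strings, so the rainbow check no longer needs token.surface.
token_filters2 = [CompoundNounFilter(), POSStopFilter("助詞"), LowerCaseFilter(),
                  ExtractAttributeFilter('surface')]
a2 = Analyzer(char_filters, tokenizer, token_filters2)
for surface in a2.analyze("今日はとても暑いけれども虹がとても綺麗である."):
    print(surface)
    if surface == "虹":
        print("found a rainbow")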
class TestFilters(unittest.TestCase):

    def setUp(self):
        #aliases = get_word_aliases()
        char_filters = [
            UnicodeNormalizeCharFilter(),
            RegexReplaceCharFilter('&[^&]+;', '')
        ]
        tokenizer = Tokenizer(mmap=True)
        token_filters = [
            FootballCompoundNounFilter(),
            FootballNounFilter(),
            POSKeepFilter('名詞')
        ]
        self.analyzer = Analyzer(char_filters, tokenizer, token_filters)

    def test_CompoundNounFilter1(self):
        # test for "本田圭佑"
        content = "日本代表・本田圭佑はW杯で最もスタイリッシュ"
        tokens = self.analyze(content)
        self.assertTrue('本田圭佑' in tokens)

    def test_CompoundNounFilter2(self):
        # test for "さん"
        content = "【朗報】 本田圭佑さん、2試合で1ゴール1アシスト"
        tokens = self.analyze(content)
        self.assertTrue('本田圭佑' in tokens)

    def test_CompoundNounFilter3(self):
        # test for "ら"
        content = '親交の深いMF香川真司らの得点で勝利を挙げた日本の戦いぶりには、試合はFIFAのジャンニ・インファンティノ会長らも座るVIP席で観戦。'
        tokens = self.analyze(content)
        self.assertTrue('香川真司' in tokens)
        self.assertFalse('香川真司ら' in tokens)
        self.assertTrue('ジャンニ・インファンティノ' in tokens)
        self.assertFalse('ジャンニ・インファンティノ会長ら' in tokens)

    def test_CompoundNounFilter4(self):
        # test for "ら"
        content = 'コーフェルト監督自らが直接口説き落とした大迫'
        tokens = self.analyze(content)
        self.assertTrue('コーフェルト監督' in tokens)

    def test_CompoundNounFilter5(self):
        # test for "ら"
        content = 'ジェノアの監督ダビデ・バッラルディーニは憮然とした表情で地元紙のインタビューに応えた“超上から目線”'
        tokens = self.analyze(content)
        self.assertTrue('ダビデ・バッラルディーニ' in tokens)

    def test_CompoundNounFilter6(self):
        # test for "ら"
        content = '南仏エクス・オン・プロバンス生まれのミシェルと、生粋のブルターニュ人'
        tokens = self.analyze(content)
        self.assertTrue('エクス・オン・プロバンス' in tokens)

    def analyze(self, content, debug=False):
        tokens = list(self.analyzer.analyze(content))
        retval = {}
        for token in tokens:
            if debug:
                print(token)
            retval[token.base_form] = token
        return retval
    Articles.feed_id == Feeds.id,
    ArticleContents.token_extracted == False,
    ArticleContents.extracted_content != None,
    Feeds.language == 'ja').order_by(
        ArticleContents.id)  # .limit(100).offset(400)

dbtokens = {}
count = 1
total = article_contents.count()
print('There are %d contents to process' % total)
try:
    for article_content in article_contents:
        print(' %7d / %d, %s' % (count, total, article_content.article_hash))
        content = article_content.extracted_content
        words = []
        tokens = list(analyzer.analyze(content))
        for token in tokens:
            dbtoken = None
            if token.base_form in dbtokens:
                dbtoken = dbtokens[token.base_form]
            elif not args.renew:
                dbtoken = session.query(Tokens).filter(
                    Tokens.base_form == token.base_form).one_or_none()
            if dbtoken:
                dbtoken.occurrence_count = dbtoken.occurrence_count + 1
            else:
                pos = token.part_of_speech.split(",")
                dbtoken = Tokens()
                dbtoken.base_form = token.base_form
                dbtoken.part_of_speech1 = None if pos[0] == '*' else pos[0]
def separate(path, review_type, noun=True, verb=True, adj=True, adv=True):
    '''
    Split the text at `path` into words and write them under ./review_{review_type}_separated.
    '''
    data_dir_path = Path('.')
    corpus_dir_path = Path('.')
    title = path[-14:-4]

    with open(data_dir_path.joinpath(path), 'r', encoding='utf-8') as file:
        texts = file.readlines()
    texts = [text_.replace('\n', '') for text_ in texts]

    # janome's Analyzer lets us split sentences and normalize words in one pass.

    # Sentence-level processing
    char_filters = [
        UnicodeNormalizeCharFilter(),  # normalize Unicode with NFKC (the default)
        RegexReplaceCharFilter(r'\(', ''),
        RegexReplaceCharFilter(r'\)', ''),
        RegexReplaceCharFilter(r'\!', ''),
        RegexReplaceCharFilter(r'\!', ''),
        RegexReplaceCharFilter(r'\?', ''),
        RegexReplaceCharFilter(r'\.', ''),
        RegexReplaceCharFilter(r'\^', ''),
        RegexReplaceCharFilter(r'\-', ''),
        RegexReplaceCharFilter(r'\.', ''),
    ]

    # Split into words
    tokenizer = Tokenizer()

    #
    # TokenFilter that replaces any number inside a noun (including kanji numerals) with 0
    #
    class NumericReplaceFilter(TokenFilter):
        def apply(self, tokens):
            for token in tokens:
                parts = token.part_of_speech.split(',')
                if parts[0] == '名詞' and parts[1] == '数':
                    token.surface = '0'
                    token.base_form = '0'
                    token.reading = 'ゼロ'
                    token.phonetic = 'ゼロ'
                yield token

    #
    # Drop words that consist of a single hiragana, katakana or alphanumeric character
    #
    class OneCharacterReplaceFilter(TokenFilter):
        def apply(self, tokens):
            for token in tokens:
                # skip the word if it matches the single-character rule above
                if re.match('^[あ-んア-ンa-zA-Z0-9ー]$', token.surface):
                    continue
                yield token

    # keep only the parts of speech selected by the arguments
    filter_list = []
    if noun:
        filter_list.append('名詞')
    if verb:
        filter_list.append('動詞')
    if adj:
        filter_list.append('形容詞')
    if adv:
        filter_list.append('副詞')

    # Token-level processing
    token_filters = [
        NumericReplaceFilter(),      # replace numbers (incl. kanji numerals) in nouns with 0
        CompoundNounFilter(),        # merge consecutive nouns into compound nouns
        POSKeepFilter(filter_list),  # keep only the selected parts of speech
        POSStopFilter('記号'),        # drop symbols
        LowerCaseFilter(),           # lower-case alphabetic characters
        OneCharacterReplaceFilter()  # drop single-character hiragana/katakana/alphanumerics
    ]

    analyzer = Analyzer(char_filters=char_filters, tokenizer=tokenizer, token_filters=token_filters)

    tokens_list = []
    raw_texts = []
    for text in texts:
        # split the sentence and normalize each word
        text_ = [token.base_form for token in analyzer.analyze(text)]
        if len(text_) > 0:
            tokens_list.append([token.base_form for token in analyzer.analyze(text)])
            raw_texts.append(text)

    # build the word list
    words = []
    for text in tokens_list:
        words.extend([word + '\n' for word in text if word != ''])

    separated_path = f"./review_{review_type}_separated/{title}_separated.txt"
    with open(corpus_dir_path.joinpath(separated_path), 'w', encoding='utf-8') as file:
        file.writelines(words)
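# A hypothetical usage sketch for separate(): the file name and review_type below are
# made up. Note that title = path[-14:-4] assumes the path ends in something shaped like
# "2020-01-01.txt", and that ./review_hotel_separated/ already exists.
separate('./review_2020-01-01.txt', 'hotel')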
from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter, TokenCountFilter

text = 'すもももももももものうち'
token_filters = [POSKeepFilter('名詞'), TokenCountFilter()]
a = Analyzer(token_filters=token_filters)
for word, count in a.analyze(text):
    print(f'{word}: {count}')
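# A variant sketch: newer janome versions accept TokenCountFilter(sorted=True) to emit
# the (word, count) pairs ordered by count; this keyword is an assumption, so fall back
# to sorted(a.analyze(text), key=lambda x: x[1], reverse=True) if it is unavailable.
a_sorted = Analyzer(token_filters=[POSKeepFilter('名詞'), TokenCountFilter(sorted=True)])
for word, count in a_sorted.analyze(text):
    print(f'{word}: {count}')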
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import *

text = u'すもももももももものうち'
tokenizer = Tokenizer(mmap=True)
token_filters = [POSKeepFilter('名詞'), TokenCountFilter(att='base_form')]
a = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
for k, v in a.analyze(text):
    print('%s: %d' % (k, v))
import sys

from janome.analyzer import Analyzer
from janome.tokenfilter import *

msg = sys.argv[1]

filters = [POSKeepFilter(['名詞', '形容詞'])]
analyzer = Analyzer(token_filters=filters)
for t in analyzer.analyze(msg):
    print(
        f'phonetic = {t.phonetic}, reading = {t.reading}, surface = {t.surface}, '
        f'part_of_speech = {t.part_of_speech}, base_form = {t.base_form}, '
        f'infl_form = {t.infl_form}, infl_type = {t.infl_type}, node_type = {t.node_type}'
    )
    CompoundNounFilter(),
    POSKeepFilter(['名詞']),
    PartsOfSpeechFilter(['一般', '複合', '固有名詞']),
    LowerCaseFilter(),
    ExtractAttributeFilter('surface'),
    OneCharTokenFilter(),
]
tokenizer = Tokenizer("userdic.csv", udic_enc="utf8")
analyzer = Analyzer(char_filters, tokenizer, token_filters)

with open('input.csv', 'r') as f:
    """
    Read the CSV file, run morphological analysis and extract only the nouns.
    """
    reader = csv.reader(f)
    for row in reader:
        text = row[0]
        nouns = [surface for surface in analyzer.analyze(text)]
        all_nouns = np.hstack((all_nouns, nouns))

"""
Count the occurrences of each noun and write them to a CSV file.
"""
reshaped = np.vstack((all_nouns, np.ones(all_nouns.shape[0]))).transpose()
df = pd.DataFrame({'name': all_nouns, 'count': np.ones(all_nouns.shape[0])})
grouped = df.groupby('name')
grouped.sum().sort_values(['count'], ascending=False).to_csv("janome_result.csv")
tokenizer = Tokenizer()
token_filters = [NumericReplaceFilter(),
                 CompoundNounFilter(),
                 POSKeepFilter(['名詞', '動詞', '形容詞', '副詞']),
                 LowerCaseFilter(),
                 OneCharacterReplaceFilter()
                 ]
analyzer = Analyzer(char_filters, tokenizer, token_filters)

tokens_list = []
raw_texts = []
for text in texts:
    text_ = [token.base_form for token in analyzer.analyze(text)]
    if len(text_) > 0:
        tokens_list.append([token.base_form for token in analyzer.analyze(text)])
        raw_texts.append(text)

raw_texts = [text_ + '\n' for text_ in raw_texts]
with open(data_dir_path.joinpath(file_name.replace('.txt', '_cut.txt')), 'w', encoding='utf-8') as file:
    file.writelines(raw_texts)

words = []
for text in tokens_list:
    words.extend([word + '\n' for word in text if word != ''])

with open(corpus_dir_path.joinpath(file_name.replace('.txt', '_word_list.txt')), 'w', encoding='utf-8') as file:
    file.writelines(words)
#     _ret = None
#     for token in tokens:
#         if _ret:
#             if token.part_of_speech.startswith(u'名詞') and _ret.part_of_speech.startswith(u'名詞'):
#                 _ret.surface += token.surface
#                 _ret.part_of_speech = u'名詞,複合,*,*'
#                 _ret.base_form += token.base_form
#                 _ret.reading += token.reading
#                 _ret.phonetic += token.phonetic
#             else:
#                 ret = _ret
#                 _ret = token
#                 yield ret
#         else:
#             _ret = token
#     if _ret:
#         yield _ret
# CompoundNounFilter(TokenFilter).apply()

# token_filters = [CompoundNounFilter()]
# text = "多分タピオカ8万粒は食べてきた笑タピオカってなんなん#台湾#台湾旅行#台湾グルメ#九份#十分夜市#台北101#龍山寺#中正紀念堂#謝謝台湾 場所: 台北,台湾"
# a = Analyzer(token_filters=token_filters)

tokens = b.analyze(text)
word_list = []
for token in tokens:
    word = token.surface
    word_list.append(word)
words_wakati = " ".join(word_list)
print(word_list)
def main():
    file = open('similarity_grouping_result.csv', 'w')
    writer = csv.writer(file, lineterminator='\n')
    writer.writerow(['frequent_word', 'total_count', 'synonym'])

    df = pd.read_csv('janome_result.csv')
    keys = df['name'].ravel()

    char_filters = [UnicodeNormalizeCharFilter()]
    token_filters = [
        POSKeepFilter(['名詞']),
        PartsOfSpeechFilter(['固有名詞']),
        ExtractAttributeFilter('surface'),
        OneCharTokenFilter()
    ]
    analyzer = Analyzer(char_filters, Tokenizer("userdic.csv", udic_enc="utf8"), token_filters)

    print("Start. target keys: {}".format(len(keys)))

    # register every word in the synonym database
    for index, row in df.sort_values('count', ascending=False).iterrows():
        db.add(normalize('NFKC', row['name']))

    for index, row in df.iterrows():
        name = row['name']
        if name not in keys:
            continue

        # look for spelling variants and synonyms
        passed_list = []
        set_similarity_strings(passed_list, name)

        # to reduce misses, also pull in keys that contain one of the words as a substring
        for word in copy.copy(passed_list):
            if len(word) <= 3:
                continue
            for key in keys:
                if key.find(word) >= 0 and key not in passed_list:
                    passed_list.extend([key])

        total_count = df[df['name'].isin(passed_list)]['count'].sum()
        combined_passed_name = ':'.join(passed_list)

        # pick the most frequent word
        keywords = [surface for surface in analyzer.analyze(combined_passed_name)]
        frequent_word = {'key': 'No Key', 'count': 0}
        for key in keywords:
            if len(key) < 3:
                continue
            count = combined_passed_name.count(key)
            if count > frequent_word['count']:
                frequent_word['key'] = key
                frequent_word['count'] = count

        # print("Count: {}, Names: {}".format(total_count, combined_passed_name))
        writer.writerow([frequent_word['key'], total_count, combined_passed_name])
        keys = np.delete(keys, np.where(np.isin(keys, passed_list) == True))
        # print("Grouping keys... size: {}, keys: {}. Unpassed keys... size: {}".format(len(passed_list), passed_list, len(keys)))

    file.close()
    print("End")
import itertools
import re
from collections import Counter

from igraph import *
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import UnicodeNormalizeCharFilter
from janome.tokenfilter import CompoundNounFilter

min_freq = 4
filename = './sample.txt'
data = [l.replace('。', '。\n') for l in open(filename, 'r', encoding='utf-8')]
sentences = [re.sub(' ', '', u) for u in data if len(u) != 0]
print(len(sentences))

tokenizer = Tokenizer()
char_filters = [UnicodeNormalizeCharFilter()]
token_filters = [CompoundNounFilter()]  # , POSStopFilter(['記号']), LowerCaseFilter()]
analyzer = Analyzer(char_filters, tokenizer, token_filters)

dt = [analyzer.analyze(s) for s in sentences]
print(len(dt))
nouns = [[item.surface for item in t if '名詞' in item.part_of_speech] for t in dt]
pairlist = [list(itertools.combinations(ns, 2)) for ns in nouns if len(ns) >= 2]

all_pairs = []
for u in pairlist:
    all_pairs.extend(u)

pcount = Counter(all_pairs)
print('pair frequency', sorted(pcount.items(), key=lambda x: x[1], reverse=True)[:30])
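# A sketch of the likely next step, using the otherwise unused min_freq above: keep
# pairs that co-occur at least min_freq times and build an undirected co-occurrence
# graph with igraph's Graph.TupleList.
frequent_pairs = [pair for pair, freq in pcount.items() if freq >= min_freq]
graph = Graph.TupleList(frequent_pairs, directed=False)
print(graph.summary())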
nlp = NLP()
# print(nlp.analyze('日本経済新聞社によると、関ジャニ∞の渋谷君が千代田区のスペイン村で豪遊した帰りに、六本木ヒルズの無国籍レストランで地中海料理かフランス料理か日本料理か迷ったらしいんだけど。'))

while True:
    text = input('> ')
    if not text:
        break
    print()

    char_filters = [UnicodeNormalizeCharFilter()]
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize(text)
    print('morphological analysis')
    for token in tokens:
        print(token)

    token_filters = [CompoundNounFilter(), POSStopFilter(['記号']), LowerCaseFilter()]
    a = Analyzer(char_filters, tokenizer, token_filters)
    print('\nafter compound-noun processing')
    for token in a.analyze(text):
        print(token)

    tokens = [token for token in a.analyze(text)]
    scores = []
    for token in tokens:
        label, score = nlp.predict_emotion(token)
        if len(label) != 6:
            continue
        scores.append((label, score))

    scores = [np.array(item[1]) for item in scores]
    score = np.zeros(len(scores[0]))
    for s in scores:
        score += s
    score /= len(scores)
    print(score)
    chart(label, score)
def preprocess(all_df):
    # drop columns where every value is missing
    allnot_col = list(all_df.isnull().sum()[all_df.isnull().sum() == 19244].index)
    all_df = all_df.drop(columns=allnot_col)

    # drop constant columns that have no missing values
    one_col = list(all_df.nunique()[all_df.nunique() == 1].index)
    onefull_col = list(all_df[one_col].isnull().sum()[all_df[one_col].isnull().sum() == 0].index)
    all_df = all_df.drop(columns=onefull_col)

    # duplicated rows
    train = all_df[all_df["応募数 合計"].notnull()]
    test = all_df[all_df["応募数 合計"].isnull()]
    train['応募数mean'] = train.groupby(["お仕事No."])["応募数 合計"].transform("mean")
    test["応募数mean"] = np.nan
    all_df = pd.concat([train, test], ignore_index=True, sort=False)

    # keep the total duplicate count as a column as well
    all_df["all_count"] = all_df.groupby(["お仕事No."])["お仕事No."].transform("count")
    train = all_df[all_df["応募数mean"].notnull()]
    test = all_df[all_df["応募数mean"].isnull()]
    train = train.drop(columns=["応募数 合計"])
    test = test.drop(columns=["応募数 合計"])
    train = train.drop_duplicates(subset=["お仕事No."])
    all_df = pd.concat([train, test], ignore_index=True, sort=False)

    # add a column counting the missing values in each row
    all_df["NaN_num"] = all_df.isnull().sum(axis=1)

    # add indicator columns marking whether each value is missing
    no_df = pd.DataFrame(
        {
            "num": all_df.isnull().sum()[all_df.isnull().sum() > 0].values,
            "type": all_df[all_df.isnull().sum()[all_df.isnull().sum() > 0].index].dtypes
        },
        index=all_df.isnull().sum()[all_df.isnull().sum() > 0].index)
    for i in no_df.index:
        all_df["NaN_" + i] = np.where(all_df[i].isnull(), 1, 0)
    all_df = all_df.drop(columns=["(派遣先)概要 事業内容", "NaN_応募数mean"])

    # drop the female side of the gender-ratio columns
    all_df = all_df.drop(columns=["(派遣先)配属先部署 男女比 女", "NaN_(派遣先)配属先部署 男女比 女"])

    # impute missing values
    no_df2 = pd.DataFrame(
        {
            "num": all_df.isnull().sum()[all_df.isnull().sum() > 0].values,
            "type": all_df[all_df.isnull().sum()[all_df.isnull().sum() > 0].index].dtypes
        },
        index=all_df.isnull().sum()[all_df.isnull().sum() > 0].index)
    # numeric columns with missing values
    no_float_col = list(no_df2[no_df2["type"] != "object"].index)
    no_float_col.remove("応募数mean")
    # categorical columns with missing values
    no_obj_col = list(no_df2[no_df2["type"] == "object"].index)

    # columns treated as categorical are filled with "NA"
    cols = ["(紹介予定)入社後の雇用形態", "勤務地 最寄駅2(駅からの交通手段)", "勤務地 最寄駅1(駅からの交通手段)"]
    for col in cols:
        all_df[col] = all_df[col].fillna("NA")

    # columns treated as numeric are filled with -9999
    cols2 = [
        "(派遣先)配属先部署 男女比 男", "(派遣先)配属先部署 人数", "勤務地 最寄駅1(分)",
        "(派遣先)配属先部署 平均年齢", "給与/交通費 給与上限", "勤務地 最寄駅2(分)"
    ]
    for col in cols2:
        all_df[col] = all_df[col].fillna(-9999)

    # fill the remaining missing values with "NA"
    for col in no_obj_col[:-1]:
        all_df[col] = all_df[col].fillna("NA")

    # turn some numeric variables into categorical ones
    for col in ["フラグオプション選択", "職種コード", "会社概要 業界コード", "仕事の仕方", "勤務地 市区町村コード"]:
        all_df[col] = all_df[col].astype(str)

    # posting start date and work start date
    all_df['掲載期間 開始日'] = pd.to_datetime(all_df['掲載期間 開始日'], format="%Y/%m/%d")
    all_df['掲載期間 終了日'] = pd.to_datetime(all_df['掲載期間 終了日'], format="%Y/%m/%d")
    all_df['期間・時間 勤務開始日'] = pd.to_datetime(all_df['期間・時間 勤務開始日'], format="%Y/%m/%d")
    all_df["勤務開始-掲載開始"] = (all_df['期間・時間 勤務開始日'] - all_df['掲載期間 開始日'])
    all_df["勤務開始-掲載開始"] = all_df["勤務開始-掲載開始"].dt.days
    all_df = all_df.drop(columns=['掲載期間 開始日', "掲載期間 終了日", "期間・時間 勤務開始日"])

    # working hours
    all_df["workingstart"] = all_df["期間・時間 勤務時間"].str.split("〜", expand=True)[0]
    all_df["workingend"] = all_df["期間・時間 勤務時間"].str.split(
        "〜", expand=True)[1].str.split(" ", expand=True)[0]
    all_df["workingstart"] = pd.to_datetime(all_df['workingstart'], format='%H:%M')
    all_df["workingend"] = pd.to_datetime(all_df['workingend'], format='%H:%M')
    all_df["workingtime_m"] = (all_df["workingend"] - all_df["workingstart"]).astype('timedelta64[m]')
    all_df["workingrest"] = all_df["期間・時間 勤務時間"].str.split(
        "休憩", expand=True)[1].str.split("分", expand=True)[0].str.split("<BR>", expand=True)[0]
    all_df["workingrest"] = all_df["workingrest"].apply(lambda x: re.sub(r'\D', '', str(x)))
    all_df["workingrest"][all_df["workingrest"] == ""] = "0"
    all_df["workingrest"][all_df["workingrest"] == "1"] = "60"
    all_df["workingrest"][all_df["workingrest"] == "6090"] = "75"
    all_df["workingrest"][all_df["workingrest"] == "13301600"] = "0"
    all_df["workingrest"][all_df["workingrest"] == "10301330"] = "0"
    all_df["workingrest"] = all_df["workingrest"].apply(int)
    all_df["productiontime_m"] = (all_df["workingtime_m"] - all_df["workingrest"])  # .astype('timedelta64[m]')

    for i in list(all_df.dtypes[all_df.dtypes == "datetime64[ns]"].index):
        all_df[i] = all_df[i].astype(str)

    # drop 期間・時間 備考 now that its NaN_ indicator column has been added
    all_df = all_df.drop(columns=["期間・時間 備考"])

    # text processing
    token_df = pd.read_pickle("pickles/train_token.pickle")

    # お仕事名 (job title)
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r"[!$%&\'()*+,-./:;<=>?@\\^_`{|}~◆▼★②●☆■★【】『』「」、♪≪≫]", " ")
    ]
    token_filters = [CompoundNounFilter(), POSKeepFilter(['名詞']), LowerCaseFilter()]
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    length = all_df["応募数mean"].notnull().sum()
    all_df['お仕事名_token'] = np.nan
    all_df['お仕事名_token'][:length] = token_df["お仕事名_token"]
    all_df['お仕事名_token'][length:] = all_df["お仕事名"][length:].apply(
        lambda x: " ".join([token.surface for token in a.analyze(x)]))
    #all_df['お仕事名_token'] = token_df["お仕事名_token"]
    with open("pickles/grid_1.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["お仕事名_pred"] = model.predict(all_df['お仕事名_token'].values)
    all_df = all_df.drop(columns=['お仕事名_token'])

    # 仕事内容 (job description)
    select_conditions = ['名詞']
    tagger = MeCab.Tagger('')
    tagger.parse('')

    def wakati_text(text):
        node = tagger.parseToNode(text)
        terms = []
        while node:
            term = node.surface
            pos = node.feature.split(',')[0]
            if pos in select_conditions:
                terms.append(term)
            node = node.next
        text_result = ' '.join(terms)
        return text_result

    length = all_df["応募数mean"].notnull().sum()
    all_df['仕事内容_token'] = np.nan
    all_df['仕事内容_token'][:length] = token_df["仕事内容_token"]
    all_df['仕事内容_token'][length:] = all_df["仕事内容"][length:].apply(wakati_text)
    #all_df['仕事内容_token'] = token_df["仕事内容_token"]
    with open("pickles/grid_2.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["仕事内容_pred"] = model.predict(all_df['仕事内容_token'])
    all_df = all_df.drop(columns=['仕事内容_token'])

    # お仕事のポイント(仕事PR) (job selling points)
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r"[!$%&\'()*+,-./:;<=>?@\\^_`{|}~◆▼★②●☆■★【】『』「」、♪≪≫]", " ")
    ]
    token_filters = [CompoundNounFilter(), POSKeepFilter(['名詞']), LowerCaseFilter()]
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    length = all_df["応募数mean"].notnull().sum()
    print(length)
    all_df['お仕事のポイント_token'] = np.nan
    all_df['お仕事のポイント_token'][:length] = token_df["お仕事のポイント_token"]
    all_df['お仕事のポイント_token'][length:] = all_df["お仕事のポイント(仕事PR)"][length:].apply(
        lambda x: " ".join([token.surface for token in a.analyze(x)]))
    #all_df['お仕事のポイント_token'] = token_df["お仕事のポイント_token"]
    with open("pickles/grid_3.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["お仕事のポイント_pred"] = model.predict(all_df['お仕事のポイント_token'].values)
    all_df = all_df.drop(columns=['お仕事のポイント_token'])

    # (派遣先)配属先部署 (assigned department at the client)
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r"[!$%&\'()*+,-./:;<=>?@\\^_`{|}~◆▼★②●☆■★【】『』「」、♪≪≫]", " ")
    ]
    token_filters = [POSKeepFilter(['名詞']), LowerCaseFilter()]
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    length = all_df["応募数mean"].notnull().sum()
    all_df['(派遣先)配属先部署_token'] = np.nan
    all_df['(派遣先)配属先部署_token'][:length] = token_df["(派遣先)配属先部署_token"]
    all_df['(派遣先)配属先部署_token'][length:] = all_df["(派遣先)配属先部署"][length:].apply(
        lambda x: " ".join([token.surface for token in a.analyze(x)]))
    #all_df['(派遣先)配属先部署_token'] = token_df["(派遣先)配属先部署_token"]
    with open("pickles/grid_4.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["(派遣先)配属先部署_pred"] = model.predict(all_df['(派遣先)配属先部署_token'])
    all_df = all_df.drop(columns=['(派遣先)配属先部署_token'])

    # (派遣先)職場の雰囲気 (workplace atmosphere at the client)
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r"[!$%&\'()*+,-./:;<=>?@\\^_`{|}~◆▼★②●☆■★【】『』「」、♪≪≫]", " ")
    ]
    token_filters = [CompoundNounFilter(), POSKeepFilter(['名詞']), LowerCaseFilter()]
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    length = all_df["応募数mean"].notnull().sum()
    all_df['(派遣先)職場の雰囲気_token'] = np.nan
    all_df['(派遣先)職場の雰囲気_token'][:length] = token_df["(派遣先)職場の雰囲気_token"]
    all_df['(派遣先)職場の雰囲気_token'][length:] = all_df["(派遣先)職場の雰囲気"][length:].apply(
        lambda x: " ".join([token.surface for token in a.analyze(x)]))
    #all_df['(派遣先)職場の雰囲気_token'] = token_df["(派遣先)職場の雰囲気_token"]
    with open("pickles/grid_5.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["(派遣先)職場の雰囲気_pred"] = model.predict(all_df['(派遣先)職場の雰囲気_token'])
    with open("pickles/lda_5.pickle", mode="rb") as ff:
        lda = pickle.load(ff)
    X_lda = lda.transform(all_df["(派遣先)職場の雰囲気_token"])
    all_df["(派遣先)職場の雰囲気_lda"] = X_lda.argmax(axis=1)
    all_df = all_df.drop(columns=['(派遣先)職場の雰囲気_token'])

    all_df = all_df.drop(columns=[
        "お仕事名", "仕事内容", "お仕事のポイント(仕事PR)", "(派遣先)配属先部署", "(派遣先)職場の雰囲気"
    ])

    # Label Encoding
    cat_cols = list(all_df.dtypes[all_df.dtypes == "object"].index)
    for col in cat_cols:
        le = LabelEncoder()
        all_df[col] = le.fit_transform(all_df[col].apply(lambda x: str(x)))

    return all_df
    ExtractAttributeFilter("base_form")
]
analyzer = Analyzer(char_filters, tokenizer, token_filters)

# word extraction and stop words
stopwords = []
url = "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"
with urllib.request.urlopen(url) as response:
    stopwords = [w for w in response.read().decode().split('\r\n') if w != ""]

texts_words = {}
for k, v in texts.items():
    texts_words[k] = [w for w in analyzer.analyze(v)]

# dictionary
dictionary = gensim.corpora.Dictionary(texts_words.values())
dictionary.filter_extremes(no_below=3, no_above=0.4)

# corpus
corpus = [dictionary.doc2bow(words) for words in texts_words.values()]

# LDA model (unsupervised)
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=4, id2word=dictionary, random_state=1)
print('topics: {}'.format(lda.show_topics()))
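# A short usage sketch for the fitted model above: gensim's get_document_topics returns
# the topic mixture for each document's bag-of-words representation.
for key, bow in zip(texts_words.keys(), corpus):
    print(key, lda.get_document_topics(bow))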
news_df = article_df[0:].reset_index(drop=True)

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
import numpy as np

t = Tokenizer()
char_filters = [UnicodeNormalizeCharFilter()]
analyzer = Analyzer(char_filters, t)

word_lists = []
words = []
for i, row in news_df.iterrows():
    for t in analyzer.analyze(row[0]):
        surf = t.surface          # surface form
        words.append([surf])
        base = t.base_form        # base form
        pos = t.part_of_speech    # part of speech
        reading = t.reading       # reading

word_df = pd.DataFrame(word_lists, columns=['単語'])
score_result = word_df
makefile = input('> ')  # the processed data lives in a txt file named by the user

gettwitterdata(keyword2, dfile)
re_1(dfile, makefile)
print("Writing should be finished")
print("Now parsing the data.")

# Put the processed data into a variable and feed that variable to the janome word-frequency step.
# The variable is s.
# makefile2 turns the arbitrarily entered text-data title into the actual text file name.
makefile2 = "" + makefile + ".txt"
f = open(makefile2)
s = f.read()
f.close()

a = Analyzer(token_filters=[POSKeepFilter(['名詞']), TokenCountFilter()])
g_count = a.analyze(s)
# Turn the generator into a list.
l_count = list(a.analyze(s))
#print(type(g_count))
print(type(l_count))
# <class 'generator'>
# Print everything.
#for i in g_count:
#    print(i)
print(l_count)
def sep(texts):
    token_filters = [POSKeepFilter('名詞'), TokenCountFilter()]
    a = Analyzer(token_filters=token_filters)
    return a.analyze(texts)
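# A hypothetical usage sketch for sep(); the sample sentence is only an illustration.
for word, count in sep('すもももももももものうち'):
    print(f'{word}: {count}')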
                RegexReplaceCharFilter('<.*?>', '')]
# mainly keep adjectives ("形容詞" in Japanese)
token_filters = [POSKeepFilter(["形容詞"]), LowerCaseFilter(), ExtractAttributeFilter('surface')]
a = Analyzer(char_filters=char_filters, token_filters=token_filters)

# for example, split the data by the result of the final (last) interview
copyp = df01New[df01New["PF_L"] == 2].reset_index()  # pass
copyn = df01New[df01New["PF_L"] == 1].reset_index()  # resign
copyf = df01New[df01New["PF_L"] == 0].reset_index()  # fail

# pass = result of the final (last) interview
resultp = []
for i in range(copyp.shape[0]):
    for x in a.analyze(copyp['value'][i]):
        resultp.append(x)

# resign = result of the final (last) interview
resultn = []
for i in range(copyn.shape[0]):
    for x in a.analyze(copyn["value"][i]):
        resultn.append(x)

# fail = result of the final (last) interview
resultf = []
for i in range(copyf.shape[0]):
    for x in a.analyze(copyf["value"][i]):
        resultf.append(x)

# make dictionary
from collections import Counter
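# A minimal follow-up sketch for the dictionaries mentioned above: count adjective
# frequencies per interview outcome with the Counter imported on the previous line.
counter_pass = Counter(resultp)
counter_resign = Counter(resultn)
counter_fail = Counter(resultf)
print(counter_pass.most_common(20))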
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.dic import UserDictionary
from janome import sysdic
import pandas as pd

uzai_data = pd.read_csv('../data/uzai_data.csv')
uzai_data = uzai_data.rename(columns={'Unnamed: 0': 'index'})

user_dict = UserDictionary('neologd.csv', 'utf8', 'ipadic', sysdic.connections)
user_dict.save('neologd')

t = Tokenizer(udic='userdic.csv', udic_enc='utf8')
char_filters = [UnicodeNormalizeCharFilter()]
analyzer = Analyzer(char_filters=char_filters, tokenizer=t)

uzai_words_list = []
for i, row in uzai_data.iterrows():
    for t in analyzer.analyze(row[4]):
        surf = t.surface          # surface form
        base = t.base_form        # base form
        pos = t.part_of_speech    # part of speech
        reading = t.reading       # reading
        phonetic = t.phonetic     # phonetic transcription
        uzai_words_list.append([i, surf, base, pos, reading, phonetic])

uzai_words_list = pd.DataFrame(
    uzai_words_list, columns=['index', '単語', '基本形', '品詞', '読み', '振り仮名'])
uzai_morpheme_data = pd.merge(uzai_data, uzai_words_list, how='left', on='index')
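# An alternative sketch, based on the compiled dictionary saved above: janome can also
# load a compiled user dictionary by passing its directory to Tokenizer. Treat the exact
# argument form as an assumption and check it against your janome version.
t_compiled = Tokenizer('neologd')
analyzer_compiled = Analyzer(char_filters=char_filters, tokenizer=t_compiled)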