Example #1
    print("おそらく書き込みは完了")
    """
    print("データを解析中だと考える。")

    # Put the processed data into a variable and pass that variable to janome for word-frequency counting.
    # The variable is s.
    # makefile2 is the user-entered text data title turned into a text-file name string.
    makefile = "パリ"
    makefile2 = ""+ makefile + ".txt"
    f = open(makefile2)
    s = f.read()
    f.close()

    a = Analyzer(token_filters=[POSKeepFilter(['名詞']), TokenCountFilter()])

    g_count = a.analyze(s)
    # Convert the generator to a list.
    l_count = list(a.analyze(s))
    #print(type(g_count))
    #print(type(l_count))
    # <class 'generator'>
    # Print every item:
    # for i in g_count:
    #     print(i)

    """
    
    print("sort前")
    print(l_count)
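    # Minimal sketch (assuming l_count holds the (word, count) pairs produced by
    # TokenCountFilter): sort the pairs by frequency, highest first.
    l_count_sorted = sorted(l_count, key=lambda x: x[1], reverse=True)
    print("after sort")
    print(l_count_sorted)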
    
    
Example #2
if __name__ == '__main__':
    f_name = 'full_conv.txt'
    path = 'data/'+f_name
    texts = [l.strip() for l in open(path, 'r', encoding='utf8') if l!='\n']

    flag = sys.argv[1]
    print(flag)
    if flag == 'janome':
        # tokenizer = Tokenizer(mmap=True)
        tokenizer = Tokenizer()
        char_filters = [UnicodeNormalizeCharFilter()]
        token_filters = [LowerCaseFilter(), ExtractAttributeFilter(att='surface')]
        analyzer = Analyzer(char_filters, tokenizer, token_filters)
        comp = []
        for l in texts:
            sn = [token for token in analyzer.analyze(l)]
            sn = ['<start>'] + sn + ['<end>']
            comp.append(sn)
        del analyzer
        del token_filters
        del char_filters
        del tokenizer
    else:
        mpath = 'models/sentensepice'
        template = '--input=%s --model_prefix=%s --vocab_size=8000'
        spm.SentencePieceTrainer.train(template%(path, mpath))
        sp = spm.SentencePieceProcessor()
        sp.load(mpath+'.model')

        comp = []
        for l in texts:
Example #3
text = re.sub(r'([.*?])|(《.*?》)', '', text)
# 神の存在、及び人間の霊魂と肉体との区別を論証する、第一哲学についての省察 ...

# pip install Janome
from janome.analyzer import Analyzer
from janome.charfilter import UnicodeNormalizeCharFilter
from janome.tokenizer import Tokenizer
from janome.tokenfilter import POSKeepFilter, ExtractAttributeFilter

analyzer = Analyzer(
    [UnicodeNormalizeCharFilter()],
    Tokenizer(),
    [POSKeepFilter(['名詞', '動詞', '形容詞', '副詞']), ExtractAttributeFilter('base_form')]
)

tokens = [token for token in analyzer.analyze(text)]
# ['神', '存在', '人間', '霊魂', '肉体', ... ]

with open('./input.txt', 'w') as f:
    f.write(' '.join(tokens))

# 'vectors' is assumed to be the GloVe embedding matrix (a NumPy array) loaded elsewhere.
with open('./vectors.txt', 'r') as original, open('./gensim_vectors.txt', 'w') as transformed:
    vocab_count = vectors.shape[0]  # number of words
    size = vectors.shape[1]  # number of dimensions

    transformed.write(f'{vocab_count} {size}\n')
    transformed.write(original.read())  # the remaining lines are copied through unchanged

from gensim.models import KeyedVectors

glove_vectors = KeyedVectors.load_word2vec_format('./gensim_vectors.txt', binary=False)
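# Minimal follow-up sketch: query the converted vectors through the usual
# gensim KeyedVectors API ('神' is just an example token assumed to be in the vocabulary).
print(glove_vectors.most_similar('神', topn=5))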
Example #4
###############################ETL#################################
text = tweet.copy()
char_filters = [UnicodeNormalizeCharFilter()]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(),
                 LowerCaseFilter()]  #POSKeepFilter(['名詞','形容詞']),
a = Analyzer(char_filters, tokenizer, token_filters)
tdesc = []
for i in range(len(text)):
    newsen = ''
    #mySent = re.sub('[?•()()_→【】|...”「、>:」!,."...%*-]', ' ', text[i])
    mySent = text[i]
    #mySent = mySent.replace('?',' ')
    try:
        sen = mySent.strip()
        tokens = a.analyze(sen)  #,wakati=True)
        for j in tokens:
            if ('\\' not in j.surface) and ('/' not in j.surface) and (
                    '@' not in j.surface):
                newsen = newsen + cutwords(j.surface) + ' '
        newsen = re.sub('[@=#¥~^<。$;+⇒•()()_→【】{}|...”「、>:」!,."...%*-]', '',
                        newsen)
        newsen = newsen.replace('?', ' ').replace('[', ' ').replace(']', ' ')
        newsen = re.sub(r'[0-9]+', ' ', newsen)
        tdesc.append(newsen)
        print(newsen)
    except Exception:
        print('abnormal!')

text = tweet1.copy()
char_filters = [UnicodeNormalizeCharFilter()]
Example #5
File: kaiseki.py  Project: J14005/sotuken
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import *

f = open("AbeShinzo.csv", "r",
         encoding='utf-8')  #wadamasamune.csv  AbeShinzo.text
tweet = f.read()
token_filters = [POSKeepFilter('名詞'), TokenCountFilter()]
a = Analyzer(token_filters=token_filters)
for k, v in a.analyze(tweet):
    if (v >= 1):
        print("%s: %d" % (k, v))
Example #6
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

import logging
logging.basicConfig(level='INFO')

print(u'Analyzer example:')
text = u'蛇の目はPure Pythonな形態素解析器です。'
char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(u'蛇の目', u'janome')]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), LowerCaseFilter()]
a = Analyzer(char_filters, tokenizer, token_filters)
for token in a.analyze(text):
    print(token)

print('')
print(u'Analyzer example: Count nouns with POSKeepFilter and TokenCountFilter')
text = u'すもももももももものうち'
token_filters = [POSKeepFilter(u'名詞'), TokenCountFilter()]
a = Analyzer(token_filters=token_filters)
for k, v in a.analyze(text):
    print('%s: %d' % (k, v))
from janome.tokenfilter import *
from janome.charfilter import *
import pickle

with open("the_night_of_the_milky_way_train.pickle", mode="rb") as f:
    milky = pickle.load(f)

t = Tokenizer()
for token in t.tokenize(milky):
    print(token)

part_of_speech = '名詞'

char_filters = [
    UnicodeNormalizeCharFilter(),
    RegexReplaceCharFilter(r"[IiⅠi?.*/~=()〝 <>::《°!!!?()-]+", "")
]
token_filters = [
    POSKeepFilter([part_of_speech]),
    POSStopFilter([]),
    LowerCaseFilter()
]
analyzer = Analyzer(char_filters, t, token_filters)

noun_list = [token.surface for token in analyzer.analyze(milky)]

print(noun_list)
print(len(noun_list))
# ['みなさん', 'ふう', '川', '乳', 'あと', 'ぼんやり', 'もの', 'ほんとう', '何', '承知', '先生', '黒板', '星座', '図', '上', '下', '銀河', '帯', 'よう', 'ところ', 'みんな', '問', 'カムパネルラ', '手', 'それ', '四', '五', '人', '手', 'ジョバンニ', '手', 'あれ', 'みんな', '星', 'いつか', '雑誌', 'の', 'このごろ', 'ジョバンニ', '毎日', '教室', '本', 'ひま', '本' ...]
# 5895
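# Minimal sketch: rank the extracted nouns by frequency with the standard library.
from collections import Counter
print(Counter(noun_list).most_common(20))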
Example #8
    print(sentences[i])
# 転職 Advent Calendar 2016 - Qiitaの14日目となります。 少しポエムも含みます。
# 今年11月にSIerからWebサービスの会社へ転職しました。

# Build the morphological analyzer
analyzer = Analyzer(
    [UnicodeNormalizeCharFilter(),
     RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],  # replace (, ), 「, 」, 、 and 。 with spaces
    JanomeTokenizer(),
    [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
     ExtractAttributeFilter('base_form')]  # keep only the base forms of nouns, adjectives, adverbs and verbs
)

# Join the extracted words with spaces.
# The trailing '。' lets tinysegmenter, used below, split the text back into sentences.
corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]
for i in range(2):
    print(corpus[i])
# 転職 Advent Calendar 2016 - Qiita 14 日 目 なる 少し ポエム 含む。
# 今年 11 月 SIer Web サービス 会社 転職 する。


# Re-tokenize the joined corpus with tinysegmenter
parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))

# Extract a two-sentence summary with LexRank
summarizer = LexRankSummarizer()
summarizer.stop_words = [' ']  # a space is treated as a word, so register it as a stop word to exclude it

summary = summarizer(document=parser.document, sentences_count=2)
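# Minimal sketch: the summarizer returns sumy sentence objects, which can be printed directly.
for sentence in summary:
    print(sentence)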
Example #9
# Token-level processing pipeline
token_filters = [
    NumericReplaceFilter(),  # replace numbers (including kanji numerals) inside nouns with 0
    CompoundNounFilter(),  # merge consecutive nouns into compound nouns
    POSKeepFilter(['名詞', '動詞', '形容詞', '副詞']),  # keep only nouns, verbs, adjectives and adverbs
    LowerCaseFilter(),  # lower-case alphabetic characters
    OneCharacterReplaceFilter()  # drop single-character hiragana, katakana and alphanumerics
]

analyzer = Analyzer(char_filters, tokenizer, token_filters)

tokens_list = []
raw_texts = []
for text in texts:
    # Split the sentence and normalize each word
    text_ = [token.base_form for token in analyzer.analyze(text)]
    if len(text_) > 0:
        tokens_list.append(text_)
        raw_texts.append(text)

# Original texts, after dropping sentences that become empty when normalized
raw_texts = [text_ + '\n' for text_ in raw_texts]
with open(data_dir_path.joinpath(file_name.replace('.txt', '_cut.txt')),
          'w',
          encoding='utf-8') as file:
    file.writelines(raw_texts)

# Build the word list
words = []
for text in tokens_list:
Example #10
# と	助詞,並立助詞,*,*,*,*,と,ト,ト
# パイソン	名詞,一般,*,*,*,*,パイソン,*,*
# </	名詞,サ変接続,*,*,*,*,</,*,*
# div	名詞,一般,*,*,*,*,div,*,*
# >	名詞,サ変接続,*,*,*,*,>,*,*

char_filters = [UnicodeNormalizeCharFilter(),
                RegexReplaceCharFilter('<.*?>', '')]

token_filters = [POSKeepFilter(['名詞']),
                 LowerCaseFilter(),
                 ExtractAttributeFilter('surface')]

a = Analyzer(char_filters=char_filters, token_filters=token_filters)

for token in a.analyze(s):
    print(token)
# python
# python
# パイソン
# パイソン

s = '自然言語処理による日本国憲法の形態素解析'

for token in t.tokenize(s):
    print(token)
# 自然	名詞,形容動詞語幹,*,*,*,*,自然,シゼン,シゼン
# 言語	名詞,一般,*,*,*,*,言語,ゲンゴ,ゲンゴ
# 処理	名詞,サ変接続,*,*,*,*,処理,ショリ,ショリ
# による	助詞,格助詞,連語,*,*,*,による,ニヨル,ニヨル
# 日本国	名詞,固有名詞,地域,国,*,*,日本国,ニッポンコク,ニッポンコク
Example #11
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

# Initialize janome
# Initialize the filters
tokenizer = Tokenizer()
char_filters = [UnicodeNormalizeCharFilter()]
token_filters = [CompoundNounFilter(), POSStopFilter("助詞"), LowerCaseFilter()]
a = Analyzer(char_filters, tokenizer, token_filters)

# Process the Twitter data

for token in a.analyze("今日はとても暑いけれども虹がとても綺麗である."):
    print(token)
    if token.surface == "虹":
        print("虹をはっけん")
Example #12
class TestFilters(unittest.TestCase):
    def setUp(self):
        #aliases = get_word_aliases()
        char_filters = [
            UnicodeNormalizeCharFilter(),
            RegexReplaceCharFilter('&[^&]+;', '')
        ]
        tokenizer = Tokenizer(mmap=True)
        token_filters = [
            FootballCompoundNounFilter(),
            FootballNounFilter(),
            POSKeepFilter('名詞')
        ]
        self.analyzer = Analyzer(char_filters, tokenizer, token_filters)

    def test_CompoundNounFilter1(self):
        # test for "本田圭佑"
        content = "日本代表・本田圭佑はW杯で最もスタイリッシュ"

        tokens = self.analyze(content)
        self.assertTrue('本田圭佑' in tokens)

    def test_CompoundNounFilter2(self):
        # test for "さん"
        content = "【朗報】 本田圭佑さん、2試合で1ゴール1アシスト"

        tokens = self.analyze(content)
        self.assertTrue('本田圭佑' in tokens)

    def test_CompoundNounFilter3(self):
        # test for "ら"
        content = '親交の深いMF香川真司らの得点で勝利を挙げた日本の戦いぶりには、試合はFIFAのジャンニ・インファンティノ会長らも座るVIP席で観戦。'

        tokens = self.analyze(content)
        self.assertTrue('香川真司' in tokens)
        self.assertFalse('香川真司ら' in tokens)
        self.assertTrue('ジャンニ・インファンティノ' in tokens)
        self.assertFalse('ジャンニ・インファンティノ会長ら' in tokens)

    def test_CompoundNounFilter4(self):
        # test for "ら"
        content = 'コーフェルト監督自らが直接口説き落とした大迫'

        tokens = self.analyze(content)
        self.assertTrue('コーフェルト監督' in tokens)

    def test_CompoundNounFilter5(self):
        # test for "ら"
        content = 'ジェノアの監督ダビデ・バッラルディーニは憮然とした表情で地元紙のインタビューに応えた&ldquo;超上から目線&rdquo;'

        tokens = self.analyze(content)
        self.assertTrue('ダビデ・バッラルディーニ' in tokens)

    def test_CompoundNounFilter6(self):
        # test for "ら"
        content = '南仏エクス・オン・プロバンス生まれのミシェルと、生粋のブルターニュ人'

        tokens = self.analyze(content)
        self.assertTrue('エクス・オン・プロバンス' in tokens)

    def analyze(self, content, debug=False):
        tokens = list(self.analyzer.analyze(content))
        retval = {}
        for token in tokens:
            if debug:
                print(token)
            retval[token.base_form] = token
        return retval
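
# A minimal sketch for running this suite with the standard unittest runner.
if __name__ == '__main__':
    unittest.main()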
Example #13
    Articles.feed_id == Feeds.id, ArticleContents.token_extracted == False,
    ArticleContents.extracted_content != None,
    Feeds.language == 'ja').order_by(
        ArticleContents.id)  #.limit(100).offset(400)

dbtokens = {}
count = 1
total = article_contents.count()
print('There are %d contents to process' % total)
try:
    for article_content in article_contents:
        print('  %7d / %d, %s' % (count, total, article_content.article_hash))
        content = article_content.extracted_content

        words = []
        tokens = list(analyzer.analyze(content))
        for token in tokens:
            dbtoken = None
            if token.base_form in dbtokens:
                dbtoken = dbtokens[token.base_form]
            elif not args.renew:
                dbtoken = session.query(Tokens).filter(
                    Tokens.base_form == token.base_form).one_or_none()

            if dbtoken:
                dbtoken.occurrence_count = dbtoken.occurrence_count + 1
            else:
                pos = token.part_of_speech.split(",")
                dbtoken = Tokens()
                dbtoken.base_form = token.base_form
                dbtoken.part_of_speech1 = None if pos[0] == '*' else pos[0]
Example #14
def separate(path, review_type, noun=True, verb=True, adj=True, adv=True):
    '''
    Split the text at path into words and write the result into ./review_{review_type}_separated.
    '''

    data_dir_path = Path('.')
    corpus_dir_path = Path('.')

    title = path[-14:-4]

    with open(data_dir_path.joinpath(path), 'r', encoding='utf-8') as file:
        texts = file.readlines()
    texts = [text_.replace('\n', '') for text_ in texts]

    # janome's Analyzer lets sentence splitting and word normalization be done in one pass.
    # Character-level (per-sentence) processing pipeline
    char_filters = [
        UnicodeNormalizeCharFilter(),  # normalize Unicode to NFKC (the default)
        RegexReplaceCharFilter(r'\(', ''),
        RegexReplaceCharFilter(r'\)', ''),
        RegexReplaceCharFilter(r'!', ''),
        RegexReplaceCharFilter(r'\?', ''),
        RegexReplaceCharFilter(r'\.', ''),
        RegexReplaceCharFilter(r'\^', ''),
        RegexReplaceCharFilter(r'-', ''),
    ]

    # Split into words
    tokenizer = Tokenizer()

    #
    # TokenFilter that replaces every number inside a noun (including kanji numerals) with 0
    #
    class NumericReplaceFilter(TokenFilter):
        def apply(self, tokens):
            for token in tokens:
                parts = token.part_of_speech.split(',')
                if parts[0] == '名詞' and parts[1] == '数':
                    token.surface = '0'
                    token.base_form = '0'
                    token.reading = 'ゼロ'
                    token.phonetic = 'ゼロ'
                yield token

    #
    # Drop words that are only a single hiragana, katakana or alphanumeric character
    #
    class OneCharacterReplaceFilter(TokenFilter):
        def apply(self, tokens):
            for token in tokens:
                # Skip the token if it hits the one-character rule above
                if re.match('^[あ-んア-ンa-zA-Z0-9ー]$', token.surface):
                    continue

                yield token

    # Keep only the parts of speech selected by the arguments
    filter_list = []
    if noun:
        filter_list.append('名詞')
    if verb:
        filter_list.append('動詞')
    if adj:
        filter_list.append('形容詞')
    if adv:
        filter_list.append('副詞')

    # Token-level processing pipeline
    token_filters = [
        NumericReplaceFilter(),  # replace numbers (including kanji numerals) inside nouns with 0
        CompoundNounFilter(),  # merge consecutive nouns into compound nouns
        POSKeepFilter(filter_list),  # keep only the selected parts of speech
        POSStopFilter('記号'),  # drop symbols
        LowerCaseFilter(),  # lower-case alphabetic characters
        OneCharacterReplaceFilter()  # drop single-character hiragana, katakana and alphanumerics
    ]

    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)

    tokens_list = []
    raw_texts = []
    for text in texts:
        # Split the sentence and normalize each word
        text_ = [token.base_form for token in analyzer.analyze(text)]
        if len(text_) > 0:
            tokens_list.append(text_)
            raw_texts.append(text)

    # Build the word list
    words = []
    for text in tokens_list:
        words.extend([word + '\n' for word in text if word != ''])

    separated_path = f"./review_{review_type}_separated/{title}_separated.txt"
    with open(corpus_dir_path.joinpath(separated_path), 'w',
              encoding='utf-8') as file:
        file.writelines(words)
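
# Hypothetical usage sketch (made-up path and review_type; the file name keeps
# the 10-character title expected by the path[-14:-4] slice):
separate('./reviews_food/2021-01-01.txt', 'food', adv=False)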
Example #15
from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter, TokenCountFilter

text = 'すもももももももものうち'
token_filters = [POSKeepFilter('名詞'), TokenCountFilter()]
a = Analyzer(token_filters=token_filters)

for word, count in a.analyze(text):
    print(f'{word}: {count}')
Example #16
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import *
text = u'すもももももももものうち'
tokenizer = Tokenizer(mmap=True)
token_filters = [POSKeepFilter('名詞'), TokenCountFilter(att='base_form')]
a = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
for k, v in a.analyze(text):
    print('%s: %d' % (k, v))
Example #17
import sys
from janome.analyzer import Analyzer
from janome.tokenfilter import *

msg = sys.argv[1]

filters = [POSKeepFilter(['名詞', '形容詞'])]

analyzer = Analyzer(token_filters=filters)

for t in analyzer.analyze(msg):
    print(
        f'phonetic = {t.phonetic}, reading = {t.reading}, surface = {t.surface}, part_of_speech = {t.part_of_speech}, base_form = {t.base_form}, infl_form = {t.infl_form}, infl_type = {t.infl_type}, node_type = {t.node_type}'
    )
    CompoundNounFilter(),
    POSKeepFilter(['名詞']),
    PartsOfSpeechFilter(['一般', '複合', '固有名詞']),
    LowerCaseFilter(),
    ExtractAttributeFilter('surface'),
    OneCharTokenFilter(),
]

tokenizer = Tokenizer("userdic.csv", udic_enc="utf8")

analyzer = Analyzer(char_filters, tokenizer, token_filters)

with open('input.csv', 'r') as f:
    """
    CSVファイルを読み込み、形態素に分解し名詞のみを抽出する
    """
    reader = csv.reader(f)

    for row in reader:
        text = row[0]
        nouns = [surface for surface in analyzer.analyze(text)]
        all_nouns = np.hstack((all_nouns, nouns))
"""
名詞ごとの個数を計算してCSVに出力する
"""
reshaped = np.vstack((all_nouns, np.ones(all_nouns.shape[0]))).transpose()
df = pd.DataFrame({'name': all_nouns, 'count': np.ones(all_nouns.shape[0])})
grouped = df.groupby('name')
grouped.sum().sort_values(['count'],
                          ascending=False).to_csv("janome_result.csv")
Example #19
File: corpus.py  Project: bo-ri/wiki-scr
  tokenizer = Tokenizer()

  token_filters = [NumericReplaceFilter(),
                  CompoundNounFilter(),
                  POSKeepFilter(['名詞', '動詞', '形容詞', '副詞']),
                  LowerCaseFilter(),
                  OneCharacterReplaceFilter()
                ]

  analyzer = Analyzer(char_filters, tokenizer, token_filters)

  tokens_list = []
  raw_texts = []

  for text in texts:
    text_ = [token.base_form for token in analyzer.analyze(text)]
    if len(text_) > 0:
      tokens_list.append(text_)
      raw_texts.append(text)

  raw_texts = [text_ + '\n' for text_ in raw_texts]
  with open(data_dir_path.joinpath(file_name.replace('.txt', '_cut.txt')), 'w',encoding='utf-8') as file:
    file.writelines(raw_texts)

  words = []
  for text in tokens_list:
    words.extend([word + '\n' for word in text if word != ''])
  with open(corpus_dir_path.joinpath(file_name.replace('.txt', '_word_list.txt')), 'w', encoding='utf-8') as file:
    file.writelines(words)
Example #20
#     _ret = None
#     for token in tokens:
#       if _ret:
#         if token.part_of_speech.startswith(u'名詞') and _ret.part_of_speech.startswith(u'名詞'):
#           _ret.surface += token.surface
#           _ret.part_of_speech = u'名詞,複合,*,*'
#           _ret.base_form += token.base_form
#           _ret.reading += token.reading
#           _ret.phonetic += token.phonetic
#         else:
#           ret = _ret
#           _ret = token
#           yield ret
#       else:
#         _ret = token
#     if _ret:
#       yield _ret
# CompoundNounFilter(TokenFilter).apply()
# token_filters = [CompoundNounFilter()]
# text ="多分タピオカ8万粒は食べてきた笑タピオカってなんなん#台湾#台湾旅行#台湾グルメ#九份#十分夜市#台北101#龍山寺#中正紀念堂#謝謝台湾 場所: 台北,台湾"
# a = Analyzer(token_filters=token_filters)
tokens = b.analyze(text)

word_list = []

for token in tokens:
    word = token.surface
    word_list.append(word)

words_wakati = " ".join(word_list)
print(word_list)
def main():
    file = open('similarity_grouping_result.csv', 'w')

    writer = csv.writer(file, lineterminator='\n')
    writer.writerow(['frequent_word', 'total_count', 'synonym'])

    df = pd.read_csv('janome_result.csv')
    keys = df['name'].ravel()

    char_filters = [UnicodeNormalizeCharFilter()]
    token_filters = [
        POSKeepFilter(['名詞']),
        PartsOfSpeechFilter(['固有名詞']),
        ExtractAttributeFilter('surface'),
        OneCharTokenFilter()
    ]
    analyzer = Analyzer(char_filters, Tokenizer("userdic.csv",
                                                udic_enc="utf8"),
                        token_filters)

    print("Start. target keys: {}".format(len(keys)))

    # Register every word in the synonym database
    for index, row in df.sort_values('count', ascending=False).iterrows():
        db.add(normalize('NFKC', row['name']))

    for index, row in df.iterrows():
        name = row['name']
        if name not in keys:
            continue

        # Look for spelling variants and synonyms
        passed_list = []
        set_similarity_strings(passed_list, name)

        # To reduce misses, also pull in keys that contain a passed word as a substring
        for word in copy.copy(passed_list):
            if len(word) <= 3:
                continue

            for key in keys:
                if key.find(word) >= 0 and key not in passed_list:
                    passed_list.extend([key])

        total_count = df[df['name'].isin(passed_list)]['count'].sum()
        combined_passed_name = ':'.join(passed_list)

        # Extract the most frequent word
        keywords = [
            surface for surface in analyzer.analyze(combined_passed_name)
        ]
        frequent_word = {'key': 'No Key', 'count': 0}
        for key in keywords:
            if len(key) < 3:
                continue

            count = combined_passed_name.count(key)
            if count > frequent_word['count']:
                frequent_word['key'] = key
                frequent_word['count'] = count


#         print("Count: {}, Names: {}".format(total_count, combined_passed_name))
        writer.writerow(
            [frequent_word['key'], total_count, combined_passed_name])
        keys = np.delete(keys, np.where(np.isin(keys, passed_list) == True))
    #     print("Grouping keys... size: {}, keys: {}. Unpassed keys... size: {}".format(len(passed_list), passed_list, len(keys)))

    file.close()
    print("End")
import itertools
from igraph import *

min_freq = 4

filename = './sample.txt'
data = [l.replace('。', '。\n') for l in open(filename, 'r', encoding='utf-8')]
sentences = [re.sub(' ', '', u) for u in data if len(u) != 0]
print(len(sentences))

tokenizer = Tokenizer()
char_filters = [UnicodeNormalizeCharFilter()]
token_filters = [CompoundNounFilter()]  # , POSStopFilter(['記号']), LowerCaseFilter()
analyzer = Analyzer(char_filters, tokenizer, token_filters)
dt = [analyzer.analyze(s) for s in sentences]
print(len(dt))
nouns = [[item.surface for item in t if '名詞' in item.part_of_speech]
         for t in dt]
pairlist = [
    list(itertools.combinations(ns, 2)) for ns in nouns if len(ns) >= 2
]

all_pairs = []
for u in pairlist:
    all_pairs.extend(u)

pcount = Counter(all_pairs)
print('pair frequency',
      sorted(pcount.items(), key=lambda x: x[1], reverse=True)[:30])
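
# The min_freq threshold defined above is unused in this excerpt; a possible
# continuation (sketch) keeps only pairs seen at least min_freq times and
# builds a co-occurrence graph with python-igraph.
frequent_pairs = [pair for pair, freq in pcount.items() if freq >= min_freq]
g = Graph.TupleList(frequent_pairs, directed=False)
print(g.summary())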
Example #23
    nlp = NLP()
#    print(nlp.analyze('日本経済新聞社によると、関ジャニ∞の渋谷君が千代田区のスペイン村で豪遊した帰りに、六本木ヒルズの無国籍レストランで地中海料理かフランス料理か日本料理か迷ったらしいんだけど。'))
    while True:
        text = input('> ')
        if not text:
            break
        print()
        char_filters = [UnicodeNormalizeCharFilter()]
        tokenizer = Tokenizer()
        tokens = tokenizer.tokenize(text)
        print('形態素解析')
        for token in tokens:
            print(token)
        token_filters = [CompoundNounFilter(), POSStopFilter(['記号']), LowerCaseFilter()]
        a = Analyzer(char_filters, tokenizer, token_filters)
        print('\n複合語処理後')
        for token in a.analyze(text):
            print(token)
        tokens = [token for token in a.analyze(text)]
        scores = []
        for token in tokens:
            label, score = nlp.predict_emotion(token)
            if len(label) != 6: continue
            scores.append((label, score))
        scores = [np.array(item[1]) for item in scores]
        score = np.zeros(len(scores[0]))
        for s in scores: score += s
        score /= len(scores)
        print(score)
        chart(label, score)
Example #24
def preprocess(all_df):
    # Drop columns in which every value is missing
    allnot_col = list(
        all_df.isnull().sum()[all_df.isnull().sum() == 19244].index)
    all_df = all_df.drop(columns=allnot_col)

    # Drop columns whose values are all identical and that have no missing values
    one_col = list(all_df.nunique()[all_df.nunique() == 1].index)
    onefull_col = list(all_df[one_col].isnull().sum()[
        all_df[one_col].isnull().sum() == 0].index)
    all_df = all_df.drop(columns=onefull_col)

    # Duplicated rows
    train = all_df[all_df["応募数 合計"].notnull()]
    test = all_df[all_df["応募数 合計"].isnull()]
    train['応募数mean'] = train.groupby(["お仕事No."])["応募数 合計"].transform("mean")
    test["応募数mean"] = np.nan
    all_df = pd.concat([train, test], ignore_index=True, sort=False)
    # Keep the total duplicate count as a column as well
    all_df["all_count"] = all_df.groupby(["お仕事No."
                                          ])["お仕事No."].transform("count")
    train = all_df[all_df["応募数mean"].notnull()]
    test = all_df[all_df["応募数mean"].isnull()]
    train = train.drop(columns=["応募数 合計"])
    test = test.drop(columns=["応募数 合計"])
    train = train.drop_duplicates(subset=["お仕事No."])
    all_df = pd.concat([train, test], ignore_index=True, sort=False)

    # Add a column with the number of missing values in each row
    all_df["NaN_num"] = all_df.isnull().sum(axis=1)

    # Add indicator columns marking whether each value is missing
    no_df = pd.DataFrame(
        {
            "num":
            all_df.isnull().sum()[all_df.isnull().sum() > 0].values,
            "type":
            all_df[all_df.isnull().sum()[
                all_df.isnull().sum() > 0].index].dtypes
        },
        index=all_df.isnull().sum()[all_df.isnull().sum() > 0].index)
    for i in no_df.index:
        all_df["NaN_" + i] = np.where(all_df[i].isnull(), 1, 0)
    all_df = all_df.drop(columns=["(派遣先)概要 事業内容", "NaN_応募数mean"])

    # Drop the female side of the gender-ratio columns
    all_df = all_df.drop(columns=["(派遣先)配属先部署 男女比 女", "NaN_(派遣先)配属先部署 男女比 女"])

    # Impute missing values
    no_df2 = pd.DataFrame(
        {
            "num":
            all_df.isnull().sum()[all_df.isnull().sum() > 0].values,
            "type":
            all_df[all_df.isnull().sum()[
                all_df.isnull().sum() > 0].index].dtypes
        },
        index=all_df.isnull().sum()[all_df.isnull().sum() > 0].index)
    # Numeric columns with missing values
    no_float_col = list(no_df2[no_df2["type"] != "object"].index)
    no_float_col.remove("応募数mean")
    # Categorical columns with missing values
    no_obj_col = list(no_df2[no_df2["type"] == "object"].index)
    # Columns regarded as categorical are filled with 'NA'
    cols = ["(紹介予定)入社後の雇用形態", "勤務地 最寄駅2(駅からの交通手段)", "勤務地 最寄駅1(駅からの交通手段)"]
    for col in cols:
        all_df[col] = all_df[col].fillna("NA")
    # Columns to be treated as numeric are filled with -9999
    cols2 = [
        "(派遣先)配属先部署 男女比 男", "(派遣先)配属先部署 人数", "勤務地 最寄駅1(分)", "(派遣先)配属先部署 平均年齢",
        "給与/交通費 給与上限", "勤務地 最寄駅2(分)"
    ]
    for col in cols2:
        all_df[col] = all_df[col].fillna(-9999)
    # Fill the remaining missing values with 'NA'
    for col in no_obj_col[:-1]:
        all_df[col] = all_df[col].fillna("NA")

    # Convert numeric variables to categorical ones
    for col in ["フラグオプション選択", "職種コード", "会社概要 業界コード", "仕事の仕方", "勤務地 市区町村コード"]:
        all_df[col] = all_df[col].astype(str)

    # Posting start date and work start date
    all_df['掲載期間 開始日'] = pd.to_datetime(all_df['掲載期間 開始日'], format="%Y/%m/%d")
    all_df['掲載期間 終了日'] = pd.to_datetime(all_df['掲載期間 終了日'], format="%Y/%m/%d")
    all_df['期間・時間 勤務開始日'] = pd.to_datetime(all_df['期間・時間 勤務開始日'],
                                           format="%Y/%m/%d")
    all_df["勤務開始-掲載開始"] = (all_df['期間・時間 勤務開始日'] - all_df['掲載期間 開始日'])
    all_df["勤務開始-掲載開始"] = all_df["勤務開始-掲載開始"].dt.days
    all_df = all_df.drop(columns=['掲載期間 開始日', "掲載期間 終了日", "期間・時間 勤務開始日"])
    # Working hours
    all_df["workingstart"] = all_df["期間・時間 勤務時間"].str.split("〜",
                                                            expand=True)[0]
    all_df["workingend"] = all_df["期間・時間 勤務時間"].str.split(
        "〜", expand=True)[1].str.split(" ", expand=True)[0]
    all_df["workingstart"] = pd.to_datetime(all_df['workingstart'],
                                            format='%H:%M')
    all_df["workingend"] = pd.to_datetime(all_df['workingend'], format='%H:%M')
    all_df["workingtime_m"] = (all_df["workingend"] -
                               all_df["workingstart"]).astype('timedelta64[m]')

    all_df["workingrest"] = all_df["期間・時間 勤務時間"].str.split(
        "休憩",
        expand=True)[1].str.split("分",
                                  expand=True)[0].str.split("<BR>",
                                                            expand=True)[0]
    all_df["workingrest"] = all_df["workingrest"].apply(
        lambda x: re.sub(r'\D', '', str(x)))
    all_df["workingrest"][all_df["workingrest"] == ""] = "0"
    all_df["workingrest"][all_df["workingrest"] == "1"] = "60"
    all_df["workingrest"][all_df["workingrest"] == "6090"] = "75"
    all_df["workingrest"][all_df["workingrest"] == "13301600"] = "0"
    all_df["workingrest"][all_df["workingrest"] == "10301330"] = "0"
    all_df["workingrest"] = all_df["workingrest"].apply(int)
    all_df["productiontime_m"] = (all_df["workingtime_m"] -
                                  all_df["workingrest"]
                                  )  #.astype('timedelta64[m]')
    for i in list(all_df.dtypes[all_df.dtypes == "datetime64[ns]"].index):
        all_df[i] = all_df[i].astype(str)

    # 期間・時間 備考 is dropped because NaN_期間・時間 備考 was added above
    all_df = all_df.drop(columns=["期間・時間 備考"])

    # Text processing
    token_df = pd.read_pickle("pickles/train_token.pickle")
    # お仕事名 (job title)
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(
            r"[!$%&\'()*+,-./:;<=>?@\\^_`{|}~◆▼★②●☆■★【】『』「」、♪≪≫]", " ")
    ]
    token_filters = [
        CompoundNounFilter(),
        POSKeepFilter(['名詞']),
        LowerCaseFilter()
    ]
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    length = all_df["応募数mean"].notnull().sum()
    all_df['お仕事名_token'] = np.nan
    all_df['お仕事名_token'][:length] = token_df["お仕事名_token"]
    all_df['お仕事名_token'][length:] = all_df["お仕事名"][length:].apply(
        lambda x: " ".join([token.surface for token in a.analyze(x)]))
    #all_df['お仕事名_token'] = token_df["お仕事名_token"]

    with open("pickles/grid_1.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["お仕事名_pred"] = model.predict(all_df['お仕事名_token'].values)
    all_df = all_df.drop(columns=['お仕事名_token'])

    # 仕事内容 (job description)
    select_conditions = ['名詞']
    tagger = MeCab.Tagger('')
    tagger.parse('')

    def wakati_text(text):
        node = tagger.parseToNode(text)
        terms = []
        while node:
            term = node.surface
            pos = node.feature.split(',')[0]
            if pos in select_conditions:
                terms.append(term)
            node = node.next
        text_result = ' '.join(terms)
        return text_result

    length = all_df["応募数mean"].notnull().sum()
    all_df['仕事内容_token'] = np.nan
    all_df['仕事内容_token'][:length] = token_df["仕事内容_token"]
    all_df['仕事内容_token'][length:] = all_df["仕事内容"][length:].apply(wakati_text)
    #all_df['仕事内容_token'] = token_df["仕事内容_token"]

    with open("pickles/grid_2.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["仕事内容_pred"] = model.predict(all_df['仕事内容_token'])
    all_df = all_df.drop(columns=['仕事内容_token'])

    # お仕事のポイント(仕事PR) (job highlights)
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(
            r"[!$%&\'()*+,-./:;<=>?@\\^_`{|}~◆▼★②●☆■★【】『』「」、♪≪≫]", " ")
    ]
    token_filters = [
        CompoundNounFilter(),
        POSKeepFilter(['名詞']),
        LowerCaseFilter()
    ]
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    length = all_df["応募数mean"].notnull().sum()
    print(length)
    all_df['お仕事のポイント_token'] = np.nan
    all_df['お仕事のポイント_token'][:length] = token_df["お仕事のポイント_token"]
    all_df['お仕事のポイント_token'][length:] = all_df["お仕事のポイント(仕事PR)"][
        length:].apply(
            lambda x: " ".join([token.surface for token in a.analyze(x)]))
    #all_df['お仕事のポイント_token'] = token_df["お仕事のポイント_token"]

    with open("pickles/grid_3.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["お仕事のポイント_pred"] = model.predict(all_df['お仕事のポイント_token'].values)
    all_df = all_df.drop(columns=['お仕事のポイント_token'])

    # (派遣先)配属先部署 (assigned department at the client)
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(
            r"[!$%&\'()*+,-./:;<=>?@\\^_`{|}~◆▼★②●☆■★【】『』「」、♪≪≫]", " ")
    ]
    token_filters = [POSKeepFilter(['名詞']), LowerCaseFilter()]
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    length = all_df["応募数mean"].notnull().sum()
    all_df['(派遣先)配属先部署_token'] = np.nan
    all_df['(派遣先)配属先部署_token'][:length] = token_df["(派遣先)配属先部署_token"]
    all_df['(派遣先)配属先部署_token'][length:] = all_df["(派遣先)配属先部署"][length:].apply(
        lambda x: " ".join([token.surface for token in a.analyze(x)]))
    #all_df['(派遣先)配属先部署_token'] = token_df["(派遣先)配属先部署_token"]

    with open("pickles/grid_4.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["(派遣先)配属先部署_pred"] = model.predict(all_df['(派遣先)配属先部署_token'])
    all_df = all_df.drop(columns=['(派遣先)配属先部署_token'])

    # (派遣先)職場の雰囲気 (workplace atmosphere at the client)
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(
            r"[!$%&\'()*+,-./:;<=>?@\\^_`{|}~◆▼★②●☆■★【】『』「」、♪≪≫]", " ")
    ]
    token_filters = [
        CompoundNounFilter(),
        POSKeepFilter(['名詞']),
        LowerCaseFilter()
    ]
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    length = all_df["応募数mean"].notnull().sum()
    all_df['(派遣先)職場の雰囲気_token'] = np.nan
    all_df['(派遣先)職場の雰囲気_token'][:length] = token_df["(派遣先)職場の雰囲気_token"]
    all_df['(派遣先)職場の雰囲気_token'][length:] = all_df["(派遣先)職場の雰囲気"][
        length:].apply(
            lambda x: " ".join([token.surface for token in a.analyze(x)]))
    #all_df['(派遣先)職場の雰囲気_token'] = token_df["(派遣先)職場の雰囲気_token"]

    with open("pickles/grid_5.pickle", mode="rb") as ff:
        model = pickle.load(ff)
    all_df["(派遣先)職場の雰囲気_pred"] = model.predict(all_df['(派遣先)職場の雰囲気_token'])

    with open("pickles/lda_5.pickle", mode="rb") as ff:
        lda = pickle.load(ff)
    X_lda = lda.transform(all_df["(派遣先)職場の雰囲気_token"])
    all_df["(派遣先)職場の雰囲気_lda"] = X_lda.argmax(axis=1)
    all_df = all_df.drop(columns=['(派遣先)職場の雰囲気_token'])

    all_df = all_df.drop(columns=[
        "お仕事名", "仕事内容", "お仕事のポイント(仕事PR)", "(派遣先)配属先部署", "(派遣先)職場の雰囲気"
    ])

    # Label Encoding
    cat_cols = list(all_df.dtypes[all_df.dtypes == "object"].index)
    for col in cat_cols:
        le = LabelEncoder()
        all_df[col] = le.fit_transform(all_df[col].apply(lambda x: str(x)))

    return all_df
Example #25
    ExtractAttributeFilter("base_form")
]
analyzer = Analyzer(char_filters, tokenizer, token_filters)

# Word extraction and stop words

stopwords = []
url = "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"

with urllib.request.urlopen(url) as response:
    stopwords = [w for w in response.read().decode().split('\r\n') if w != ""]

texts_words = {}

for k, v in texts.items():
    texts_words[k] = [w for w in analyzer.analyze(v)]

# Dictionary
dictionary = gensim.corpora.Dictionary(texts_words.values())
dictionary.filter_extremes(no_below=3, no_above=0.4)

# Corpus
corpus = [dictionary.doc2bow(words) for words in texts_words.values()]

# LDA model (unsupervised)
lda = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                      num_topics=4,
                                      id2word=dictionary,
                                      random_state=1)
print('topics: {}'.format(lda.show_topics()))
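
# Minimal follow-up sketch: inspect the topic mixture of the first document
# in the corpus with gensim's standard API.
for topic_id, prob in lda.get_document_topics(corpus[0]):
    print(topic_id, prob)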
Example #26
news_df = article_df[0:].reset_index(drop = True)

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
import numpy as np

t = Tokenizer()
char_filters = [UnicodeNormalizeCharFilter()]
analyzer = Analyzer(char_filters, t)

word_lists = []
words = []

for i, row in news_df.iterrows():
    for t in analyzer.analyze(row[0]):
        # surface form
        surf = t.surface
        word_lists.append([surf])
        # base form
        base = t.base_form
        # part of speech
        pos = t.part_of_speech
        # reading
        reading = t.reading

word_df = pd.DataFrame(word_lists, columns=['単語'])
score_result = word_df


Example #27
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

import logging
logging.basicConfig(level='INFO')

print(u'Analyzer example:')
text = u'蛇の目はPure Pythonな形態素解析器です。'
char_filters = [
    UnicodeNormalizeCharFilter(),
    RegexReplaceCharFilter(u'蛇の目', u'janome')
]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), LowerCaseFilter()]
a = Analyzer(char_filters, tokenizer, token_filters)
for token in a.analyze(text):
    print(token)

print('')
print(u'Analyzer example: Count nouns with POSKeepFilter and TokenCountFilter')
text = u'すもももももももものうち'
token_filters = [POSKeepFilter(u'名詞'), TokenCountFilter()]
a = Analyzer(token_filters=token_filters)
for k, v in a.analyze(text):
    print('%s: %d' % (k, v))
Example #28
    makefile = input('> ')

    # The processed data is in the txt file whose name was entered above.

    gettwitterdata(keyword2, dfile)
    re_1(dfile, makefile)

    print("おそらく書き込みは完了")
    print("データを解析中だと考える。")

    # Put the processed data into a variable and pass that variable to janome for word-frequency counting.
    # The variable is s.
    # makefile2 is the user-entered text data title turned into a text-file name string.
    makefile2 = "" + makefile + ".txt"
    f = open(makefile2)
    s = f.read()
    f.close()

    a = Analyzer(token_filters=[POSKeepFilter(['名詞']), TokenCountFilter()])

    g_count = a.analyze(s)
    # Convert the generator to a list.
    l_count = list(a.analyze(s))
    #print(type(g_count))
    print(type(l_count))
    # <class 'generator'>
    # Print every item:
    # for i in g_count:
    #     print(i)
    print(l_count)
Example #29
def sep(texts):
    token_filters = [POSKeepFilter('名詞'), TokenCountFilter()]
    a = Analyzer(token_filters=token_filters)
    return a.analyze(texts)
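
# Minimal usage sketch: with TokenCountFilter in place, analyze() yields
# (word, count) pairs, so the generator returned by sep() can be iterated directly.
for word, count in sep('すもももももももものうち'):
    print(f'{word}: {count}')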
Example #30
                RegexReplaceCharFilter('<.*?>', '')]
# keep mainly adjectives (形容詞)
token_filters = [POSKeepFilter(["形容詞"]),
                 LowerCaseFilter(),
                 ExtractAttributeFilter('surface')]
a = Analyzer(char_filters=char_filters, token_filters=token_filters)

# For example: split the data by the result of the final interview
copyp=df01New[df01New["PF_L"]==2].reset_index() # pass
copyn=df01New[df01New["PF_L"]==1].reset_index() # resign
copyf=df01New[df01New["PF_L"]==0].reset_index() # fail

# passed the final interview
resultp=[]
for i in range(copyp.shape[0]):
    for x in a.analyze(copyp['value'][i]):
        resultp.append(x)
# resigned (withdrew) at the final interview stage
resultn=[]
for i in range(copyn.shape[0]):
     for x in a.analyze(copyn["value"][i]):
        resultn.append(x)
# failed the final interview
resultf=[]
for i in range(copyf.shape[0]):
    for x in a.analyze(copyf["value"][i]):
        resultf.append(x)
        
# Build a frequency dictionary
from collections import Counter
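# Minimal sketch of that dictionary step: count the extracted adjective tokens
# for each outcome group built above (resultp, resultn, resultf).
print(Counter(resultp).most_common(10))
print(Counter(resultn).most_common(10))
print(Counter(resultf).most_common(10))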
Example #31
from janome.analyzer import Analyzer
from janome.charfilter import *

uzai_data = pd.read_csv('../data/uzai_data.csv')
uzai_data = uzai_data.rename(columns={'Unnamed: 0': 'index'})

user_dict = UserDictionary('neologd.csv', 'utf8', 'ipadic', sysdic.connections)
user_dict.save('neologd')

t = Tokenizer(udic='userdic.csv', udic_enc='utf8')
char_filters = [UnicodeNormalizeCharFilter()]
analyzer = Analyzer(char_filters=char_filters, tokenizer=t)

uzai_words_list = []
for i, row in uzai_data.iterrows():
    for t in analyzer.analyze(row[4]):
        surf = t.surface  # surface form
        base = t.base_form  # base form
        pos = t.part_of_speech  # part of speech
        reading = t.reading  # reading
        phonetic = t.phonetic  # phonetic reading

        uzai_words_list.append([i, surf, base, pos, reading, phonetic])

uzai_words_list = pd.DataFrame(
    uzai_words_list, columns=['index', '単語', '基本形', '品詞', '読み', '振り仮名'])

uzai_morpheme_data = pd.merge(uzai_data,
                              uzai_words_list,
                              how='left',
                              on='index')
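
# Minimal follow-up sketch: inspect the part-of-speech distribution of the
# merged table with pandas.
print(uzai_morpheme_data['品詞'].value_counts().head(10))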