Example #1
def word_frequencies(text):
    from natto import MeCab
    from manabi.apps.reading_level.word_frequencies import WORD_FREQUENCIES

    mecab = MeCab()
    frequencies = []
    for node in mecab.parse(text.encode('utf8'), as_nodes=True):
        frequency = WORD_FREQUENCIES.get(node.surface.decode('utf8'))
        if frequency is None:
            continue
        frequencies.append(frequency)
    return frequencies
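
A minimal usage sketch for the helper above (hypothetical, kept in the same Python 2 style as the snippet's encode/decode calls; it assumes WORD_FREQUENCIES maps surface forms to frequency scores):

# Hypothetical usage: mean corpus frequency of the words in a sentence.
freqs = word_frequencies(u'吾輩は猫である。')
if freqs:
    print(sum(freqs) / float(len(freqs)))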
Example #2
class MecabTagger(object):
    """docstring, MecabTagger"""
    # TAGSET = set("""NNG NNP NNB NNBC NR NP VV VA VX VCP VCN MM MAG MAJ IC
    #                 JKS JKC JKG JKO JKB JKV JKQ JX JC EP EF EC ETN ETM
    #                 XPN XSN XSV XSA XR SF SE SSO SSC SC SY SL SH SN
    #                 UNKNOWN EOS""".split())

    def __init__(self, **kwargs):
        self.tagger = MeCab(kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        del self.tagger

    @staticmethod
    def tagged_tuple(node):
        surface = node.surface
        features = node.feature.split(',')
        first_pos = features[0].partition('+')[0]
        lemma = (features[7].partition('/')[0]
                    if features[4].startswith('Inflect')
                    else surface.lower())
        return Word(decode(surface, True), decode(lemma, True),
                    first_pos.decode('ascii'), node.cost)

    def parse(self, text):  # follow NLTK naming
        return [MecabTagger.tagged_tuple(node)
                    for node in self.tagger.parse(text.encode(settings.DEFAULT_ENCODING), as_nodes=True)
                        if not node.is_eos()]
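
A minimal, hypothetical usage sketch for MecabTagger above (it assumes a dictionary whose feature string has the fields indexed in tagged_tuple, e.g. mecab-ko-dic, plus the project's Word, decode and settings helpers, which the snippet does not show):

# Hypothetical usage: tag a sentence and print the resulting Word tuples.
with MecabTagger() as tagger:
    for word in tagger.parse(u'아버지가 방에 들어가신다'):
        print(word)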
Example #3
File: rnnlm.py Project: dkubo/legalNLP
if __name__ == '__main__':
    path = '../data/hanreiDB'
    vocab = defaultdict(lambda: len(vocab))

    # open the DB
    db = hanrei_db.SQLite3(path)
    cur = db.open_db()

    # read the data
    sql = "select id, syubunPart from hanrei where id<=150"
    #	sql = u"select id, syubunPart, riyuPart from hanrei"
    rows = db.exe_to_db(cur, sql)
    train_data = []
    test_data = []
    nm = MeCab()
    for doc_id, syubunPart in rows:
        print "--------------"
        print "id:", doc_id
        # remove newlines and whitespace
        syubunPart = re.sub(r'(\n|\t| | )', '', syubunPart)
        # split into sentences
        sensp = sensplit.SenSplit(syubunPart)
        syubun_list = sensp()

        for sentence in syubun_list:
            if sentence == '':
                continue
            morph_list = []  # list of the sentence's morphemes
            sentence = sentence.encode('utf_8')  # unicode -> str (utf-8)
            for n in nm.parse(sentence, as_nodes=True):
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='Chainer example: convolutional seq2seq')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=48,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=512,
                        help='Number of units')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=6,
                        help='Number of layers')
    parser.add_argument('--head',
                        type=int,
                        default=8,
                        help='Number of heads in attention mechanism')
    parser.add_argument('--dropout',
                        '-d',
                        type=float,
                        default=0.1,
                        help='Dropout rate')
    parser.add_argument('--model', type=str, help='trained model')
    parser.add_argument('--input',
                        '-i',
                        type=str,
                        default='./',
                        help='Input directory')
    parser.add_argument('--source',
                        '-s',
                        type=str,
                        default='europarl-v7.fr-en.en',
                        help='Filename of train data for source language')
    parser.add_argument('--target',
                        '-t',
                        type=str,
                        default='europarl-v7.fr-en.fr',
                        help='Filename of train data for target language')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--source-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of source language')
    parser.add_argument('--target-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of target language')
    parser.add_argument('--no-bleu',
                        '-no-bleu',
                        action='store_true',
                        help='Skip BLEU calculation')
    parser.add_argument('--use-label-smoothing',
                        action='store_true',
                        help='Use label smoothing for cross entropy')
    parser.add_argument('--embed-position',
                        action='store_true',
                        help='Use position embedding rather than sinusoid')
    parser.add_argument('--use-fixed-lr',
                        action='store_true',
                        help='Use fixed learning rate rather than the ' +
                        'annealing proposed in the paper')
    parser.add_argument('--disable-mecab',
                        '--dm',
                        action='store_true',
                        help='disable MeCab tokenization')
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=4))

    # Check file
    en_path = os.path.join(args.input, args.source)
    source_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(en_path, args.source_vocab)
    source_data = preprocess.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target)
    target_vocab = ['<eos>', '<unk>', '<bos>'] + \
        preprocess.count_words(fr_path, args.target_vocab)
    # print('Original training data size: %d' % len(source_data))
    # print('Filtered training data size: %d' % len(train_data))

    source_ids = {word: index for index, word in enumerate(source_vocab)}
    target_ids = {word: index for index, word in enumerate(target_vocab)}

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    m = MeCab('-Owakati')

    # Define Model
    model = net.Transformer(args.layer,
                            min(len(source_ids), len(source_words)),
                            min(len(target_ids), len(target_words)),
                            args.unit,
                            h=args.head,
                            dropout=args.dropout,
                            max_length=500,
                            use_label_smoothing=args.use_label_smoothing,
                            embed_position=args.embed_position)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    chainer.serializers.load_npz(args.model, model)

    def translate_one(source, target):
        words = preprocess.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array([source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x], beam=5)[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    def tokenize(source, target):
        if args.disable_mecab:
            return source, target
        return m.parse(source), m.parse(target)

    while True:
        source = input('source> ')
        target = input('target> ')
        source, target = tokenize(source, target)
        translate_one(source, target)
Example #5
 def __init__(self, vocab: Vocab):
     self.vocab = vocab
     MeCab = try_mecab_import()  # type: ignore[func-returns-value]
     self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
Example #6
def tokenize(mecab: MeCab, sentence: str) -> List[str]:
    return [
        node.surface for node in mecab.parse(sentence, as_nodes=True)
        if node.surface
    ]
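
A minimal, hypothetical usage sketch for tokenize above (assumes "from typing import List" and "from natto import MeCab" in the surrounding module):

# Hypothetical usage: collect the surface forms of one sentence.
mecab = MeCab()
print(tokenize(mecab, '今日はいい天気です。'))  # prints the list of surface strings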
Example #7
 def _non_wrapped_insert_mode(self, session, *, is_develop_mode=True):
     with MeCab() as mecab:
         manalysis.insert(session, mecab, is_develop_mode=is_develop_mode)
     morph.insert(session, is_develop_mode=is_develop_mode)
Example #8
 def __init__(self, **kwargs):
     self.tagger = MeCab(kwargs)
Example #9
# -*- coding: utf-8 -*-

from natto import MeCab

mc = MeCab()

# Text from cookbiz.jp
text = "お仕事については基本的には店舗に配属してからのOJTが中心となりますが、先輩スタッフがしっかりとサポートしてくれるので、どなたも安心してお仕事していただけます。2013年には本社内に開発室を設置。店舗配属前にもトレーニングを行なってから実際の店舗に配属されるなど、サポート体制がしっかりと整っているのも当社の魅力。実際、経験が浅い方や未経験スタートのスタッフも多数活躍中!"

print('Input text:\n' + text)

print('====================================================')

# The -F / --node-format option specifies the node output format
#
# %m    ... surface form of the morpheme
# %f[0] ... part of speech
# %h    ... part-of-speech ID (IPADIC)
# %f[8] ... pronunciation
#
words = []
with MeCab('-F%m,%f[0],%h') as nm:
    for n in nm.parse(text, as_nodes=True):
        node = n.feature.split(',')
        if len(node) != 3:
            continue
        if node[1] == '名詞':
            # if True:
            words.append(node[0])
print(words)
Example #11
                        node.surface,  # surface form
                        feature[0],  # POS 1
                        feature[1],  # POS 2
                        feature[2],  # POS 3
                        feature[3],  # POS 4
                        feature[6],  # base form
                        node.cost,  # cost
                        node.posid  # POS ID
                    ],
                    index=df.columns)
                df = df.append(series, ignore_index=True)
    return df


if __name__ == '__main__':
    parser = MeCab(
        "-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")
    #parser = MeCab()
    nodes = parser.parse("私は人間です。", as_nodes=True)

    for node in nodes:
        if not node.is_eos():
            print(node.surface, " : ", node.feature, node.cost, node.posid)

    text = """
        私は人間です。
        呼吸と食事ができます。
        私は望遠鏡で泳ぐ少女を見た。
    """
    df = mecab_parse2df(text, parser)
    print(df)
Example #12
 def __init__(self):
     self.mc = MeCab()
Example #13
# author: alex
from natto import MeCab

# 31 + 32
with open("verbs.txt", "w+"):
    pass
text = open("neko.txt", "r+")
res_file = open("verbs.txt", "a+")
reader = text.readlines()
for line in reader:
    with MeCab('-F%f[0],%f[6]') as nm:
        for n in nm.parse(line, as_nodes=True):
            if not n.is_eos() and n.is_nor():
                klass, word = n.feature.split(',', 1)
                if klass in ['動詞']:  #['名詞', '形容詞', '形容動詞','動詞']:
                    print word
                    res_file.write(word + ' ')
                    res_file.write('\n')
text.close()
res_file.close()

# 33
with open("neko_hen.txt", "w+"):
    pass
text = open("neko.txt", "r+")
res_file = open("neko_hen.txt", "a+")
reader = text.readlines()
for line in reader:
    with MeCab('-F%f[1],%f[6]') as nm:
        for n in nm.parse(line, as_nodes=True):
            if not n.is_eos() and n.is_nor():
Example #14
"""
Sample program that parses site sentences to generate keywords for them.
"""
from natto import MeCab
nm = MeCab()
text = "this is a test"
print nm.parse(text)
Example #15
# -*- coding:utf-8 -*-

from natto import MeCab
import collections as cl
import random
from bs4 import BeautifulSoup

import Lib_py.params as params
import Lib_py.study as study

# global instance
mc_g = MeCab('-F%m,%f[0],%h')


def get_next_search_candidate(html_src):
    target_str = get_str_from_http(html_src)
    if len(target_str) == 0:
        raise ValueError("can't extract strings from html source")
    parsed_words_l = get_words_list(target_str)
    print(parsed_words_l)
    study.dump_get_words(parsed_words_l)
    common_words_l = study.get_common_words()
    common_removed_words_l = [
        w for w in parsed_words_l if w not in common_words_l
    ]
    freq_words_l = get_freq_words_list(common_removed_words_l)
    return select_next_search_word_candidate(freq_words_l)


def get_str_from_http(html_src):
    soup = BeautifulSoup(html_src)
Example #16
File: bow.py Project: spliew/naive_bayes
# -*- coding: utf-8 -*-

from natto import MeCab
from numpy import array
from gensim import corpora, matutils

nm = MeCab('-F%m,%f[0],%h')


def get_meishi(sentence):
    """名詞だけを取り出す。
    :param sentence: String
    :return words: list of String.

    入力例) get_meishi("ピンチ、ピンチの時には必ずヒーローが現れる。")
    ==> ['ピンチ', 'ピンチ', '時', 'ヒーロー']
    """
    # The -F / --node-format option specifies the node output format
    #
    # %m    ... surface form of the morpheme
    # %f[0] ... part of speech
    # %h    ... part-of-speech ID (IPADIC)
    # %f[8] ... pronunciation
    #
    words = []

    for n in nm.parse(sentence, as_nodes=True):
        node = n.feature.split(',')
        if len(node) != 3:
            continue
        if node[1] == '名詞':
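            # The snippet is cut off here; judging from the docstring and
            # Example #9, the continuation is presumably:
            words.append(node[0])
    return words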
Example #17
import os
from natto import MeCab
# load libraries for building a bag of words
from gensim import corpora, matutils

mc = MeCab()
txt_word_list = []

# read the folder that holds the text files
files = os.listdir(os.path.dirname(__file__)+'/path/txt')

# read the text files under the folder one by one
for file in files:

  # build a list of noun and verb words from each text file (same as Q11-1)
  with open(os.path.dirname(__file__) + '/path/txt/'+file, 'r') as f:
    txt = f.read()
    word_list = []
    for n in mc.parse(txt, as_nodes=True):
      if not (n.is_bos() or n.is_eos()):
        part, word = n.feature.split(',', 1)
        if part == "名詞" or part == "動詞":
          word_list.append(n.surface)

  # add the word list for this text file
  txt_word_list.append(word_list)

# to build the bag of words, collect every distinct word and create a dictionary that assigns word IDs
corpus_dic = corpora.Dictionary(txt_word_list)

# convert each document's word list into a corpus (word IDs and occurrence counts) list
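# The snippet is cut off here; the conversion it describes would presumably
# mirror Example #21, e.g.:
corpus_list = [corpus_dic.doc2bow(word_list) for word_list in txt_word_list]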
Example #18
def main():
    nm = MeCab('-Owakati')
    word = "MeCabは 京都大学情報学研究科−日本電信電話株式会社コミュニケーション科学基礎研究所 共同研究ユニットプロジェクトを通じて開発されたオープンソース 形態素解析エンジンです。 言語, 辞書,コーパスに依存しない汎用的な設計を 基本方針としています。 パラメータの推定に Conditional Random Fields (CRF) を用 いており, ChaSenが採用している 隠れマルコフモデルに比べ性能が向上しています。また、平均的に ChaSen, Juman, KAKASIより高速に動作します。 ちなみに和布蕪(めかぶ)は, 作者の好物です。"
    print(nm.parse(word))
    lis = [n.surface for n in nm.parse(word, as_nodes=True) if n.is_nor()]
    print(lis)
Example #19
# coding:utf-8
import pandas as pd
from natto import MeCab
mc = MeCab()

select = pd.read_csv('./input/keyword.csv', encoding='SHIFT-JIS', header=None)
select = select[0].values.tolist()

tango_retu = []
score_retu = []

# Japanese Sentiment Polarity Dictionary (predicates), ver. 1.0 (Dec. 2008)
# terms labeled ポジ (positive) are scored 1, ネガ (negative) terms -1
with open("./dictionary/wago.121808.pn.txt", 'r') as f:
    for l in f.readlines():
        l = l.split('\t')
        l[1] = l[1].replace(" ", "").replace('\n', '')
        value = 1 if l[0].split('(')[0] == "ポジ" else -1

        tango_retu.append(l[1])
        score_retu.append(value)

wago_dic = dict(zip(tango_retu, score_retu))

tango_retu = []
score_retu = []

# Japanese Sentiment Polarity Dictionary (nouns), ver. 1.0 (Dec. 2008)
# terms labeled p are scored 1, e terms 0, and n terms -1
with open("./dictionary/pn.csv.m3.120408.trim", 'r') as f:
    for l in f.readlines():
Example #20
# -*- coding: utf-8 -*-

from sklearn.feature_extraction.text import CountVectorizer
from natto import MeCab


_morpheme_type = ['NNG', 'NNP']
_escape_pattern = ['\n']
_nm = MeCab()


def filter_by_type(text):
    terms = []
    for term_info in str(_nm.parse(text)).split('\n'):
        _term_info = term_info.split('\t')
        if len(_term_info) < 2:
            continue
        surface = _term_info[0]
        analysis = _term_info[1].split(',')
        if analysis[0] in _morpheme_type:
            terms.append(surface)
    return terms


def generate_corpus2(data_path):
    _corpus = []
    fp = open(data_path, 'r')
    for line in fp.readlines():
        if line not in _escape_pattern:
            terms = filter_by_type(line)
            _corpus.append(' '.join(terms))
Example #21
from natto import MeCab
import os
from gensim import corpora

mc = MeCab()

txt_list = []
files = os.listdir(os.path.dirname(__file__) + '/path/txt')
for file in files:
    with open(os.path.dirname(__file__) + '/path/txt/' + file, 'r') as f:
        txt = f.read()
        word_list = []
        for n in mc.parse(txt, as_nodes=True):
            if not (n.is_bos() or n.is_eos()):
                part, word = n.feature.split(',', 1)
                if part == "名詞" or part == "動詞":
                    word_list.append(n.surface)
        txt_list.append(word_list)

dictionary = corpora.Dictionary(txt_list)
corpus_list = [dictionary.doc2bow(txt) for txt in txt_list]

# the book's code starts from the next line
from gensim import matutils, models

# the code that prepares corpus_list is omitted

# build the TF-IDF model
tfidf_model = models.TfidfModel(corpus_list, normalize=True)

# apply TF-IDF to the corpus
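# The snippet is cut off here; applying the model to the corpus would
# presumably look like:
tfidf_corpus = tfidf_model[corpus_list]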
Example #22
# load the libraries needed to use MeCab from Python
import os
from natto import MeCab

# meros holds the text data of "Melos"
# create the object that runs MeCab
mc = MeCab()

# when the input is a text file, read it as follows
with open(os.path.dirname(__file__) + '/path/txt/meros.txt', 'r') as f:
  txt = f.read()

word_list = []
# run morphological analysis with MeCab
for part_and_word in mc.parse(txt, as_nodes=True):

  # check that part_and_word is not a beginning/end-of-sentence node
  if not (part_and_word.is_bos() or part_and_word.is_eos()):

    # get the part of speech and the word from the analysis result
    part, word = part_and_word.feature.split(',', 1)

    # keep only noun and verb words
    if part == '名詞' or part == '動詞':
      word_list.append(part_and_word.surface)
Example #23
# %%
from IPython.display import display, HTML
from natto import MeCab

nm = MeCab()
a = ""
text = "こんにちは!野球は走る、打つ、投げるスポーツです。"
print(text)

with MeCab('-F%m,%f[0],%h,%f[8]') as nm:
    for n in nm.parse(text, as_nodes=True):
        lis = n.feature.split(",")
        try:
            if lis[1] == "動詞":
                b = (
                    "<span style='background-color:#ffcc99'>{0}</span>".format(
                        lis[0]))
            else:
                b = (
                    "<span style='background-color:#ffffff'>{0}</span>".format(
                        lis[0]))
            a = a + b
        except IndexError:
            pass

display(HTML(a))