예제 #1
0
def preprocess(doc, debug=False):
    """Split a document into sentences and build a tokenized corpus from them.

    The document is lower-cased and split on newlines and the Japanese full
    stop.  Each piece is stripped and empty pieces are dropped.  Every
    remaining sentence is converted with ``mojimoji.zen_to_han`` and analyzed
    with janome, keeping only the base forms of nouns, adjectives, adverbs
    and verbs.

    @param doc the document to process (a string)
    @param debug when true, print the tokenized corpus, one sentence per line
    @return a tuple ``(sentences, corpus)``: the cleaned sentences and, at the
            same positions, the space-joined base forms with a trailing full
            stop appended
    """
    pieces = (piece.strip() for piece in re.split("\n|。", doc.lower()))
    sentences = [piece for piece in pieces if piece != ""]
    half_width = [mojimoji.zen_to_han(sentence) for sentence in sentences]

    # Keyword arguments are used because janome >= 0.4 made the Analyzer
    # constructor keyword-only; older releases accept the same keywords.
    analyzer = Analyzer(
        char_filters=[
            UnicodeNormalizeCharFilter(),
            RegexReplaceCharFilter(r'[(\)、。「」]', ' '),
        ],
        tokenizer=JanomeTokenizer(),
        token_filters=[
            POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
            ExtractAttributeFilter('base_form'),
        ],
    )
    corpus = [' '.join(analyzer.analyze(line)) + '。' for line in half_width]
    if debug:
        print("\n".join(corpus))

    return sentences, corpus
예제 #2
0
def janome_document_summarize(document):
    """Summarize a Japanese document with LexRank and print the result.

    Newlines are removed and the document is split into sentences at the
    Japanese full stop.  Each sentence is reduced to the base forms of its
    nouns, adjectives, adverbs and verbs before being handed to the LexRank
    summarizer.  The number of sentences requested is
    ``int(len(corpus) / 10 * 3)``, or 3 when that is not positive.

    Returns a tuple ``(summary, rst)``: the summarizer's result and the
    concatenation of the original sentences that were selected.
    """
    # Morphological analysis: split each sentence into words.
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'[(\)「」、。]', ' '),
    ]
    token_filters = [
        POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
        ExtractAttributeFilter('base_form'),
    ]
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=JanomeTokenizer(),
                        token_filters=token_filters)

    text = re.findall("[^。]+。?", document.replace('\n', ''))
    corpus = [' '.join(analyzer.analyze(sentence)) + u'。' for sentence in text]

    parser = PlaintextParser.from_string(''.join(corpus),
                                         Tokenizer('japanese'))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ', '。', '\n']

    sentence_count = int(len(corpus) / 10 * 3)
    if sentence_count <= 0:
        sentence_count = 3
    summary = summarizer(document=parser.document,
                         sentences_count=sentence_count)

    # Map every selected (tokenized) sentence back to its original text.
    selected = []
    print('\n要約:')
    for sentence in summary:
        original = text[corpus.index(str(sentence))]
        print(original)
        selected.append(original)
    return summary, ''.join(selected)
예제 #3
0
    def __init__(self, save_dir="result/"):
        """Set up the analyzer, the result containers, the filter list and the word cloud.

        ``save_dir`` is stored on the instance.  ``self.wc_max_words`` is read
        here, so it must already be available (presumably a class attribute
        defined outside this method -- confirm against the class body).
        """
        self.save_dir = save_dir

        # Analyzer: keep nouns, replace numerics, and count tokens.
        self.ana = Analyzer(
            char_filters=[UnicodeNormalizeCharFilter()],
            token_filters=[
                POSKeepFilter(['名詞']),
                NumericReplaceFilter(),
                TokenCountFilter(),
            ])

        # Per-person analysis results.
        self.noun = {}
        self.emoji = {}

        # Sanitizing filter: one entry per line of FILTER_FILE, with
        # trailing whitespace removed.
        self.filter = []
        with codecs.open(FILTER_FILE, "r", "utf-8") as rf:
            self.filter.extend(line.rstrip() for line in rf.readlines())

        # Word cloud renderer (no mask is applied).
        self.wc = wordcloud.WordCloud(
            background_color="white",
            max_words=self.wc_max_words,
            max_font_size=550,
            width=1920,
            height=1080,
            font_path=FONT_PATH)
예제 #4
0
def make_corpus(docs, debug=False):
    """Build a tokenized corpus from several documents.

    Each document is lower-cased and split on newlines and the Japanese full
    stop; pieces that are blank after stripping are dropped.  The remaining
    lines are converted with ``mojimoji.zen_to_han`` and analyzed with
    janome, keeping only the base forms of nouns, adjectives, adverbs and
    verbs.

    @param docs list of documents (strings)
    @param debug when true, print each document's tokens on one line
    @return list with one flat list of tokens per document
    """
    split_docs = [
        [line for line in re.split("\n|。", doc.lower()) if line.strip() != ""]
        for doc in docs
    ]
    half_width_docs = [
        [mojimoji.zen_to_han(line) for line in lines] for lines in split_docs
    ]

    # Keyword arguments are used because janome >= 0.4 made the Analyzer
    # constructor keyword-only; older releases accept the same keywords.
    analyzer = Analyzer(
        char_filters=[
            UnicodeNormalizeCharFilter(),
            RegexReplaceCharFilter(r'[(\)、。「」]', ' '),
        ],
        tokenizer=JanomeTokenizer(),
        token_filters=[
            POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
            ExtractAttributeFilter('base_form'),
        ],
    )

    corpus = [
        list(
            itertools.chain.from_iterable(
                analyzer.analyze(line) for line in lines))
        for lines in half_width_docs
    ]

    if debug:
        # corpus is a list of token lists, so each document's tokens must be
        # joined first; joining the outer list directly raised TypeError.
        print("\n".join(" ".join(tokens) for tokens in corpus))

    return corpus
예제 #5
0
파일: LexRank.py 프로젝트: mkdk09/LexRank
    def tense_analyze(self, text, sentences_count):
        """Summarize ``text`` with LexRank.

        The text is split into sentences at the Japanese full stop.  Each
        sentence is reduced to the base forms of its nouns, adjectives,
        adverbs and verbs, and the resulting corpus is summarized.

        Returns a tuple ``(sentences, corpus, summary)``.  ``sentences`` and
        ``corpus`` have the same length and order; ``summary`` is the
        summarizer's result for ``sentences_count`` sentences.
        """
        # Sentences are separated by the full stop (not by newlines).
        sentences = text.split('。')

        # Build the morphological analyzer.  Keyword arguments are used
        # because janome >= 0.4 made the constructor keyword-only; older
        # releases accept the same keywords.
        analyzer = Analyzer(
            # Brackets, commas and full stops are all replaced by a space.
            char_filters=[
                UnicodeNormalizeCharFilter(),
                RegexReplaceCharFilter(r'[(\)「」、。]', ' '),
            ],
            tokenizer=JanomeTokenizer(),
            # Keep only base forms of nouns, adjectives, adverbs and verbs.
            token_filters=[
                POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
                ExtractAttributeFilter('base_form'),
            ],
        )

        # Join the extracted words with spaces.  The trailing full stop lets
        # the sumy tokenizer used below split the text into sentences again.
        corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]

        # Re-tokenize the joined corpus with sumy's Japanese tokenizer.
        parser = PlaintextParser.from_string(''.join(corpus),
                                             Tokenizer('japanese'))

        # Extract ``sentences_count`` summary sentences with LexRank.
        summarizer = LexRankSummarizer()
        # A space would otherwise count as a word, so make it a stop word.
        summarizer.stop_words = [' ']
        summary = summarizer(document=parser.document,
                             sentences_count=sentences_count)

        return sentences, corpus, summary
예제 #6
0
 def test_analyze(self):
     """Check the surfaces produced by an Analyzer with char and token filters."""
     char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(u'蛇の目', u'janome')]
     tokenizer = Tokenizer()
     token_filters = [CompoundNounFilter(), POSStopFilter([u'記号', u'助詞']), LowerCaseFilter(), ExtractAttributeFilter('surface')]
     # Keyword arguments are used because janome >= 0.4 made the Analyzer
     # constructor keyword-only; older releases accept the same keywords.
     a = Analyzer(char_filters=char_filters,
                  tokenizer=tokenizer,
                  token_filters=token_filters)
     tokens = a.analyze(u'蛇の目はPure Pythonな形態素解析器です。')
     expected = [u'janome', u'pure', u'python', u'な', u'形態素解析器', u'です']
     self.assertEqual(expected, list(tokens))
예제 #7
0
def summarize(text):
    """Return a three-sentence LexRank summary of ``text``.

    ``text`` is split into sentences at newlines.  Each sentence is reduced
    to the base forms of its nouns, adjectives, adverbs and verbs, the
    resulting corpus is summarized, and the original sentences matching the
    selected corpus entries are concatenated and returned.
    """
    sentences = text.split('\n')

    # Keyword arguments are used because janome >= 0.4 made the Analyzer
    # constructor keyword-only; older releases accept the same keywords.
    analyzer = Analyzer(
        # Brackets, commas and full stops are all replaced by a space.
        char_filters=[
            UnicodeNormalizeCharFilter(),
            RegexReplaceCharFilter(r'[(\)「」、。]', ' '),
        ],
        tokenizer=JanomeTokenizer(),
        # Keep only base forms of nouns, adjectives, adverbs and verbs.
        token_filters=[
            POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
            ExtractAttributeFilter('base_form'),
        ],
    )

    corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]

    parser = PlaintextParser.from_string(''.join(corpus),
                                         Tokenizer('japanese'))

    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']

    summary = summarizer(document=parser.document, sentences_count=3)

    # Map each selected corpus entry back to its original sentence.
    return ''.join(
        sentences[corpus.index(str(sentence))] for sentence in summary)
예제 #8
0
def get_words_list(sentence):
    """
    Extract the lower-cased noun surfaces of a sentence.

    Text matching ``<.*?>`` is removed before analysis.  When exactly four
    words are found, the first two are merged into one.

    :param sentence: string
    :return: list of string
    """
    analyzer = Analyzer(
        char_filters=[
            UnicodeNormalizeCharFilter(),
            RegexReplaceCharFilter('<.*?>', ''),
        ],
        token_filters=[
            POSKeepFilter(['名詞']),
            LowerCaseFilter(),
            ExtractAttributeFilter('surface'),
        ],
    )

    words_list = list(analyzer.analyze(sentence))

    # Japanese words can be split into two different words: with exactly
    # four results, glue the first two back together.
    if len(words_list) == 4:
        words_list[:2] = [words_list[0] + words_list[1]]

    return words_list
예제 #9
0
 def _tokenize_elements(self, replace_word, pos_filter):
     """Tokenize every entry of ``self._text``.

     Text matching the regular expression ``replace_word`` is removed before
     tokenizing.  Tokens are lower-cased, and tokens whose part of speech is
     listed in ``pos_filter`` are dropped.

     Returns one list of tokens per entry of ``self._text``.
     """
     # Keyword arguments are used because janome >= 0.4 made the Analyzer
     # constructor keyword-only; older releases accept the same keywords.
     analyzer = Analyzer(
         char_filters=[
             UnicodeNormalizeCharFilter(),
             RegexReplaceCharFilter(replace_word, ""),
         ],
         tokenizer=Tokenizer(),
         token_filters=[LowerCaseFilter(), POSStopFilter(pos_filter)],
     )
     return [list(analyzer.analyze(text)) for text in self._text]
예제 #10
0
def fn_start_document_summarize(text):
    """Summarize a sequence of sentences with LexRank.

    ``text`` is iterated element by element and every element is analyzed as
    one sentence, so it is expected to be a sequence of sentences
    (NOTE(review): a plain string would be processed character by character
    -- confirm against the callers).

    Three sentences are requested when the corpus has fewer than 100
    entries; otherwise ``int(len(corpus) / 100)`` sentences are requested.

    Returns the concatenation of the original sentences that were selected.
    """
    # Morphological analysis: split each sentence into words.
    # janome's Tokenizer takes a user-dictionary path as its first argument,
    # not a language name (that belongs to the sumy Tokenizer used below),
    # so it is created without arguments.
    tokenizer = JanomeTokenizer()
    char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(r'[(\)「」、。]', ' ')]
    token_filters = [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), ExtractAttributeFilter('base_form')]

    analyzer = Analyzer(
        char_filters=char_filters,
        tokenizer=tokenizer,
        token_filters=token_filters
    )

    corpus = [' '.join(analyzer.analyze(sentence)) + u'。' for sentence in text]

    # Run the summarization on the re-joined corpus.
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))

    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']

    # Number of sentences to extract (see the docstring for the rule).
    N = 3
    sentences_count = N if len(corpus) < 100 else int(len(corpus) / 100)

    summary = summarizer(document=parser.document, sentences_count=sentences_count)

    # Map each selected corpus entry back to its original sentence; a list
    # plus join avoids shadowing the builtin ``str``.
    selected = [text[corpus.index(sentence.__str__())] for sentence in summary]
    return ''.join(selected)
예제 #11
0
    def __init__(self):
        """Create the morphological analyzer and the word-cloud object.

        The analyzer uses the user dictionary ``sipudic.csv`` and keeps
        compound nouns, nouns, verbs, adjectives and adverbs.  The word
        cloud takes its stop words from the module-level ``stop_words``.
        """
        # Font with Japanese glyphs so that Japanese text can be rendered.
        # Raw string: the backslashes are path separators, not escapes
        # (the non-raw form relied on invalid escape sequences).
        fpath = r'C:\Windows\Fonts\SourceHanCodeJP-Regular.otf'

        # Character filters: normalize, then remove parentheses.
        char_filters = [
            UnicodeNormalizeCharFilter(),
            RegexReplaceCharFilter(r'\(', ''),
            RegexReplaceCharFilter(r'\)', '')
        ]
        # Token filters.
        token_filters = [
            CompoundNounFilter(),
            POSKeepFilter(['名詞', '動詞', '形容詞', '副詞'])
        ]

        # Morphological analyzer with a user dictionary.
        tokenizer = Tokenizer('sipudic.csv',
                              udic_type='simpledic',
                              udic_enc='utf8')
        # Keyword arguments are used because janome >= 0.4 made the Analyzer
        # constructor keyword-only; older releases accept the same keywords.
        self.__analyzer = Analyzer(char_filters=char_filters,
                                   tokenizer=tokenizer,
                                   token_filters=token_filters)

        # Word-cloud object.
        self.__wdcObj = WordCloud(background_color='white',
                                  font_path=fpath,
                                  width=900,
                                  height=500,
                                  stopwords=set(stop_words))
예제 #12
0
 def __init__(self):
     """Load the ``ja_ginza`` spaCy model and build the janome analyzer."""
     self.nlp = spacy.load('ja_ginza')
     # Keyword arguments are used because janome >= 0.4 made the Analyzer
     # constructor keyword-only; older releases accept the same keywords.
     self.analyzer = Analyzer(
         # Brackets, commas and full stops are all replaced by a space.
         char_filters=[
             UnicodeNormalizeCharFilter(),
             RegexReplaceCharFilter(r'[(\)「」、。]', ' '),
         ],
         tokenizer=JanomeTokenizer(),
         # Keep only base forms of nouns, adjectives, adverbs and verbs.
         token_filters=[
             POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
             ExtractAttributeFilter('base_form'),
         ],
     )
예제 #13
0
def get_corpus_list(sentence_list):
    """Convert each string into its morphemes joined by single spaces.

    Args:
    sentence_list: list,
    strings to analyze; each element is analyzed as a whole

    Returns:
    corpus_list: list
    for every input element, the surfaces of its morphemes (duplicates
    included) joined by one space
    """
    # Analyzer with default settings: no char filters, default tokenizer,
    # no token filters.
    # TODO select the parts of speech to keep through an argument
    a = Analyzer()

    return [
        " ".join(token.surface for token in a.analyze(sentence))
        for sentence in sentence_list
    ]
예제 #14
0
def get_words_list(sentence):
    """
    Extract words from a sentence, drop stop words, and combine adjacent words.

    The analyzer keeps nouns, verbs and auxiliary verbs (lower-cased
    surfaces).  Stop words are then removed, and finally the sentence is
    tokenized again so that ``word_assembler`` (defined elsewhere) can be
    called for every token and every part-of-speech pair in ``word_kinds``.

    :param sentence: string
    :return: list of string
    """
    result_list = []

    # Normalize the text and remove anything that matches '<.*?>'.
    char_filters = [
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter('<.*?>', '')
    ]

    token_filters = [
        POSKeepFilter(['名詞', '動詞', '助動詞']),
        LowerCaseFilter(),
        ExtractAttributeFilter('surface')
    ]  # keeps nouns, verbs and auxiliary verbs, as lower-cased surfaces

    a = Analyzer(char_filters=char_filters, token_filters=token_filters)

    # ASCII periods are removed before analysis.
    sentence = sentence.replace('.', '')
    for token in a.analyze(sentence):
        result_list.append(token)

    stop_words = [
        "あそこ", "あっ", "あの", "あのかた", "あの人", "あり", "あります", "ある", "あれ", "い", "いう",
        "います", "いる", "う", "うち", "え", "お", "および", "おり", "おります", "か", "かつて",
        "から", "が", "き", "ここ", "こちら", "こと", "この", "これ", "これら", "さ", "さらに", "し",
        "しかし", "する", "ず", "せ", "せる", "そこ", "そして", "その", "その他", "その後", "それ",
        "それぞれ", "それで", "た", "ただし", "たち", "ため", "たり", "だ", "だっ", "だれ", "つ", "て",
        "で", "でき", "できる", "です", "では", "でも", "と", "という", "といった", "とき", "ところ",
        "として", "とともに", "とも", "と共に", "どこ", "どの", "な", "ない", "なお", "なかっ", "ながら",
        "なく", "なっ", "など", "なに", "なら", "なり", "なる", "なん", "に", "において", "における",
        "について", "にて", "によって", "により", "による", "に対して", "に対する", "に関する", "の", "ので",
        "のみ", "は", "ば", "へ", "ほか", "ほとんど", "ほど", "ます", "また", "または", "まで", "も",
        "もの", "ものの", "や", "よう", "より", "ら", "られ", "られる", "れ", "れる", "を", "ん",
        "何", "及び", "彼", "彼女", "我々", "特に", "私", "私達", "貴方", "貴方方"
    ]
    # list.remove() deletes only the first occurrence, so a stop word that
    # appears several times is removed once.
    for word in stop_words:
        if word in result_list:
            result_list.remove(word)
        # sentence = sentence.replace(word, '')
    t = Tokenizer()

    # Part of speech and surface of the previously seen token.
    # NOTE(review): this starts as a list holding one tuple but is rebound to
    # a plain tuple inside the loop -- confirm word_assembler accepts both.
    word_part_of_speech = [('', '')]
    word_kinds = [('名詞', '名詞'),
                  ('動詞', '助動詞')]  # part-of-speech pairs passed to word_assembler()
    # Iterate over the tokenized sentence; for each token, word_assembler is
    # given the previous token's info, the current token, result_list and one
    # part-of-speech pair (presumably to join adjacent words of those kinds
    # in result_list -- verify against word_assembler).
    for token in t.tokenize(sentence):
        for word_kind in word_kinds:  # call word_assembler() once per pair in word_kinds
            word_assembler(word_part_of_speech, token, result_list,
                           word_kind)  # combine words
        word_part_of_speech = (token.part_of_speech, token.surface
                               )  # remember this token for the next iteration

    return result_list
예제 #15
0
def hello_world():
    """Analyze the ``text`` field of the JSON request body.

    Keeps nouns, verbs, adverbs and adjectives and returns the string form
    of each remaining token as a JSON list.
    """
    # Named ``payload`` so the builtin ``input`` is not shadowed.
    payload = request.json
    text = payload['text']
    # The Analyzer creates its own tokenizer when none is passed, so no
    # separate (and unused) Tokenizer is built on every request.
    analyzer = Analyzer(
        token_filters=[POSKeepFilter(['名詞', '動詞', '副詞', '形容詞'])])
    return jsonify([str(token) for token in analyzer.analyze(text)])
예제 #16
0
def get_words(string, keep_pos=None):
    """Return the surfaces of the words in ``string``.

    When ``keep_pos`` is None, only symbols are excluded; otherwise only
    tokens whose part of speech is in ``keep_pos`` are kept.
    """
    if keep_pos is None:
        pos_filter = POSStopFilter(['記号'])  # exclude symbols
    else:
        pos_filter = POSKeepFilter(keep_pos)  # keep the requested parts of speech
    analyzer = Analyzer(
        token_filters=[pos_filter, ExtractAttributeFilter('surface')])
    return list(analyzer.analyze(string))
예제 #17
0
 def analyze(self, text):
     """Run morphological analysis on ``text``.

     Compound nouns are joined, symbols are dropped and tokens are
     lower-cased.  Returns a list of ``(surface, part_of_speech)`` tuples.
     """
     # The earlier direct tokenizer.tokenize(text) call was discarded, so it
     # has been removed.  Keyword arguments are used because janome >= 0.4
     # made the Analyzer constructor keyword-only; older releases accept the
     # same keywords.
     analyzer = Analyzer(
         char_filters=[UnicodeNormalizeCharFilter()],
         tokenizer=Tokenizer(),
         token_filters=[
             CompoundNounFilter(),
             POSStopFilter(['記号']),
             LowerCaseFilter(),
         ],
     )
     return [(token.surface, token.part_of_speech)
             for token in analyzer.analyze(text)]
예제 #18
0
 def extract_noun(text):
     """Return the distinct proper and compound nouns found in ``text``.

     Compound nouns are joined first; a token is kept when its part of
     speech starts with '名詞,固有名詞' or '名詞,複合'.  The base forms are
     returned as a list without duplicates, in no particular order.
     """
     # Keyword arguments are used because janome >= 0.4 made the Analyzer
     # constructor keyword-only; older releases accept the same keywords.
     analyzer = Analyzer(char_filters=[],
                         tokenizer=Tokenizer(),
                         token_filters=[CompoundNounFilter()])
     wanted_prefixes = ("名詞,固有名詞", "名詞,複合")
     nouns = {
         token.base_form
         for token in analyzer.analyze(text)
         if token.part_of_speech.startswith(wanted_prefixes)
     }
     return list(nouns)
예제 #19
0
def pos_filter(text):
    """Keep the nouns, verbs and adjectives of ``text``.

    The surfaces of the kept tokens are passed to ``concat_str`` (defined
    elsewhere) and its result is returned.
    """
    analysis = Analyzer(
        tokenizer=Tokenizer(),
        token_filters=[POSKeepFilter(['名詞', '動詞', '形容詞'])],
    )
    surfaces = [token.surface for token in analysis.analyze(text)]
    return concat_str(surfaces)
예제 #20
0
def make_tf_idf_result(debug, input_sentence):
    """Compute TF-IDF scores for the first sentence of the sentence file.

    Sentences are read from ``./all_sentence/all_sentence_0.txt``.  Unless
    ``debug`` is true, the list is first passed through ``change_sentence``
    (defined elsewhere) together with ``input_sentence``.  At most the first
    201 sentences are analyzed with symbols, particles, auxiliary verbs,
    verbs and conjunctions removed, and vectorized with ``TfidfVectorizer``.

    Every word of the first sentence with a positive score is printed.
    Returns a list of single-entry dicts ``{word: score}`` for those words,
    ordered by descending score.
    """
    # Lexical analyzer: drop symbols, particles, auxiliary verbs, verbs and
    # conjunctions.
    analyzer = Analyzer(
        tokenizer=Tokenizer(),
        token_filters=[POSStopFilter(['記号', '助詞', '助動詞', '動詞', '接続詞'])],
    )

    file_path = "./all_sentence/all_sentence_0.txt"
    with open(file_path, encoding='utf-8') as f:
        sentence_list = f.readlines()

    if not debug:
        sentence_list = change_sentence(sentence_list, input_sentence)

    # Slicing (instead of indexing 0..200) avoids an IndexError when fewer
    # than 201 sentences are available.
    word_list = [
        " ".join(token.surface for token in analyzer.analyze(sentence))
        for sentence in sentence_list[:201]
    ]

    # Vectorize.
    vec_tfidf = TfidfVectorizer()
    X = vec_tfidf.fit_transform(np.array(word_list))

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older releases.
    if hasattr(vec_tfidf, "get_feature_names_out"):
        words = vec_tfidf.get_feature_names_out()
    else:
        words = vec_tfidf.get_feature_names()

    # Only the first document's scores are needed, so only that row is
    # converted to a dense array.
    first_row = X[:1].toarray()[0]
    ranked = sorted(enumerate(first_row), key=lambda x: x[1], reverse=True)

    result_list = []
    for w_id, tfidf in ranked:
        if tfidf > 0:
            word = words[w_id]
            print(word + ": " + str(tfidf))
            result_list.append({word: tfidf})
        else:
            # Scores are sorted in descending order, so nothing positive
            # follows.
            break

    return result_list
예제 #21
0
 def text2wordinfolist(self, text):
     """Analyze ``text`` and emit per-word information ordered by frequency.

     Runs of non-word characters are replaced by a Japanese full stop, the
     text is analyzed (compound nouns joined, tokens lower-cased) and every
     token is recorded as
     ``[surface, part_of_speech, infl_type, infl_form, base_form, reading,
     phonetic]``.  Surfaces are counted and sorted by descending count.  For
     each surface not contained in the stop-word string, ``words_signal`` is
     emitted with the attributes of its first occurrence.  Progress is
     reported through ``proessbar_signal`` along the way.  Nothing is
     returned.
     """
     # A string, so the check below is a substring test against it.
     stopwords = '。'
     self.proessbar_signal.emit(20, 100, 1, '正在加载自然语言分析库')
     # Raw string: '\W' is a regex class, not a string escape.
     text = re.sub(r'\W+', '。', text)
     tokenizer = Tokenizer()
     self.proessbar_signal.emit(40, 100, 1, '正在加载自然语言分析库')
     token_filters = [CompoundNounFilter(), LowerCaseFilter()]
     self.proessbar_signal.emit(80, 100, 1, '正在加载自然语言分析库')
     # Keyword arguments are used because janome >= 0.4 made the Analyzer
     # constructor keyword-only; older releases accept the same keywords.
     analyzer = Analyzer(char_filters=[],
                         tokenizer=tokenizer,
                         token_filters=token_filters)
     self.proessbar_signal.emit(99, 100, 1, '正在加载自然语言分析库')

     all_word_lists = []
     for token in analyzer.analyze(text):
         word_list = [
             token.surface,
             token.part_of_speech,
             token.infl_type,
             token.infl_form,
             token.base_form,
             token.reading,
             token.phonetic,
         ]
         all_word_lists.append(word_list)
         print(word_list)
         self.proessbar_signal.emit(random.randint(61, 80), 100, 1,
                                    '正在处理词语 [ ' + token.surface + ' ]  ')

     # Count each surface and remember the record of its first occurrence,
     # so the reporting loop below needs no rescan of all_word_lists.
     counts = {}
     first_info = {}
     for wordinfo in all_word_lists:
         surface = wordinfo[0]
         counts[surface] = counts.get(surface, 0) + 1
         first_info.setdefault(surface, wordinfo)

     ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
     kept = [entry for entry in ranked if entry[0] not in stopwords]

     line = 0
     for surface, count in kept:
         wordinfo = first_info[surface]
         print(wordinfo)
         self.words_signal.emit(line, str(count), wordinfo[0],
                                trans(wordinfo[5]), wordinfo[4],
                                wordinfo[1], wordinfo[2],
                                wordinfo[3], wordinfo[5])
         line = line + 1
         self.proessbar_signal.emit(line, len(kept), 2, '词频信息整理中')
     self.proessbar_signal.emit(len(kept), len(kept), 2, '词频信息整理中')
예제 #22
0
def exec_analyser(text):
    """Count the nouns in ``text``.

    Returns a tuple ``(col_list, value_list)``: the nouns and, at the same
    positions, their counts, in the order produced by
    ``TokenCountFilter(sorted=True)``.
    """
    # Keep nouns only, then count them.
    analyzer = Analyzer(
        token_filters=[POSKeepFilter(['名詞']), TokenCountFilter(sorted=True)])
    counted = list(analyzer.analyze(text))
    col_list = [word for word, _ in counted]
    value_list = [count for _, count in counted]
    return (col_list, value_list)
예제 #23
0
def main():
    """Analyze a sample sentence and print every resulting token."""
    analyzer = Analyzer(
        char_filters=[UnicodeNormalizeCharFilter()],
        tokenizer=Tokenizer(),
        token_filters=[CompoundNounFilter(), LowerCaseFilter()],
    )

    sample = '私は、渋谷ストリームでランチを食べる。'
    for token in analyzer.analyze(sample):
        print(token)
    print('DONE')
예제 #24
0
 def setUp(self):
     """Build the analyzer used by the tests.

     Text matching '&[^&]+;' is removed, the football-specific filters
     (defined elsewhere) are applied, and only nouns are kept.
     """
     char_filters = [
         UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter('&[^&]+;', '')
     ]
     tokenizer = Tokenizer(mmap=True)
     token_filters = [
         FootballCompoundNounFilter(),
         FootballNounFilter(),
         # POSKeepFilter expects a list of part-of-speech prefixes; a bare
         # string would be iterated character by character.
         POSKeepFilter(['名詞'])
     ]
     # Keyword arguments are used because janome >= 0.4 made the Analyzer
     # constructor keyword-only; older releases accept the same keywords.
     self.analyzer = Analyzer(char_filters=char_filters,
                              tokenizer=tokenizer,
                              token_filters=token_filters)
예제 #25
0
    def tokenize(self, doc):
        """Tokenize a document.

        The analyzer is built from ``self.char_filters()`` and
        ``self.token_filters()`` on every call.

        Args:
            doc (str): raw document

        Returns:
            list: surfaces of the tokenized words
        """
        tokenizer = Tokenizer()
        # Keyword arguments are used because janome >= 0.4 made the Analyzer
        # constructor keyword-only; older releases accept the same keywords.
        analyzer = Analyzer(char_filters=self.char_filters(),
                            tokenizer=tokenizer,
                            token_filters=self.token_filters())
        return [token.surface for token in analyzer.analyze(doc)]
예제 #26
0
 def set_analyzer(self):
     """Create the morphological analyzer and store it in ``self.analyzer``."""
     # Keyword arguments are used because janome >= 0.4 made the Analyzer
     # constructor keyword-only; older releases accept the same keywords.
     self.analyzer = Analyzer(
         # Brackets, commas and full stops are all replaced by a space.
         char_filters=[
             UnicodeNormalizeCharFilter(),
             RegexReplaceCharFilter(r'[(\)「」、。]', ' '),
         ],
         tokenizer=JanomeTokenizer(),
         # Keep only base forms of nouns, adjectives, adverbs and verbs.
         token_filters=[
             POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
             ExtractAttributeFilter('base_form'),
         ],
     )
def extraction(texts):
    """Collect the nouns and adjectives of all texts into one string.

    Compound nouns are joined.  For each noun or adjective the base form is
    passed through ``str_replace`` (defined elsewhere); results whose first
    four characters are 'http' are skipped.  The kept words are returned
    joined by single spaces.
    """
    analyzer = Analyzer(token_filters=[CompoundNounFilter()])
    wanted_pos = {'名詞', '形容詞'}
    words = []
    for text in texts:
        for token in analyzer.analyze(text):
            if token.part_of_speech.split(',')[0] not in wanted_pos:
                continue
            # The base form is used (not the surface form).
            word = str_replace(token.base_form)
            if word[:4] != 'http':
                words.append(word)
    return ' '.join(words)
예제 #28
0
def morphological_analysis(text):
    """Analyze ``text`` and keep adjectives, nouns and verbs.

    Returns the result of ``Analyzer.analyze`` unchanged, i.e. an iterable
    of the remaining tokens.
    """
    # Character filters act on the raw string; none are used here.
    char_filter = []
    # The tokenizer performs the actual morphological analysis.
    t = Tokenizer()
    # Token filters act on the tokens produced by the tokenizer.
    token_filter = [POSKeepFilter(['形容詞', '名詞', '動詞'])]
    # Keyword arguments are used because janome >= 0.4 made the Analyzer
    # constructor keyword-only; older releases accept the same keywords.
    analize = Analyzer(char_filters=char_filter,
                       tokenizer=t,
                       token_filters=token_filter)
    return analize.analyze(text)
예제 #29
0
def analysis(request):
    """Render the analysis page for a fixed sample sentence.

    The nouns and adjectives of the sample sentence are extracted and passed
    to the 'prototype/analysis.html' template as ``word_list``.
    """
    text = "なんで空は青いの?"
    analyzer = Analyzer(
        char_filters=[UnicodeNormalizeCharFilter()],
        tokenizer=Tokenizer(),
        token_filters=[POSKeepFilter(['名詞', '形容詞'])],
    )
    word_list = [token.surface for token in analyzer.analyze(text)]

    template = loader.get_template('prototype/analysis.html')
    return HttpResponse(template.render({'word_list': word_list}, request))
예제 #30
0
def MorphologicalAnalysis(texts):
    """Count and collect the nouns and adjectives of all texts.

    Compound nouns are joined.  Returns a tuple ``(words_count, words)``:
    a ``defaultdict`` mapping each base form to its number of occurrences,
    and the list of base forms with surrounding '@', '#', ':' and '"'
    characters stripped (in that order).  The counts are keyed by the
    unstripped base form.
    """
    a = Analyzer(token_filters=[CompoundNounFilter()])
    target_pos = ('名詞', '形容詞')
    words_count = defaultdict(int)
    words = []

    for text in texts:
        for token in a.analyze(text):
            if token.part_of_speech.split(',')[0] not in target_pos:
                continue
            base = token.base_form
            words_count[base] += 1
            words.append(base.strip("@").strip("#").strip(":").strip("\""))
    return words_count, words
예제 #31
0
# -*- coding: utf-8 -*-
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

import logging
logging.basicConfig(level='INFO')

# Example 1: char filters and token filters combined in one Analyzer.
print(u'Analyzer example:')
text = u'蛇の目はPure Pythonな形態素解析器です。'
char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(u'蛇の目', u'janome')]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), LowerCaseFilter()]
# Keyword arguments are used because janome >= 0.4 made the Analyzer
# constructor keyword-only; older releases accept the same keywords.
a = Analyzer(char_filters=char_filters,
             tokenizer=tokenizer,
             token_filters=token_filters)
for token in a.analyze(text):
    print(token)

# Example 2: count nouns.
print('')
print(u'Analyzer example: Count nouns with POSKeepFilter and TokenCountFilter')
text = u'すもももももももものうち'
# POSKeepFilter expects a list of part-of-speech prefixes; a bare string
# would be iterated character by character.
token_filters = [POSKeepFilter([u'名詞']), TokenCountFilter()]
a = Analyzer(token_filters=token_filters)
for k, v in a.analyze(text):
    print('%s: %d' % (k, v))