def fetch():
    if request.method == "POST":
        if request.form["InputText"]:
            results_noun = []
            results_verb = []
            text = request.form["InputText"]
            token_filters = [POSKeepFilter(['名詞']), TokenCountFilter(sorted=True)]
            a = Analyzer(token_filters=token_filters)
            for k, v in a.analyze(text):
                results_noun.append([k, v])
            delete_extra(results_noun)
            token_filters = [POSKeepFilter(['動詞']), TokenCountFilter(sorted=True)]
            a = Analyzer(token_filters=token_filters)
            for k, v in a.analyze(text):
                results_verb.append([k, v])
            delete_extra(results_verb)
            return render_template("result.html",
                                   results_noun=results_noun,
                                   results_verb=results_verb)
        else:
            flash("テキストが入力されていません")
            return render_template("top.html")
def __init__(self):
    # Set a Japanese font so that Japanese text renders correctly
    fpath = r'C:\Windows\Fonts\SourceHanCodeJP-Regular.otf'
    # Character filters
    char_filters = [UnicodeNormalizeCharFilter(),
                    RegexReplaceCharFilter(r'\(', ''),
                    RegexReplaceCharFilter(r'\)', '')]
    # Token filters
    token_filters = [CompoundNounFilter(),
                     POSKeepFilter(['名詞', '動詞', '形容詞', '副詞'])]
    # Create the morphological analyzer
    tokenizer = Tokenizer('sipudic.csv', udic_type='simpledic', udic_enc='utf8')
    self.__analyzer = Analyzer(char_filters, tokenizer, token_filters)
    # Create the word cloud object
    self.__wdcObj = WordCloud(background_color='white',
                              font_path=fpath,
                              width=900,
                              height=500,
                              stopwords=set(stop_words))
def __init__(self, save_dir="result/"):
    self.save_dir = save_dir
    # Analyzer
    token_filters = [POSKeepFilter(['名詞']), NumericReplaceFilter(), TokenCountFilter()]
    char_filters = [UnicodeNormalizeCharFilter()]
    self.ana = Analyzer(char_filters=char_filters, token_filters=token_filters)
    # Holds per-person analysis results
    self.noun = {}
    self.emoji = {}
    # Sanitizing filter
    self.filter = []
    with codecs.open(FILTER_FILE, "r", "utf-8") as rf:
        for line in rf.readlines():
            self.filter.append(line.rstrip())
    # Word Cloud
    self.wc = wordcloud.WordCloud(
        background_color="white",
        max_words=self.wc_max_words,
        max_font_size=550,
        width=1920,
        height=1080,
        # mask=mask,
        font_path=FONT_PATH)
def test_analyze(self):
    char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(u'蛇の目', u'janome')]
    tokenizer = Tokenizer()
    token_filters = [CompoundNounFilter(),
                     POSStopFilter([u'記号', u'助詞']),
                     LowerCaseFilter(),
                     ExtractAttributeFilter('surface')]
    a = Analyzer(char_filters, tokenizer, token_filters)
    tokens = a.analyze(u'蛇の目はPure Pythonな形態素解析器です。')
    self.assertEqual([u'janome', u'pure', u'python', u'な', u'形態素解析器', u'です'], list(tokens))
def get_corpus_list(sentence_list):
    """Take a list of strings and return each element converted into a string whose morphemes are separated by single spaces.

    Args:
        sentence_list: list, one element per notable person, holding the text read from all of that person's files
    Returns:
        corpus_list: list, each element of the input converted into a string whose morphemes are separated by single spaces
    TODO: since "corpus" already means a collection of documents, "corpus" would be a better name than
          "corpus_list", but "corpus_list" is kept for readability.
    """
    # Instantiate the Analyzer that splits text into morphemes
    # TODO: improve this by letting the caller choose which parts of speech to extract
    a = Analyzer()
    """Analyzer(char_filters, tokenizer, token_filters)
    takes a list of CharFilter objects, an initialized Tokenizer object, and a list of TokenFilter objects."""
    # List that stores the morphemes (including duplicates)
    corpus_list = []
    for sentence in sentence_list:
        corpus = []
        for token in a.analyze(sentence):
            corpus.append(token.surface)
        # Join with spaces into one long string, then append it to the list
        corpus = " ".join(corpus)
        corpus_list.append(corpus)
    return corpus_list
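# Hypothetical usage sketch (not from the original source): assumes janome's
# Analyzer is imported as used above; the sample sentences are invented.
sample_texts = ["吾輩は猫である。名前はまだ無い。",
                "国境の長いトンネルを抜けると雪国であった。"]
corpus_list = get_corpus_list(sample_texts)
# Each element is a space-delimited string of morphemes,
# e.g. "吾輩 は 猫 で ある 。 名前 は まだ 無い 。"
print(corpus_list[0])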
def janome_document_summarize(document):
    # Morphological analysis (split into words)
    analyzer = Analyzer(char_filters=[UnicodeNormalizeCharFilter(),
                                      RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],
                        tokenizer=JanomeTokenizer(),
                        token_filters=[POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
                                       ExtractAttributeFilter('base_form')])
    text = re.findall("[^。]+。?", document.replace('\n', ''))
    corpus = [' '.join(analyzer.analyze(sentence)) + u'。' for sentence in text]
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ', '。', '\n']
    N = int(len(corpus) / 10 * 3)
    if N <= 0:
        N = 3
    summary = summarizer(document=parser.document, sentences_count=N)
    rst = ''
    print('\n要約:')  # "Summary:"
    for sentence in summary:
        print(text[corpus.index(sentence.__str__())])
        rst += text[corpus.index(sentence.__str__())]
    return summary, rst
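# Hypothetical usage sketch (not from the original source): assumes the sumy
# imports used above (PlaintextParser, Tokenizer, LexRankSummarizer), re and
# the janome classes are in scope; the document text is invented.
doc = ("メロスは激怒した。必ず、かの邪智暴虐の王を除かなければならぬと決意した。"
       "メロスには政治がわからぬ。メロスは、村の牧人である。")
summary, rst = janome_document_summarize(doc)
print(rst)  # the sentences LexRank picked, concatenated into one string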
def __init__(self):
    self.nlp = spacy.load('ja_ginza')
    self.analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],  # replace (, ), 「, 」, 、 and 。 with spaces
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')]  # keep only the base forms of nouns, adjectives, adverbs and verbs
    )
def tense_analyze(self, text, sentences_count):
    # One sentence per line, so it could be split on the newline character
    # sentences = [t for t in text.split('\n')]
    sentences = [t for t in text.split('。')]
    # Build the morphological analyzer
    analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],  # replace (, ), 「, 」, 、 and 。 with spaces
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')]  # keep only the base forms of nouns, adjectives, adverbs and verbs
    )
    # Join the extracted words with spaces.
    # The trailing '。' lets tinysegmenter, used below, split the text back into sentences.
    corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]
    # Tokenize the joined corpus again with tinysegmenter
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    # Extract the summary sentences with LexRank
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']  # a space is recognized as a word, so make it a stop word to exclude it
    summary = summarizer(document=parser.document, sentences_count=sentences_count)
    return sentences, corpus, summary
def make_corpus(docs, debug=False):
    """
    Build a corpus from multiple documents.
    @docs list of documents
    @return list of tokenized documents
    """
    docs = list(
        map(
            lambda d: list(
                filter(lambda x: x.strip() != "", re.split("\n|。", d.lower()))
            ),
            docs))
    docs = [list(map(lambda x: mojimoji.zen_to_han(x), lines)) for lines in docs]
    analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)、。「」]', ' ')],
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')])
    corpus = [
        list(itertools.chain.from_iterable(
            [list(analyzer.analyze(l)) for l in lines]))
        for lines in docs
    ]
    if debug:
        # each corpus entry is a list of tokens, so join them before printing
        print("\n".join(" ".join(tokens) for tokens in corpus))
    return corpus
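# Hypothetical usage sketch (not from the original source): assumes mojimoji,
# itertools, re and the janome classes referenced above are imported; the two
# documents are invented.
docs = ["今日は晴れです。公園を散歩しました。",
        "明日は雨が降るそうです。\n傘を持って行きます。"]
corpus = make_corpus(docs)
# corpus[0] is a flat list of the base forms kept by the POS filter,
# e.g. ['今日', '晴れ', '公園', '散歩', 'する']
print(corpus[0])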
def summarize(text):
    sentences = [t for t in text.split('\n')]
    analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],  # replace (, ), 「, 」, 、 and 。 with spaces
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')]  # keep only the base forms of nouns, adjectives, adverbs and verbs
    )
    corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']
    summary = summarizer(document=parser.document, sentences_count=3)
    x = ""
    for sentence in summary:
        x += sentences[corpus.index(sentence.__str__())]
    return x
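# Hypothetical usage sketch (not from the original source): the
# newline-separated input is invented, and the sumy/janome imports used above
# are assumed to be in scope.
text = "\n".join([
    "メロスは激怒した。",
    "必ず、かの邪智暴虐の王を除かなければならぬと決意した。",
    "メロスには政治がわからぬ。",
    "メロスは、村の牧人である。",
])
print(summarize(text))  # the three sentences LexRank ranks highest, joined together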
def preprocess(doc, debug=False):
    """
    Take a document, preprocess it, and split it into a list of tokenized sentences.
    @param doc the target document
    @return the list of sentences contained in the preprocessed document
    """
    doc = doc.lower()
    lines = re.split("\n|。", doc)
    lines = list(filter(lambda x: x != "", map(lambda x: x.strip(), lines)))
    sentences = copy.deepcopy(lines)
    lines = list(map(lambda x: mojimoji.zen_to_han(x), lines))
    analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)、。「」]', ' ')],
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')])
    corpus = [' '.join(analyzer.analyze(l)) + '。' for l in lines]
    if debug:
        print("\n".join(corpus))
    return sentences, corpus
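# Hypothetical usage sketch (not from the original source): assumes mojimoji,
# copy, re and the janome classes above are imported; the document is invented.
doc = "渋谷でラーメンを食べた。とても美味しかった。"
sentences, corpus = preprocess(doc, debug=True)
# sentences holds the original sentences; corpus holds space-joined base forms,
# e.g. ['渋谷 ラーメン 食べる。', 'とても 美味しい。']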
def fn_start_document_summarize(text):
    # Morphological analysis (split into words)
    tokenizer = JanomeTokenizer()  # janome tokenizer, no user dictionary
    char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(r'[(\)「」、。]', ' ')]
    token_filters = [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), ExtractAttributeFilter('base_form')]
    analyzer = Analyzer(
        char_filters=char_filters,
        tokenizer=tokenizer,
        token_filters=token_filters
    )
    # text is iterated sentence by sentence, so it should be a list of sentences
    corpus = [' '.join(analyzer.analyze(sentence)) + u'。' for sentence in text]
    # print(corpus, len(corpus))
    # Run the document summarization
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    # Extract roughly 30% of the original document as the summary with LexRank
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']
    # The important part of a document is said to be roughly 20-30%, so use that as a guide for sentences_count.
    N = 3
    summary = summarizer(document=parser.document,
                         sentences_count=N if len(corpus) < 100 else int(len(corpus) / 100))
    # summary = summarizer(document=parser.document, sentences_count=1)
    result = ''
    for sentence in summary:
        result += text[corpus.index(sentence.__str__())]
    return result
def _tokenize_elements(self, replace_word, pos_filter):
    tokenizer = Tokenizer()
    char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(replace_word, "")]
    token_filters = [LowerCaseFilter(), POSStopFilter(pos_filter)]
    t = Analyzer(char_filters, tokenizer, token_filters)
    return [list(t.analyze(text)) for text in self._text]
def get_words_list(sentence):
    """
    get list of name words
    :param sentence: string
    :return: list of string
    """
    words_list = []
    char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter('<.*?>', '')]
    token_filters = [POSKeepFilter(['名詞']), LowerCaseFilter(), ExtractAttributeFilter('surface')]
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    for token in a.analyze(sentence):
        words_list.append(token)
    # japanese words can be split in 2 different words
    if len(words_list) == 4:
        words_list[0] = words_list[0] + words_list[1]
        words_list.pop(1)
    return words_list
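# Hypothetical usage sketch (not from the original source): the HTML-ish sample
# sentence is invented, and the janome imports above are assumed.
words = get_words_list("<p>東京タワーで夜景を見る</p>")
# HTML tags are stripped by the RegexReplaceCharFilter, then noun surfaces are
# returned, e.g. ['東京', 'タワー', '夜景']; the first two entries are merged
# only when exactly four nouns are found.
print(words)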
def test_analyzer_default(self):
    a = Analyzer()
    self.assertIsNotNone(a.char_filters)
    self.assertTrue(len(a.char_filters) == 0)
    self.assertIsNotNone(a.tokenizer)
    self.assertIsInstance(a.tokenizer, Tokenizer)
    self.assertIsNotNone(a.token_filters)
    self.assertTrue(len(a.token_filters) == 0)
def __init__(self, corpus):
    docs = [read_document(x) for x in corpus]
    self.obj = list(zip(docs, corpus))
    self.current = 0
    tokenizer = Tokenizer(mmap=True)
    char_filters = [UnicodeNormalizeCharFilter()]
    token_filters = [POSStopFilter(['記号', '助詞']), LowerCaseFilter()]
    self.tokenizer = Analyzer(char_filters, tokenizer, token_filters)
def get_words_list(sentence):
    """
    Extract the nouns contained in a sentence.
    :param sentence: string
    :return: list of string
    """
    result_list = []
    char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter('<.*?>', '')]
    token_filters = [POSKeepFilter(['名詞', '動詞', '助動詞']), LowerCaseFilter(), ExtractAttributeFilter('surface')]
    # Extract only the target parts of speech
    a = Analyzer(char_filters=char_filters, token_filters=token_filters)
    sentence = sentence.replace('.', '')
    for token in a.analyze(sentence):
        result_list.append(token)
    stop_words = [
        "あそこ", "あっ", "あの", "あのかた", "あの人", "あり", "あります", "ある", "あれ", "い", "いう", "います", "いる",
        "う", "うち", "え", "お", "および", "おり", "おります", "か", "かつて", "から", "が", "き", "ここ", "こちら", "こと",
        "この", "これ", "これら", "さ", "さらに", "し", "しかし", "する", "ず", "せ", "せる", "そこ", "そして", "その",
        "その他", "その後", "それ", "それぞれ", "それで", "た", "ただし", "たち", "ため", "たり", "だ", "だっ", "だれ",
        "つ", "て", "で", "でき", "できる", "です", "では", "でも", "と", "という", "といった", "とき", "ところ", "として",
        "とともに", "とも", "と共に", "どこ", "どの", "な", "ない", "なお", "なかっ", "ながら", "なく", "なっ", "など",
        "なに", "なら", "なり", "なる", "なん", "に", "において", "における", "について", "にて", "によって", "により",
        "による", "に対して", "に対する", "に関する", "の", "ので", "のみ", "は", "ば", "へ", "ほか", "ほとんど", "ほど",
        "ます", "また", "または", "まで", "も", "もの", "ものの", "や", "よう", "より", "ら", "られ", "られる", "れ",
        "れる", "を", "ん", "何", "及び", "彼", "彼女", "我々", "特に", "私", "私達", "貴方", "貴方方"
    ]
    for word in stop_words:
        if word in result_list:
            result_list.remove(word)
            # sentence = sentence.replace(word, '')
    t = Tokenizer()
    # create the list of tuples
    word_part_of_speech = [('', '')]
    # list of word kinds passed as an argument to word_assembler()
    word_kinds = [('名詞', '名詞'), ('動詞', '助動詞')]
    # iterate over the tokenized sentence and search for two nouns that follow each other
    for token in t.tokenize(sentence):
        for word_kind in word_kinds:
            # run word_assembler() once per entry in word_kinds
            word_assembler(word_part_of_speech, token, result_list, word_kind)  # merge adjacent words
        word_part_of_speech = (token.part_of_speech, token.surface)  # holds the words merged under the word_kinds conditions
    return result_list
def get_words(string, keep_pos=None):
    filters = []
    if keep_pos is None:
        filters.append(POSStopFilter(['記号']))  # exclude symbols
    else:
        filters.append(POSKeepFilter(keep_pos))  # keep only the given parts of speech
    filters.append(ExtractAttributeFilter('surface'))
    a = Analyzer(token_filters=filters)  # specify the post-processing filters
    return list(a.analyze(string))
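# Hypothetical usage sketch (not from the original source): assumes the janome
# filter classes above are imported; the sample string is the classic
# "すもももももももものうち" example.
print(get_words('すもももももももものうち'))
# e.g. ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'] (only symbols dropped)
print(get_words('すもももももももものうち', keep_pos=['名詞']))
# e.g. ['すもも', 'もも', 'もも', 'うち'] (nouns only)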
def analyze(self, text):
    """Morphologically analyze the string text and return it as a list of (surface, parts)."""
    char_filters = [UnicodeNormalizeCharFilter()]
    tokenizer = Tokenizer()
    token_filters = [CompoundNounFilter(), POSStopFilter(['記号']), LowerCaseFilter()]
    a = Analyzer(char_filters, tokenizer, token_filters)
    data = [(token.surface, token.part_of_speech) for token in a.analyze(text)]
    return data
def hello_world():
    input = request.json
    s = input['text']
    a = Analyzer(token_filters=[POSKeepFilter(['名詞', '動詞', '副詞', '形容詞'])])
    t = Tokenizer()
    res = []
    for token in a.analyze(s):
        res.append(str(token))
    return jsonify(res)
def extract_noun(text):
    t = Tokenizer()
    noun_list = []
    token_filters = [CompoundNounFilter()]
    analyzer = Analyzer([], t, token_filters)
    for token in analyzer.analyze(text):
        if token.part_of_speech.startswith("名詞,固有名詞") or token.part_of_speech.startswith("名詞,複合"):
            noun_list.append(token.base_form)
    return list(set(noun_list))
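# Hypothetical usage sketch (not from the original source): the sample sentence
# is invented. CompoundNounFilter merges adjacent nouns and tags them 名詞,複合,
# so compound names come back as single entries.
nouns = extract_noun('渋谷ストリームで東京タワーの写真を撮った。')
print(nouns)  # e.g. ['渋谷ストリーム', '東京タワー'] (order not guaranteed because of set())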
def generate_analyzer(self) -> Analyzer:
    if self._user_dict_filepath:
        tokenizer = Tokenizer(self._user_dict_filepath, udic_type="simpledic", udic_enc="utf8")
    else:
        tokenizer = Tokenizer()
    return Analyzer(char_filters=self.char_filters, tokenizer=tokenizer, token_filters=self.token_filters)
def pos_filter(text):
    tokenizer = Tokenizer()
    token_filters = [POSKeepFilter(['名詞', '動詞', '形容詞'])]
    analysis = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
    token = analysis.analyze(text)
    word_list = [word.surface for word in token]
    return concat_str(word_list)
def make_tf_idf_result(debug, input_sentence):
    # Build the lexical analyzer
    tokenizer = Tokenizer()
    token_filters = [POSStopFilter(['記号', '助詞', '助動詞', '動詞', '接続詞'])]
    analyzer = Analyzer(tokenizer=tokenizer, token_filters=token_filters)
    # Extract nouns
    file_path = "./all_sentence/all_sentence_0.txt"
    sentence_list = []
    word_list = []
    with open(file_path, encoding='utf-8') as f:
        sentence_list = f.readlines()
    if not debug:
        sentence_list = change_sentence(sentence_list, input_sentence)
    for i in range(0, 201):
        tokens = analyzer.analyze(sentence_list[i])
        sentences_tmp = []
        for t in tokens:
            sentences_tmp.append(t.surface)
        word_list.append(" ".join(sentences_tmp))
    # Convert to an ndarray
    np_word_list = np.array(word_list)
    # Create the vectorizer
    vec_tfidf = TfidfVectorizer()
    # Vectorize
    X = vec_tfidf.fit_transform(np_word_list)
    # Store the nouns and their tf-idf scores in a dictionary
    set_word_and_tf_idf = {}
    words = vec_tfidf.get_feature_names()
    for i, vec in zip(range(0, 1), X.toarray()):
        for w_id, tfidf in sorted(enumerate(vec), key=lambda x: x[1], reverse=True):
            word = words[w_id]
            set_word_and_tf_idf[word] = tfidf
    result_list = []
    for key in set_word_and_tf_idf.keys():
        if set_word_and_tf_idf[key] > 0:
            print(key + ": " + str(set_word_and_tf_idf[key]))
            result_list.append({key: set_word_and_tf_idf[key]})
        else:
            break
    return result_list
def exec_analyser(text):
    # Analyze the text and extract only the nouns
    token_filters = [POSKeepFilter(['名詞']), TokenCountFilter(sorted=True)]
    a = Analyzer(token_filters=token_filters)
    col_list = []
    value_list = []
    for k, v in a.analyze(text):
        col_list.append(k)
        value_list.append(v)
    return (col_list, value_list)
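# Hypothetical usage sketch (not from the original source): assumes Analyzer,
# POSKeepFilter and TokenCountFilter are imported from janome; the text is invented.
cols, values = exec_analyser('りんごを食べた。りんごとバナナを買った。')
# Parallel lists of noun surfaces and their counts, most frequent first,
# e.g. cols == ['りんご', 'バナナ'] and values == [2, 1]
print(cols, values)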
def text2wordinfolist(self, text):
    stopwords = '。'
    # text = u'の12時から朝6時まで。朝6時でも、お給料はいいんですよ。'
    # char_filters = [UnicodeNormalizeCharFilter()]
    # print(text)
    self.proessbar_signal.emit(20, 100, 1, '正在加载自然语言分析库')
    text = re.sub(r'\W+', '。', text)
    tokenizer = Tokenizer()
    self.proessbar_signal.emit(40, 100, 1, '正在加载自然语言分析库')
    token_filters = [CompoundNounFilter(), LowerCaseFilter()]
    self.proessbar_signal.emit(80, 100, 1, '正在加载自然语言分析库')
    analyzer = Analyzer([], tokenizer, token_filters)
    self.proessbar_signal.emit(99, 100, 1, '正在加载自然语言分析库')
    word_list = []
    all_word_lists = []
    progress = 0
    for token in analyzer.analyze(text):
        # self.proessbar_signal.emit(64, 100, 1, token.surface)
        word_list.append(token.surface)
        word_list.append(token.part_of_speech)
        word_list.append(token.infl_type)
        word_list.append(token.infl_form)
        word_list.append(token.base_form)
        word_list.append(token.reading)
        word_list.append(token.phonetic)
        all_word_lists.append(word_list)
        print(word_list)
        word_list = []
        progress = progress + 1
        self.proessbar_signal.emit(random.randint(61, 80), 100, 1, '正在处理词语 [ ' + token.surface + ' ] ')
    d = {}
    word_list = []
    for key in all_word_lists:
        d[key[0]] = d.get(key[0], 0) + 1
    l1 = sorted(d.items(), key=lambda x: x[1], reverse=True)
    l2 = [w for w in l1 if w[0] not in stopwords]
    line = 0
    for l in l2:
        for wordinfo in all_word_lists:
            # print(wordinfo)
            # print(len(wordinfo))
            if l[0] == wordinfo[0]:
                # print(line, wordinfo[0], str(l[1]), wordinfo[1], wordinfo[2], wordinfo[3], wordinfo[4], wordinfo[5], wordinfo[6])
                # self.words_signal.emit(line, wordinfo[0], str(l[1]), wordinfo[1], wordinfo[2], wordinfo[3], wordinfo[4], wordinfo[5], trans(wordinfo[6]))
                print(wordinfo)
                self.words_signal.emit(line, str(l[1]), wordinfo[0], trans(wordinfo[5]), wordinfo[4],
                                       wordinfo[1], wordinfo[2], wordinfo[3], wordinfo[5])
                line = line + 1
                break
        self.proessbar_signal.emit(line, len(l2), 2, '词频信息整理中')
    self.proessbar_signal.emit(len(l2), len(l2), 2, '词频信息整理中')
def main():
    char_filters = [UnicodeNormalizeCharFilter()]
    tokenizer = Tokenizer()
    token_filters = [CompoundNounFilter(), LowerCaseFilter()]
    analyzer = Analyzer(char_filters=char_filters, tokenizer=tokenizer, token_filters=token_filters)
    text = '私は、渋谷ストリームでランチを食べる。'
    for token in analyzer.analyze(text):
        print(token)
    print('DONE')
def tokenize(self, doc):
    """tokenize document

    Args:
        doc (str): raw document

    Returns:
        list: tokenized words
    """
    tokenizer = Tokenizer()
    analyzer = Analyzer(self.char_filters(), tokenizer, self.token_filters())
    return [token.surface for token in analyzer.analyze(doc)]
def test_analyzer_custom(self):
    char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(u'\s+', u'')]
    tokenizer = Tokenizer(mmap=True)
    token_filters = [CompoundNounFilter(), POSStopFilter([u'記号', u'助詞']), LowerCaseFilter()]
    a = Analyzer(char_filters, tokenizer, token_filters)
    self.assertTrue(len(a.char_filters) == 2)
    self.assertIsInstance(a.char_filters[0], UnicodeNormalizeCharFilter)
    self.assertIsInstance(a.char_filters[1], RegexReplaceCharFilter)
    self.assertIsInstance(a.tokenizer.sys_dic, MMapSystemDictionary)
    self.assertTrue(len(a.token_filters) == 3)
    self.assertIsInstance(a.token_filters[0], CompoundNounFilter)
    self.assertIsInstance(a.token_filters[1], POSStopFilter)
    self.assertIsInstance(a.token_filters[2], LowerCaseFilter)
def set_analyzer(self):
    # Build the morphological analyzer
    self.analyzer = Analyzer(
        [UnicodeNormalizeCharFilter(),
         RegexReplaceCharFilter(r'[(\)「」、。]', ' ')],  # replace (, ), 「, 」, 、 and 。 with spaces
        JanomeTokenizer(),
        [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
         ExtractAttributeFilter('base_form')]  # keep only the base forms of nouns, adjectives, adverbs and verbs
    )