# -*- coding: utf-8 -*-
"""
Created on Tue Sep 4 12:40:02 2018
@author: A
"""
from sklearn.datasets import load_files
reviews_train = load_files("C:/Users/A/.spyder-py3/aclImdb/train/")
text_train, y_train = reviews_train.data, reviews_train.target
print("text_train type : {}".format(type(text_train)))
print("text_train length : {}".format(len(text_train)))
print("text_train[6] : \n{}".format(text_train[6]))
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]  # strip leftover HTML line breaks
import numpy as np
print("class sample : {}".format(np.bincount(y_train)))
reviews_test = load_files("C:/Users/A/.spyder-py3/aclImdb/test/")
text_test, y_test = reviews_test.data, reviews_test.target
print("text_test length : {}".format(len(text_test)))
print("class sample : {}".format(np.bincount(y_test)))
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]
## BOW (bag of words)
bards_words = ["The fool doth think he is wise,",
"but the wise man knows himself to be a fool"]
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(bards_words)
print("word dic size : {}".format(len(vect.vocabulary_)))
print("word dic text : {}".format(vect.vocabulary_))
bag_of_words = vect.transform(bards_words)
print("bow : {}".format(repr(bag_of_words)))
print("bow의 밀집 표현 : \n{}".format(bag_of_words.toarray()))
# apply BOW to the review data
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print("X_train : \n{}".format(repr(X_train)))
feature_names = vect.get_feature_names()
print("feature count : {}".format(len(feature_names)))
print("first~20 : \n{}".format(feature_names[:20]))#의미없는것들도 포함
print("20010~20030 : \n{}".format(feature_names[20010:20030]))#단복수가 같이 들어감
print("2000n : \n{}".format(feature_names[::2000]))
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("cross val score : {:.2f}".format(np.mean(scores)))
from sklearn.model_selection import GridSearchCV
param_grid = {"C":[0.001, 0.01, 0.1, 1 ,10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)
print("best cross val score : {:.2f}".format(grid.best_score_))
print("best param : {}".format(grid.best_params_))
X_test = vect.transform(text_test)
print("test score : {:.2f}".format(grid.score(X_test, y_test)))
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)
print("min_df X_train : {}".format(repr(X_train)))
feature_names = vect.get_feature_names()
print("first~50 : \n{}".format(feature_names[:50]))
print("20010~20030 : \n{}".format(feature_names[20010:20030]))
print("700n : \n{}".format(feature_names[::700]))
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)
print("best cross val score : {:.2f}".format(grid.best_score_))
## Stopwords: words that carry little meaning or are too frequent to be useful
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("stopword count : {}".format(len(ENGLISH_STOP_WORDS)))#318개
print("10n stopword : \n{}".format(list(ENGLISH_STOP_WORDS)[::10]))
vect = CountVectorizer(min_df=5, stop_words="english").fit(text_train)  # built-in English stopword list
X_train = vect.transform(text_train)
print("stopword without X_train : \n{}".format(repr(X_train)))#27271->26966 305개 감소
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
param_grid = {"C":[0.001, 0.01, 0.1, 1 ,10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)
print("best cross val score : {:.2f}".format(grid.best_score_))#변화작지만 감소.
#고정불용어는 작은데이터에 적합
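# stop_words also accepts any custom list, so a domain-specific list can be
# tried instead of the built-in one; a minimal sketch with made-up entries:
custom_vect = CountVectorizer(min_df=5, stop_words=["movie", "film", "br"])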
## tf-idf: weight words that appear often in a specific document but rarely across the corpus
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression())
param_grid = {"logisticregression__C" : [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)
print("best cross val score : {:.2f}".format(grid.best_score_))
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
X_train = vectorizer.transform(text_train)
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
import numpy as np
feature_names = np.array(vectorizer.get_feature_names())
print("low tfidf feature : \n{}".format(feature_names[sorted_by_tfidf[:20]]))
print("hihg tfidf feature : \n{}".format(feature_names[sorted_by_tfidf[-20:]]))
sorted_by_idf = np.argsort(vectorizer.idf_)
print("lowest idf feature : \n{}".format(feature_names[sorted_by_idf[:100]]))
# frequent words like "the", "and", "of" show up
# "good", "bad": important for sentiment analysis, but ranked low by tf-idf
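# For reference, TfidfVectorizer's default (smooth_idf=True) idf is
# idf(t) = ln((1 + n) / (1 + df(t))) + 1, and each row is then L2-normalized.
# A standalone numeric sketch with hypothetical counts:
n_docs, doc_freq = 25000, 5000  # hypothetical corpus size and document frequency
print(np.log((1 + n_docs) / (1 + doc_freq)) + 1)  # idf of a fairly common term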
## Model coefficients
import mglearn
mglearn.tools.visualize_coefficients(
    grid.best_estimator_.named_steps["logisticregression"].coef_[0],
    feature_names, n_top_features=40)
# negative coefficients point to negative reviews, positive ones to positive reviews
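# If mglearn is unavailable, the same ranking can be sketched with plain argsort:
coefs = grid.best_estimator_.named_steps["logisticregression"].coef_.ravel()
order = np.argsort(coefs)
print("most negative:", feature_names[order[:5]])
print("most positive:", feature_names[order[-5:]])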
## BOW with several tokens (n-grams): to take context into account
print("bards_words : \n{}".format(bards_words))
cv = CountVectorizer(ngram_range=(1,1)).fit(bards_words)  # unigrams only
print("word dic size : {}".format(len(cv.vocabulary_)))
print("word dic : {}".format(cv.get_feature_names()))
cv = CountVectorizer(ngram_range=(2,2)).fit(bards_words)  # bigrams only
print("word dic size : {}".format(len(cv.vocabulary_)))
print("word dic : {}".format(cv.get_feature_names()))
print("transform data : \n{}".format(cv.transform(bards_words).toarray()))
cv = CountVectorizer(ngram_range=(1,3)).fit(bards_words)  # 1- to 3-grams
print("word dic size : {}".format(len(cv.vocabulary_)))
print("word dic : {}".format(cv.get_feature_names()))
pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression())
param_grid = {"logisticregression__C" : [0.001, 0.01, 0.1, 1, 10, 100],
"tfidfvectorizer__ngram_range" : [(1,1),(1,2),(1,3)]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)
print("best cross val score : {:.2f}".format(grid.best_score_))
print("best param : {}".format(grid.best_params_))
# heatmap of n-gram accuracy
import matplotlib.pyplot as plt
scores = grid.cv_results_["mean_test_score"].reshape(-1, 3).T
heatmap = mglearn.tools.heatmap(
    scores, xlabel="C", ylabel="ngram_range", cmap="viridis", fmt="%.3f",
    xticklabels=param_grid["logisticregression__C"],
    yticklabels=param_grid["tfidfvectorizer__ngram_range"])
plt.colorbar(heatmap)
# most important features of the 1- to 3-gram model
vect = grid.best_estimator_.named_steps["tfidfvectorizer"]
feature_names = np.array(vect.get_feature_names())
coef = grid.best_estimator_.named_steps["logisticregression"].coef_
mglearn.tools.visualize_coefficients(coef, feature_names, n_top_features=40)
# trigram features only
mask = np.array([len(feature.split(" ")) for feature in feature_names]) == 3
mglearn.tools.visualize_coefficients(coef.ravel()[mask], feature_names[mask],
                                     n_top_features=40)
## Stemming and lemmatization. ## Note: spacy and konlpy failed to install here due to version issues
import spacy  # lemmatization
import nltk   # stemming
en_nlp = spacy.load("en")  # English model (spacy 1.x API)
stemmer = nltk.stem.PorterStemmer()
def compare_normalization(doc):
    doc_spacy = en_nlp(doc)  # tokenize with spacy
    print("Lemmas:")
    print([token.lemma_ for token in doc_spacy])
    print("Stems:")
    print([stemmer.stem(token.norm_.lower()) for token in doc_spacy])
compare_normalization(u"Our meeting today was worse than yesterday, "
                      "I'm scared of meeting the clients tomorrow.")
import re
regexp = re.compile('(?u)\\b\\w\\w+\\b')  # CountVectorizer's default token pattern
en_nlp = spacy.load('en')
old_tokenizer = en_nlp.tokenizer
# swap in the regex tokenizer so spacy and CountVectorizer split tokens the same way (spacy 1.x API)
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(
    regexp.findall(string))
def custom_tokenizer(document):  # custom token splitter
    doc_spacy = en_nlp(document, entity=False, parse=False)
    return [token.lemma_ for token in doc_spacy]
lemma_vect = CountVectorizer(tokenizer=custom_tokenizer, min_df=5)
X_train_lemma = lemma_vect.fit_transform(text_train)
print("X_train_lemma.shape : {}".format(X_train_lemma.shape))
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)
print("X_train.shape : {}".format(X_train.shape))
# grid search
from sklearn.model_selection import StratifiedShuffleSplit
param_grid = {"C":[0.001, 0.01, 0.1, 1, 10]}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.99, train_size=0.01, random_state=0)  # train on only 1% of the data
grid = GridSearchCV(LogisticRegression(), param_grid, cv=cv)
grid.fit(X_train, y_train)
print("best cross val score (기본CV) : {:.2f}".format(grid.best_score_))
grid.fit(X_train_lemma, y_train)
print("best corss val score (표제어) : {:.2f}".format(grid.best_score_))
###########konlpy######
## LDA (Latent Dirichlet Allocation): unsupervised assignment of documents to topics, a kind of clustering
# removing very frequent words helps; here, drop words appearing in at least 15% of documents
vect = CountVectorizer(max_features=10000, max_df=0.15)
X = vect.fit_transform(text_train)
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=10, learning_method="batch",
                                max_iter=25, random_state=0)  # n_topics is called n_components in newer scikit-learn
document_topics = lda.fit_transform(X)  # takes a long time
print("lda.components_.shape : {}".format(lda.components_.shape))
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)
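# The same ranking without mglearn, for a single topic, using plain numpy:
topic = 0  # arbitrary topic index for illustration
print(feature_names[sorting[topic, :10]])  # ten highest-weight words of that topic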
# inspect the top-ranked words of the topics documents are assigned to
lda100 = LatentDirichletAllocation(n_topics=100, learning_method="batch",
                                   max_iter=25, random_state=0)  # more topics: more complex, but finer-grained
document_topics100 = lda100.fit_transform(X)
topics = np.array([7, 16, 24, 25, 28, 36, 37, 45, 51, 53, 54, 63, 89, 97])  # hand-picked interesting topics
sorting = np.argsort(lda100.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())
mglearn.tools.print_topics(topics=topics, feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=7, n_words=20)
# topic 45 appears to be about music
music = np.argsort(document_topics100[:, 45])[::-1]  # reviews sorted by topic-45 weight, descending
for i in music[:10]:
    print(b".".join(text_train[i].split(b".")[:2]) + b".\n")  # first two sentences of each review
# topic weights
fig, ax = plt.subplots(1, 2, figsize=(10, 10))
topic_names = ["{:2} ".format(i) + " ".join(words)  # label each topic with its two top words
               for i, words in enumerate(feature_names[sorting[:, :2]])]
for col in [0, 1]:
    start = col * 50
    end = (col + 1) * 50
    ax[col].barh(np.arange(50), np.sum(document_topics100, axis=0)[start:end])
    ax[col].set_yticks(np.arange(50))
    ax[col].set_yticklabels(topic_names[start:end], ha="left", va="top")
    ax[col].invert_yaxis()
    ax[col].set_xlim(0, 2000)
    yax = ax[col].get_yaxis()
    yax.set_tick_params(pad=130)
plt.tight_layout()