# raw_preprocess_util.py
import gzip
import pandas as pd
from pandas import DataFrame
from datetime import datetime
import nltk
from nltk.tokenize import sent_tokenize
from ast import literal_eval
from nltk.stem.porter import PorterStemmer
import re
def parse(path):
    """
    :param path: path to a gzipped file with one Python-literal record per line
    :return: generator yielding records (dicts) one line at a time
    """
    g = gzip.open(path, 'rt')
    for l in g:
        # literal_eval is a safer drop-in for eval on data that consists of
        # plain Python literals, as the Amazon review dumps do
        yield literal_eval(l)
def load_data(path, year):
    """
    :param path: data path
    :param year: int, cutoff year
    :return: list of review rows from `year` onward
    """
    data_list = []
    for e in parse(path):
        e['overall'] = float(e['overall'])
        # convert the unix timestamp into a 'YYYY-MM-DD' string
        e['unixReviewTime'] = datetime.fromtimestamp(int(e['unixReviewTime'])).strftime('%Y-%m-%d')
        if int(e['unixReviewTime'].split('-')[0]) >= year:
            row = [e['unixReviewTime'], e['asin'], e['reviewerID'], e['overall'], e['helpful'], e['reviewText']]
            data_list.append(row)
    return data_list
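# A minimal usage sketch; the file name below is hypothetical, following the
# naming scheme of the gzipped Amazon 5-core review dumps this loader expects:
#
#   reviews = load_data('reviews_Electronics_5.json.gz', 2013)
#   # each row: [reviewTime, asin, reviewerID, overall, helpful, reviewText]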
def load_meta(path):
    """
    :param path: data path
    :return: asin, title, brand from meta data
    """
    meta_data = []
    for e in parse(path):
        row = [e['asin'], e.get('title'), e.get('brand')]
        meta_data.append(row)
    return meta_data
def join_meta_data(review, meta):
    """
    Left-join review rows with product meta data on 'asin', then drop rows
    with a missing brand/review text or an "Unknown" brand.
    """
    review_df = DataFrame(review, columns=["reviewTime", "asin", "reviewerID", "overall", "helpful", "reviewText"])
    meta_df = DataFrame(meta, columns=["asin", "title", "brand"])
    join_data_df = pd.merge(review_df, meta_df, on="asin", how="left")
    join_data_df.dropna(subset=['brand', 'reviewText'], how='any', inplace=True)
    join_data_df = join_data_df[join_data_df.brand != "Unknown"]
    return join_data_df
def extract_sentence(df):
    """
    :param df: dataframe of raw reviews joined with meta data
    :return: df with review sentences, sentence counts, and the helpful score
    """
    # re-applied defensively in case df was built outside join_meta_data
    df.dropna(subset=['brand', 'reviewText'], how='any', inplace=True)
    df = df[df.brand != "Unknown"]
    df['reviewSentence'] = df.apply(lambda row: sent_tokenize(row['reviewText']), axis=1)
    df['sent_length'] = df.apply(lambda row: len(row['reviewSentence']), axis=1)
    # 'helpful' is a [helpful_votes, total_votes] pair; keep the vote count.
    # literal_eval covers the case where the pair was stringified by a CSV round-trip.
    df['helpful'] = df.helpful.apply(lambda x: literal_eval(x)[0] if isinstance(x, str) else x[0])
    return df
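# Join + sentence-split sketch, continuing the example above (the meta file
# name is again hypothetical):
#
#   meta = load_meta('meta_Electronics.json.gz')
#   df = extract_sentence(join_meta_data(reviews, meta))
#   # df gains 'reviewSentence' (list of sentences), 'sent_length', and a
#   # 'helpful' column reduced to the helpful-vote count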
def top_brands(data, topn=50):
    """
    Get the top brands by review count.
    :param data: total dataframe
    :param topn: number of top brands to keep
    :return: rows belonging to the top brands, and the top brand names
    """
    grouped = data.groupby('brand')
    s = grouped.count()
    brand_name = s.sort_values('asin', ascending=False).index[:topn]
    return data[data['brand'].isin(brand_name)], brand_name
def sort_helpful_score(data, brand, score=5000):
    """
    Sort one brand's reviews by helpful score and keep at most `score` of them
    (the top 5000 by default, or all of them if there are fewer).
    :param data: total data
    :param brand: a specific brand, e.g. Samsung, Apple
    :param score: maximum number of reviews to keep
    :return: the brand's top reviews sorted by helpful score
    """
    brand_df = data[data.brand == brand]
    result = brand_df.sort_values('helpful', ascending=False)
    return result[:score]  # slicing past the end simply returns everything
def sample_data(data, brands):
    """
    Concatenate the per-brand data sorted by sort_helpful_score.
    :param data: total data
    :param brands: top brands list (from top_brands)
    :return: final sampled data for the category
    """
    top_df_list = []
    for brand in brands:
        top_df_list.append(sort_helpful_score(data, brand))
    result = pd.concat(top_df_list, axis=0)
    result.reset_index(drop=True, inplace=True)
    return result
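# Sampling sketch, assuming `df` from extract_sentence above:
#
#   top_df, brand_names = top_brands(df, topn=50)
#   sampled = sample_data(top_df, brand_names)
#   # at most 5000 reviews per brand, ordered by helpful score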
def del_element_by_indice(lst, indice):
    """
    Remove the element immediately AFTER each index listed in `indice`
    (used to drop the word that was merged into a preceding "not_" token).
    """
    new_lst = []
    for index, value in enumerate(lst):
        if index - 1 in indice:  # this element follows a recorded index, so drop it
            continue
        new_lst.append(value)
    return new_lst
def sentence_postag(reviewSentence):
    """
    POS tagging by sentence.
    tokenize: nltk.word_tokenize, then split tokens on '/', '.' and '-'
    """
    re_split = re.compile('[/.-]')
    tokenize = [nltk.word_tokenize(sent) for sent in reviewSentence]
    tokenize2 = []
    for sent in tokenize:
        sent_token = []
        for word in sent:
            if re_split.search(word):  # split tokens that contain '/', '.' or '-'
                sent_token.extend(re_split.split(word))
            else:
                sent_token.append(word)
        sent_token = [word for word in sent_token if len(word) > 0]  # drop empty strings left by splitting
        tokenize2.append(sent_token)
    tagged = nltk.pos_tag_sents(tokenize2)
    return tagged
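# Tagging sketch; the NLTK 'punkt' and 'averaged_perceptron_tagger' data must
# be downloaded first. The tags shown are illustrative, not guaranteed:
#
#   tagged = sentence_postag(["The battery-life is great."])
#   # 'battery-life' is split on '-', then tagged:
#   # [[('The', 'DT'), ('battery', 'NN'), ('life', 'NN'), ('is', 'VBZ'), ...]]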
def brand2vec_preprocess(tagged_by_Sentence):
    """
    1. remove special characters and lowercase
    2. remove numbers
    """
    st = PorterStemmer()
    re_special = re.compile('[^A-Za-z0-9]+')  # anything other than letters and digits
    re_num = re.compile('[0-9]+')             # numbers
    re_adj = re.compile('JJ')                 # adjective tags: JJ, JJR, JJS
    new_sent = []
    adjectives = []               # collect adjectives to find the most frequent ones
    total_tokens_by_review = []   # keep token lists to count tokens per brand
    for sent in tagged_by_Sentence:
        # 1, 2. drop special-character and numeric tokens, lowercase the rest
        text = [(tup[0].lower(), tup[1]) for tup in sent
                if not re_special.match(tup[0]) and not re_num.match(tup[0])]
        tokens = [tup[0] for tup in text]
        total_tokens_by_review.append(tokens)
        adjective = [st.stem(tup[0]) for tup in text if re_adj.match(tup[1])]
        adjectives.extend(adjective)
        new_sent.append([tup[0] for tup in text])
    return new_sent, adjectives, total_tokens_by_review
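# brand2vec sketch, assuming `tagged` from sentence_postag above:
#
#   sents, adjs, tokens_by_review = brand2vec_preprocess(tagged)
#   # `adjs` collects stemmed adjectives (tags JJ/JJR/JJS), e.g. 'great'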
def preprocessing(tagged_by_Sentence):
    """
    1. remove special characters and lowercase
    2. merge "not"/"n't" with the next word: not_<stemmed next word>
    3. remove numbers
    4. remove stopwords
    5. stemming
    """
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    stopwords.remove("not")   # keep negations
    stopwords.remove('very')  # keep intensifier
    stopwords.append("'m")
    stopwords.append("'s")
    re_special = re.compile('[^A-Za-z0-9]+')  # anything other than letters and digits
    re_num = re.compile('[0-9]+')             # numbers
    st = PorterStemmer()
    new_sent = []
    for sent in tagged_by_Sentence:
        # 1. drop special-character tokens, lowercase the rest
        text = [(tup[0].lower(), tup[1]) for tup in sent if not re_special.match(tup[0])]
        # 2. merge "not"/"n't" with the next word; record where the merged token
        #    lands in new_text so del_element_by_indice can drop the duplicate
        #    of the next word, which is appended right after it
        not_indice = []  # reset per sentence so indices never leak across sentences
        new_text = []
        for index, tup in enumerate(text):
            if tup[0] == "n't" or tup[0] == "not":
                if index + 1 < len(text) and text[index + 1][1] != 'CD':
                    new_text.append("not_" + st.stem(text[index + 1][0]))
                    not_indice.append(len(new_text) - 1)
                else:
                    new_text.append("not")
            elif not re_num.match(tup[0]) or tup[1] != 'CD':  # 3. drop numeric tokens tagged CD
                new_text.append(tup[0])
        new_text = del_element_by_indice(new_text, not_indice)
        # 4, 5. remove stopwords, then stem
        new_words = [st.stem(word) for word in new_text if word not in stopwords]
        new_sent.append(new_words)
    return new_sent
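
# Minimal smoke test of the tagging + negation-merging pipeline. Assumes the
# NLTK 'punkt' and 'averaged_perceptron_tagger' data are installed
# (nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')).
# The sample reviews are made up for illustration only.
if __name__ == "__main__":
    sample = ["This phone is not bad at all.", "The battery doesn't last very long."]
    tagged = sentence_postag(sample)
    print(preprocessing(tagged))
    # negations get merged with the stemmed next word,
    # e.g. "not bad" -> "not_bad", "n't last" -> "not_last"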