# coding: utf-8
# In[617]:
import multiprocessing
import re
import numpy as np
import pandas as pd
# cmudict: its entries method lists every word together with its phonemes
import nltk
from nltk.corpus import stopwords  # provides stopwords.words("english")
from nltk.corpus import cmudict
# Import the two scikit-learn classes needed to compute tf-idf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# In[618]:
# Use the dict (hash-map) version of cmudict; lookups are much faster
phonetic_check_dict = cmudict.dict()
# In[619]:
# cmudict.entries() is a list whose elements are tuples: tuple[0] is the word
# (or letter) and tuple[1] is the corresponding phonemes
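# Illustrative shape of the lookup (pronunciations quoted from CMUdict; treat
# the exact phonemes as an assumption of this sketch):
#   phonetic_check_dict["fire"] -> [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]
# so len(phonetic_check_dict["fire"][0]) == 3 phonemes for the first pronunciation.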
# In[620]:
def get_data(filename):
    data_origin = pd.read_csv(filename, encoding='utf-8')
    return data_origin
# In[621]:
data_origin = pd.read_csv("./df.csv")
# In[622]:
def preprocessing_the_sents(sents_string):
    sents_string = re.sub(r"\?", "", sents_string)
    sents_string = re.sub(r"\.", "", sents_string)
    sents_string = re.sub(r"!", "", sents_string)
    sents_string = re.sub(r";", "", sents_string)
    sents_string = re.sub(r":", "", sents_string)
    sents_string = re.sub(r"-", " ", sents_string)  # Dashes must become spaces, not be dropped
    sents_string = re.sub(r",", "", sents_string)
    # Quotes wrapping the whole string should also go, but inner apostrophes
    # (e.g. "it's") must be kept
    sents_string = re.sub(r"(?<!.)'(?=.)", "", sents_string)
    sents_string = re.sub(r"(?<=.)'(?!.)", "", sents_string)
    # Double quotes and curly quotes
    sents_string = re.sub(r'"', '', sents_string)
    sents_string = re.sub(r'‘', '', sents_string)
    # Other stray symbols
    sents_string = re.sub(r"\*", "", sents_string)
    sents_string = re.sub(r"&", "", sents_string)
    sents_string = re.sub(r"#", "", sents_string)
    # Finally collapse repeated spaces
    sents_string = re.sub(r" +", " ", sents_string)
    # Lowercase everything to match cmudict's keys
    sents_string = sents_string.lower()
    # Split the string into a list of words
    sents_string_list = sents_string.split(" ")
    return sents_string_list
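# Illustrative run (assumed input, traced by hand through the rules above):
#   preprocessing_the_sents("Hello, world! It's half-done.")
#   -> "hello world it's half done"
#   -> ['hello', 'world', "it's", 'half', 'done']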
# In[623]:
# Apply the cleaning function to every sentence
data_processed_list = data_origin["text"].apply(preprocessing_the_sents)
# In[624]:
# Join the result back onto the original frame
data_origin = pd.concat([data_origin, data_processed_list], axis = 1)
data_origin.columns = ['key', 'text', 'text_list']
# In[625]:
# test_passed
# Build two versions of the features: one with stopwords stripped, one without
def strip_the_stopwords(words_list):
    # Keep a word only if it is not a stopword; cache the stopword list as a
    # set, since querying stopwords.words("english") once per word is slow
    english_stopwords = set(stopwords.words("english"))
    return [word for word in words_list if word not in english_stopwords]
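# Illustrative run (stopword membership assumed from NLTK's English list,
# where "is" and "the" are stopwords):
#   strip_the_stopwords(['hello', 'world', 'is', 'the', 'best'])
#   -> ['hello', 'world', 'best']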
# In[626]:
# Count the words
def count_words(words_list):
    return len(words_list)
# In[627]:
# Count the phonemes of each word list
# In[628]:
# Return two values as one tuple; two applies later split the tuple into two columns
def count_phonetics(words_list):
    phonetic_length = 0
    exception_list = []
    for word in words_list:
        # Words that are not in cmudict go into exception_list
        try:
            # Count the phonemes of the first listed pronunciation
            phonetic_length += len(phonetic_check_dict[word][0])
        except KeyError:
            exception_list.append(word)
    return phonetic_length, exception_list
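# Illustrative run ("zzzqqq" is a made-up word assumed absent from cmudict;
# "hello" is assumed to map to ['HH', 'AH0', 'L', 'OW1']):
#   count_phonetics(["hello", "zzzqqq"]) -> (4, ["zzzqqq"])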
# In[629]:
arrays_with_words_count_SW = data_origin["text_list"].apply(count_words)
# In[630]:
arrays_with_words_count_SW.name = "series_with_words_count_SW"
# In[631]:
data_origin = pd.concat([data_origin, arrays_with_words_count_SW], axis=1)
# In[632]:
arrays_with_count_and_exception = data_origin["text_list"].apply(count_phonetics)
# In[633]:
series_with_phonetic_counts = arrays_with_count_and_exception.apply(lambda x:x[0])
# In[634]:
series_with_phonetic_counts.name = "series_with_phonetic_counts_with_SW"
# In[635]:
series_with_exception_list = arrays_with_count_and_exception.apply(lambda x:x[1])
# In[636]:
series_with_exception_list.name = "series_with_exception_list_with_SW"
# In[637]:
data_origin = pd.concat([data_origin, series_with_phonetic_counts], axis=1)
data_origin = pd.concat([data_origin, series_with_exception_list], axis=1)
# In[638]:
text_list_without_stopwords = data_origin["text_list"].apply(strip_the_stopwords)
# In[639]:
text_list_without_stopwords.name = "text_list_without_stopwords"
# In[640]:
data_origin = pd.concat([data_origin, text_list_without_stopwords], axis = 1)
# In[641]:
arrays_with_words_count_no_SW = data_origin["text_list_without_stopwords"].apply(count_words)
# In[642]:
arrays_with_words_count_no_SW.name = "series_with_words_count_no_SW"
# In[643]:
data_origin = pd.concat([data_origin, arrays_with_words_count_no_SW], axis=1)
# In[644]:
arrays_with_count_and_exception = data_origin["text_list_without_stopwords"].apply(count_phonetics)
# In[645]:
series_with_phonetic_counts_no_SW = arrays_with_count_and_exception.apply(lambda x:x[0])
# In[646]:
series_with_phonetic_counts_no_SW.name = "series_with_phonetic_counts_no_SW"
# In[647]:
series_with_exception_list_no_SW = arrays_with_count_and_exception.apply(lambda x:x[1])
# In[648]:
series_with_exception_list_no_SW.name = "series_with_exception_list_no_SW"
# In[649]:
data_origin = pd.concat([data_origin, series_with_phonetic_counts_no_SW], axis=1)
data_origin = pd.concat([data_origin, series_with_exception_list_no_SW], axis=1)
# In[650]:
def get_the_corpus(df_origin):
    # Rebuild the corpus from the cleaned columns of the given frame
    corpus_with_SW = [sents for sents in df_origin["text_list"].apply(lambda x: " ".join(x))]
    corpus_without_SW = [sents for sents in df_origin["text_list_without_stopwords"].apply(lambda x: " ".join(x))]
    return corpus_with_SW, corpus_without_SW
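# Illustrative shape (hypothetical rows): both corpora are plain lists of
# space-joined strings, one string per row, e.g.
#   corpus_with_SW    -> ["hello to the world", ...]
#   corpus_without_SW -> ["hello world", ...]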
# In[651]:
# This function is run by two processes at once
def cal_tf_idf(corpus):
    # Vectorize: turn the corpus into per-document term counts
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    # This does not compute tf-idf itself; it just exposes the vocabulary
    # (renamed get_feature_names_out in newer scikit-learn)
    word = vectorizer.get_feature_names()
    # Define the transformer
    transformer = TfidfTransformer()
    tf_idf = transformer.fit_transform(X)
    # Get the dense array, then rebuild Series for re-joining later;
    # to keep the raw rows, convert them to str before storing in a Series
    array_checked = tf_idf.toarray()
    # Keep the raw rows
    check_list = pd.Series(map(str, array_checked))
    # Compute the max, min and mean of each row
    max_of_texts = pd.Series(map(max, array_checked))
    min_of_texts = pd.Series(map(min, array_checked))
    mean_of_texts = pd.Series(map(np.mean, array_checked))
    # Extra feature: a customized tf-idf, which only needs this corpus
    splited_words_length_list = list(map(lambda x: x.split(" "), corpus))
    each_sents_words_count = pd.Series(map(len, splited_words_length_list))
    # With the per-sentence word counts done, sum each tf-idf row, then divide
    # the two Series to get the desired value
    tf_idf_sum_Series = pd.Series(map(sum, array_checked))
    # The quotient Series is the customized tf-idf
    customized_tf_idf_result_series = tf_idf_sum_Series / each_sents_words_count
    return max_of_texts, min_of_texts, mean_of_texts, customized_tf_idf_result_series
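# Worked example of the customized value (illustrative numbers): if a row of
# the tf-idf matrix is [0.0, 0.6, 0.8] and its sentence has 2 words, then
#   customized = (0.0 + 0.6 + 0.8) / 2 = 0.7
# i.e. the sum of the row's tf-idf weights divided by the sentence's word count.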
# In[652]:
def mp_cal_td_idf(target_func, corpus_with_SW, corpus_without_SW):
    # When building the tasks, the key is the process name and the value is the
    # process's args, since the target is always cal_tf_idf
    tasks = {"td_idf_SW": corpus_with_SW, "td_idf_no_SW": corpus_without_SW}
    # The data is not chunked, so two processes are enough
    processes_pool = multiprocessing.Pool(processes=2)
    processes_result = list()
    for task_name, task_args in tasks.items():
        # Submit the two tasks dynamically and store the results in order;
        # apply_async is non-blocking, while plain apply would block.
        # The single positional argument must be passed as args=(task_args, )
        result = processes_pool.apply_async(target_func, args=(task_args, ))
        processes_result.append(result)
    processes_pool.close()  # Close the pool: no new tasks may be submitted
    processes_pool.join()   # Block the main process until all pooled workers return
    # Return the two pooled results; each one wraps a tuple of four Series
    td_idf_with_SW = processes_result[0]
    td_idf_no_SW = processes_result[1]
    # The caller must use .get() on these returned objects to obtain the values
    return td_idf_with_SW, td_idf_no_SW
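# Minimal sketch of consuming the return value (mirrors the retrieval below;
# the variable names here are illustrative only):
#   with_SW_async, no_SW_async = mp_cal_td_idf(cal_tf_idf, c1, c2)
#   max_SW, min_SW, mean_SW, custom_SW = with_SW_async.get()  # .get() blocks until ready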
# In[653]:
def get_td_idf(df_origin):
    # First build the corpus
    corpus_sets = get_the_corpus(df_origin)
    corpus_SW = corpus_sets[0]
    corpus_no_SW = corpus_sets[1]
    # Then produce the results with multiprocessing;
    # each result wraps a (max, min, mean, customized) tuple
    result_set = mp_cal_td_idf(cal_tf_idf, corpus_SW, corpus_no_SW)
    return result_set
# In[654]:
# This produces eight tf-idf-related columns:
# the first big tuple holds the with-SW version,
# the second big tuple the no-SW version;
# inside each tuple the order is max, min, mean,
# and the last element is the customized Series
tf_idf_set = get_td_idf(data_origin)
# In[667]:
tf_idf_SW = tf_idf_set[0].get()
tf_idf_max_SW = tf_idf_SW[0]
tf_idf_min_SW = tf_idf_SW[1]
tf_idf_mean_SW = tf_idf_SW[2]
tf_idf_customized_SW = tf_idf_SW[3]
tf_idf_no_SW = tf_idf_set[1].get()
tf_idf_max_no_SW = tf_idf_no_SW[0]
tf_idf_min_no_SW = tf_idf_no_SW[1]
tf_idf_mean_no_SW = tf_idf_no_SW[2]
tf_idf_customized_no_SW = tf_idf_no_SW[3]
# In[669]:
tf_idf_max_SW.name = "tf_idf_max_SW"
tf_idf_min_SW.name = "tf_idf_min_SW"
tf_idf_mean_SW.name = "tf_idf_mean_SW"
tf_idf_max_no_SW.name = "tf_idf_max_no_SW"
tf_idf_min_no_SW.name = "tf_idf_min_no_SW"
tf_idf_mean_no_SW.name = "tf_idf_mean_no_SW"
tf_idf_customized_SW.name = "tf_idf_customized_SW"
tf_idf_customized_no_SW.name = "tf_idf_customized_no_SW"
# In[671]:
data_origin = pd.concat([data_origin, tf_idf_max_SW], axis = 1)
data_origin = pd.concat([data_origin, tf_idf_min_SW], axis = 1)
data_origin = pd.concat([data_origin, tf_idf_mean_SW], axis = 1)
data_origin = pd.concat([data_origin, tf_idf_max_no_SW], axis = 1)
data_origin = pd.concat([data_origin, tf_idf_min_no_SW], axis = 1)
data_origin = pd.concat([data_origin, tf_idf_mean_no_SW], axis = 1)
data_origin = pd.concat([data_origin, tf_idf_customized_SW], axis = 1)
data_origin = pd.concat([data_origin, tf_idf_customized_no_SW], axis = 1)
# In[672]:
# Replace empty lists (e.g. empty exception lists) with None
data_final = data_origin.applymap(lambda x: x if x != [] else None)
# In[674]:
data_final.to_csv("dsr_preprocessed.csv", encoding="utf-8")