-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_doc_bert.py
277 lines (246 loc) · 10.5 KB
/
parse_doc_bert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# encoding=utf8
import os
import jieba
import re
import pickle
import numpy as np
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import Parser
from pyltp import NamedEntityRecognizer
from pyltp import SentenceSplitter
from scipy.spatial.distance import cosine
from bert_serving.client import BertClient
cws_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'cws.model')
pos_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'pos.model')
par_model_path = os.path.join(
os.path.abspath('./'), 'ltp_Model', 'parser.model')
ner_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'ner.model')
say_words_path = os.path.join(os.path.abspath(
'./'), 'data', 'saying_words.pickle')
segmentor = Segmentor() # 分词
postagger = Postagger() # 词性标注
recognizer = NamedEntityRecognizer() # 命名主体识别
parser = Parser() # 依存分析
segmentor.load(cws_model_path)
postagger.load(pos_model_path)
recognizer.load(ner_model_path)
parser.load(par_model_path)
# load saying words
say_words = pickle.load(open(say_words_path, 'rb'))
# 句子依存分析
def parsing(sentence):
words = segmentor.segment(sentence) # pyltp分词
postags = postagger.postag(words) # 词性标注
arcs = parser.parse(words, postags) # 句法分析
return arcs
# 命名实体识别
def get_name_entity(sentence):
# sentence = ''.join(strs)
words = segmentor.segment(sentence)
postags = postagger.postag(words) # 词性标注
netags = recognizer.recognize(words, postags) # 命名实体识别
return netags
def get_pos(sentence):
# sentence = ''.join(strs)
words = segmentor.segment(sentence)
postags = postagger.postag(words) # 词性标注
return postags
# 输入主语第一个词语、谓语、词语数组、词性数组,查找完整主语 find complete subject
# def get_name(name, predic, words, property, ne):
# index = words.index(name)
# cut_property = property[index + 1:] # 截取到name后第一个词语
# pre = words[:index] # 前半部分
# pos = words[index+1:] # 后半部分
# # 向前拼接主语的定语
# l = len(pre)-1
# for index, values in enumerate(pre):
# w_index = l - index
# w = pre[w_index]
# if property[w_index] == 'ADV':
# continue
# if property[w_index] in ['ATT', 'SBV']: # SBV?
# name = w + name
# else:
# break
# return name
# 获取谓语之后的言论
def get_saying(sentence, wp, heads, pos):
# word = sentence.pop(0) #谓语
# if ':' in sentence:
# # print('hear I am')
# return ''.join(sentence[sentence.index(':') + 1:])
proper = [w.relation for w in wp]
while pos < len(sentence):
w = sentence[pos]
p = proper[pos]
h = heads[pos]
# print('tttt:',w)
# 谓语尚未结束
if p in ['DBL', 'CMP', 'RAD']:
pos += 1
continue
# 定语
# if p == 'ATT' and proper[h - 1] != 'SBV':
# pos = h
# continue
# 宾语:直接宾语,间接宾语
if p in ['VOB', 'IOB']:
pos += 1
continue
# 处理报道<称>,提醒<说>这种情况
if w == '说' or w == '称':
pos += 1
continue
else:
if w == ',' or w == ':': # 说后面逗号跟着言论
return ''.join(sentence[pos + 1:])
else: # 说后面直接跟着言论
return ''.join(sentence[pos:])
# parse sentense
# boolean allnoun 控制是否把说前面的所有名字都作为name提取出来
def get_name(k, v, cuts,ne,pos,allnoun ):
name=''
saying=''
names=[]
if ne[k].__contains__('-N'):
if ne[k] in ['S-Ns', 'S-Ni', 'S-Nh']:
name = cuts[k]
# 如果主语是实体的第一部分,则从左到右搜索
elif ne[k].startswith('B-N'):
names = [cuts[k]]
for w, n in zip(cuts[k+1:], ne[k+1:]):
if n.startswith('I-N'):
names.append(w)
elif n.startswith('E-N'):
names.append(n)
break
# 如果主语是实体的最后一部分,则反响搜索
elif ne[k].startswith('E-N'):
names = [cuts[k]]
l = len(ne[:k])-1
for index, _ in enumerate(zip(cuts[:k], ne[:k])):
if ne[l-index].startswith('I-N'):
names.insert(0,cuts[l-index])
elif ne[l-index].startswith('B-N'):
names.insert(0,cuts[l-index])
break
name = ''.join(names)
name = ''.join(re.findall('\w+', name))
if allnoun and (not name) and pos[k] in ['n','nh','ni','nl','ns','nz'] :
name = cuts[k]
return name
def parse_sentence(sentence, ws=False):
cuts = list(segmentor.segment(sentence)) # pyltp分词
if not cuts:
return False
pos = get_pos(sentence)
ne = list(get_name_entity(sentence))
wp = parsing(sentence) # 依存分析
for k, v in enumerate(wp):
# 确定第一个主谓句,notice that parsed word_relation list index starts from 1
if v.relation == 'SBV' and (cuts[v.head-1] in say_words):
# get name entity
name = get_name(k, v, cuts,ne,pos,False)
if name :
saying = get_saying(cuts, wp, [
i.head for i in wp], v.head)
return name, saying
# name_saying_pairs = re.findall('(\w+)?:([^。!?;]+)',sentence)
# if name_saying_pairs: return name_saying_pairs[0][0],name_saying_pairs[0][1]
return False
# split doc into sentences
def split_sentence(doc):
# 提取类似“不要怕”黎明说,“天会亮”,并把前后的言论都合并到后面
# 黎明说,“不要怕,天会亮”
for_back_sens = re.findall(r'“(?:[^“]+)”(?:[^“\r\n]+),“(?:[^“]+)”',doc)
try:
for_back_sens = list(map(lambda x : re.findall(r'”(.*?)“',x)[0] + '“'+
''.join(re.findall(r'“(.*?)”',x)) + '”',for_back_sens))
except IndexError :
print(r'index error')
# 从原文中移除前后皆有的言论
doc = re.sub(r'“(?:[^“]+)”(?:[^“\r\n]+),“(?:[^“]+)”','',doc)
# replace conditions like 。” with mark <period>”。
doc = re.sub('(。|!|!|?|\.|\?)”', '<period>”。', doc)
doc = re.sub('"', '', doc)
# handle differet kinds of \r\n
doc = re.sub('(\n\r)|(\n)|(\r)', '\r\n<para_start>', doc)
# split doc into sentences
sens = re.split('[。!!\.?\?\r\n]', doc)
sens = list(filter(lambda x: len(x) > 3, sens))
sens = list(filter(str.strip, sens))
sens = list(filter(lambda x: not (
(x.startswith('<para_start>') and len(x) < 15)), sens))
# merge sentences between quotes “”
new = []
for index, s in enumerate(sens):
if ('“' in s) and (s.count('“')-s.count('”') == 1):
for index1, s1 in enumerate(sens[index + 1:]):
if s1.startswith('<para_start>'):
break
if ('”' in s1) and (s1.count('”')-s1.count('“') == 1):
new.append(','.join(sens[index: index + 1 + index1 + 1]))
sens[index: index + 1 + index1 +
1] = ['processed' for i in range(1 + index1 + 1)]
break
else:
if s != 'processed':
new.append(s)
# remove sentense with only '<para_start>' as content
new1 = []
while new:
s = new.pop(0)
if ((len(str.strip(s)) > 0) and s != '<para_start>'):
new1.append(s)
# add periods after last right quote for better entity recognition
doc_perods = ('<perods>'.join(new1)).replace('<period>”', '。”')
new1 = doc_perods.split('<perods>')
return new1 + for_back_sens
def clean_name_beta(name):
cuts = list(jieba.cut(name))
name = ''.join([cuts[index] for index, a in enumerate(
list(get_name_entity(name))) if 'N' in list(a)])
return name
def sents_similar(sen1, sen2,bc):
sens_embedings = bc.encode([sen1,sen2])
return cosine(sens_embedings[0],sens_embedings[1])
# parse documents, which will be spiltted into sentenses
def parse_doc(threshhold, doc,bc):
result = []
sens = split_sentence(doc)
original_sens = sens.copy()
original_index = 0
while sens:
first_sen = sens.pop(0)
first_sen = first_sen.replace('<para_start>', '')
saying = parse_sentence(first_sen, ws=False)
if saying and saying[1]:
saying_index = original_index
extended_saying = saying[1]
# check the followding setence, if they have the same meaning as the previouse one, then concatenate as the same saying
temp_sens = [first_sen] + sens # add first_sens for easy comparing
for index, pre in enumerate(temp_sens):
if index == len(temp_sens)-1:
break
if (not temp_sens[index + 1].startswith('<para_start>')) and \
sents_similar(pre,temp_sens[index + 1],bc) < threshhold:
extended_saying = extended_saying + \
',' + temp_sens[index + 1]
sens.pop(0)
original_index += 1
else:
break
# saying[0] indicates the one gives comment
result.append((saying[0], extended_saying, saying_index))
original_index += 1
return result
def test(doc,bc):
result = parse_doc(0.15, doc,bc)
for index, s in enumerate(result):
print(index, s, '\n')
if __name__ == '__main__':
bc = BertClient()
doc = '''据南方周末报道,“如果我们对这个人没有其他了解,我们知道他的优先事项完全是个人的、自恋的。我们知道,他并不担心国家的稳定或安全,鉴于这一系列心理现实,我们需要一个比我们所能保证的任何程度的信心更加坚定的进程,才能确保我们将在明年1月顺利、和平地过渡到一位新总统。”'''
test(doc,bc)
# print(list(get_pos(doc)),'\n',list(segmentor.segment(doc)))