def format_litigants(self, plaintiffs, defendants, litigants):
    """Clean the raw party strings and split them into filtered name lists.

    Every defendant keyword, then every plaintiff keyword, then every
    configured strip/replace/format string is deleted from the three input
    strings. The cleaned strings are split on ``self.litiants_seps`` and
    names whose unicode length lies outside
    ``[self.min_litigant_len, self.max_litigant_len]`` are discarded.
    A plaintiff that also appears among the defendants is dropped. When no
    litigant survives, the litigant list is rebuilt from the plaintiffs
    followed by the defendants.

    Returns:
        ``(litigant_list, litigants, plaintiff_list, defendant_list)``;
        ``litigant_list`` is sorted and ``litigants`` is that list joined
        with ``","``.
    """
    keyword_tokens = (list(ktgg_conf.defendant_keyword_list) +
                      list(ktgg_conf.plaintiff_keyword_list))
    noise_tokens = [
        unicode(token)
        for token in (self.strip_list + ktgg_conf.replace_str_list +
                      ktgg_conf.format_str_list)
    ]
    # Role keywords are erased as configured; only the noise strings are
    # converted to unicode before being erased.
    for token in keyword_tokens + noise_tokens:
        litigants = litigants.replace(token, "")
        plaintiffs = plaintiffs.replace(token, "")
        defendants = defendants.replace(token, "")

    def _length_ok(name):
        # Empty names are rejected before their length is measured.
        if not name:
            return False
        size = len(unicode(name))
        return self.min_litigant_len <= size <= self.max_litigant_len

    raw_plaintiffs = toolsutil.my_split(plaintiffs, self.litiants_seps)
    raw_defendants = toolsutil.my_split(defendants, self.litiants_seps)
    raw_litigants = toolsutil.my_split(litigants, self.litiants_seps)

    defendant_list = [name for name in raw_defendants if _length_ok(name)]
    plaintiff_list = [
        name for name in raw_plaintiffs
        if _length_ok(name) and name not in defendant_list
    ]
    litigant_list = [name for name in raw_litigants if _length_ok(name)]

    if not litigant_list:
        litigant_list = plaintiff_list + defendant_list

    litigant_list = sorted(litigant_list)
    return litigant_list, ",".join(litigant_list), plaintiff_list, defendant_list
def replace_field(self, data_list, extract_data):
    """Map the raw column names of penalty records onto canonical field names.

    Each key of every record in ``data_list`` is converted to unicode,
    stripped of the separator characters listed below and looked up in
    ``penalty_conf.name_replace_map``; keys without a mapping are dropped.
    A key that maps to ``execute_authority_time`` is split into the two
    fields ``execute_authority`` and ``penalty_time``. ``province`` and
    ``publish_time`` are copied from ``extract_data`` into every record.

    Args:
        data_list: iterable of dicts (raw column name -> value).
        extract_data: dict supplying ``province`` and ``publish_time``.

    Returns:
        A list holding one mapped dict per input record.
    """
    province = extract_data.get('province')
    publish_time = extract_data.get('publish_time')
    # NOTE(review): some entries are duplicates as written; full-width
    # variants may have been intended - confirm against the original file.
    sep_list = ['\t', ' ', ',', ' ', ',', ':', ":"]

    entity_data_list = []
    for data in data_list:
        entity_data = {}
        for key, value in data.items():
            key = unicode(key)
            for sep in sep_list:
                key = key.replace(sep, '')
            # `in` instead of dict.has_key(), which Python 3 removed.
            if key not in penalty_conf.name_replace_map:
                continue
            real_key = penalty_conf.name_replace_map[key]
            if real_key != 'execute_authority_time':
                entity_data[real_key] = value
                continue
            # Combined "authority + time" cell: split it into two fields.
            temp_list = toolsutil.my_split(value, sep_list)
            if len(temp_list) == 2:
                entity_data['execute_authority'] = temp_list[0]
                entity_data['penalty_time'] = temp_list[1]
            else:
                # Authority is everything up to and including the first
                # u'局'; find() returns -1 when it is absent, which yields
                # an empty authority.
                entity_data['execute_authority'] = value[:value.find(u'局') + 1]
                entity_data['penalty_time'] = \
                    self.parser_tool.date_parser.get_date_list(value)
        entity_data['province'] = province
        entity_data['publish_time'] = publish_time
        entity_data_list.append(entity_data)
    return entity_data_list
def get_money_list(self, judge_content):
    """Return the largest amount and the total of all amounts in a text.

    Chinese-numeral amounts are searched clause by clause
    (``self.money_regex_chs``) and digit amounts over the whole text
    (``self.money_regex``). Only when both searches find nothing is
    ``self.money_regex_last`` tried as a fallback.

    Returns:
        ``(max_money, sum_money)``; ``(0, 0)`` when no amount is found.
    """
    clauses = toolsutil.my_split(judge_content, [',', ',', '。'])
    digit_hits = toolsutil.re_findall(self.money_regex,
                                      unicode(judge_content))

    amounts = []
    for clause in clauses:
        chs_hit = toolsutil.re_findone(self.money_regex_chs, unicode(clause))
        if not chs_hit:
            continue
        converted = self.money_parser.trans_chs_money(chs_hit)
        amounts.append(float(converted[0]))

    for hit in digit_hits or []:
        amounts.append(float(self.money_parser.transfer_money(hit)[0]))

    if not amounts:
        fallback_hits = toolsutil.re_findall(self.money_regex_last,
                                             unicode(judge_content))
        for hit in fallback_hits or []:
            amounts.append(float(self.money_parser.transfer_money(hit)[0]))

    if not amounts:
        return 0, 0
    return max(amounts), sum(amounts)
def get_plaintiff_defendant(self, litigants_content, case_id, court,
                            case_cause_list):
    """Separate plaintiffs and defendants inside a raw litigants text.

    The case causes, the case id and the court name are removed from the
    text first. The remainder is handed to
    ``self.get_plaintiff_defendant_bykeyword``; when that finds neither
    side, the text is split on ``ktgg_conf.litigants_sep_list`` and, if
    exactly two parts result, the first is taken as the plaintiffs and the
    second as the defendants.

    Args:
        litigants_content: text expected to contain the parties.
        case_id: case number to strip from the text; ``None`` or empty is
            treated as "nothing to strip".
        court: court name to strip from the text; may be ``None``.
        case_cause_list: case causes to strip from the text.

    Returns:
        ``(plaintiffs, defendants, litigants)`` as strings. When nothing
        can be separated, ``litigants`` is the cleaned text itself.
    """

    def _ascii_parens(text):
        # Normalise full-width parentheses so the case id can be found in
        # the text whichever form either string uses.
        return text.replace(u'\uff08', u'(').replace(u'\uff09', u')')

    # 1. Strip case causes, case id and court name from the text.
    for cause in case_cause_list:
        if cause:
            litigants_content = litigants_content.replace(cause, '')

    # case_id may be None when no case number could be parsed; guard it the
    # same way the court name is guarded.
    case_id = _ascii_parens(case_id or '')
    litigants_content = _ascii_parens(litigants_content)
    litigants_content = litigants_content.replace(case_id, '').replace(
        court or '', '')

    keyword_content = litigants_content
    for replace_str in ktgg_conf.replace_str_list:
        keyword_content = keyword_content.replace(replace_str, "")
    plaintiffs, defendants, litigants = \
        self.get_plaintiff_defendant_bykeyword(keyword_content)

    # 2. Nothing found by role keyword: split on the configured separators.
    if plaintiffs == "" and defendants == "":
        parts = toolsutil.my_split(litigants_content,
                                   ktgg_conf.litigants_sep_list)
        if len(parts) == 2:
            plaintiffs = parts[0]
            defendants = parts[1]
            litigants = ','.join(parts)
        else:
            litigants = litigants_content
    return plaintiffs, defendants, litigants
def format_extract_data(self, extract_data, topic_id):
    """Entity-parsing entry point: enrich a copy of ``extract_data``.

    Plaintiffs and defendants are taken from the ``plaintiffs`` and
    ``defendants`` strings when both are non-empty. Otherwise they come
    from the result of parsing ``bulletin_content``, when that key is
    present. The input dict is not modified.

    Args:
        extract_data: dict with the extracted fields.
        topic_id: not used by this method.

    Returns:
        A deep copy of ``extract_data`` with ``norm_bulletin_content``,
        ``case_id``, ``case_cause``, ``entity_list``, ``plaintiff_list``,
        ``defendant_list``, ``bulletin_type``, ``litigant_list`` and
        ``litigants`` set. Fields the parser did not supply are ``None``.
    """
    entity_data = copy.deepcopy(extract_data)

    # NOTE(review): province is computed but never stored or returned; the
    # call is kept in case it is relied on - confirm whether
    # entity_data["province"] was meant to be set.
    province = None
    if "court" in entity_data:
        court = extract_data.get('court')
        province = self.parser_tool.province_parser.get_province(court)

    defendants = extract_data.get("defendants", "")
    plaintiffs = extract_data.get("plaintiffs", "")
    plaintiff_list = []
    defendant_list = []
    if defendants and plaintiffs:
        plaintiff_list = toolsutil.my_split(plaintiffs, self.litiants_seps)
        defendant_list = toolsutil.my_split(defendants, self.litiants_seps)

    info = {}
    if "bulletin_content" in entity_data:
        content = extract_data.get('bulletin_content')
        case_cause = self.parser_tool.case_cause_parser.get_case_cause(content)
        case_id = self.parser_tool.caseid_parser.get_case_id(content)
        # A parser returning None must not break the assignments below.
        info = self.parser_tool.fygg_parser.do_parser(content) or {}
        info["case_cause"] = case_cause
        info["case_id"] = case_id

    if not defendant_list and not plaintiff_list:
        defendant_list = info.get("defendant_list") or []
        plaintiff_list = info.get("plaintiff_list") or []

    # Sorted so the joined string does not depend on set iteration order.
    litigant_list = sorted(set(plaintiff_list + defendant_list))
    litigants = ','.join(litigant_list)

    entity_data["norm_bulletin_content"] = info.get("norm_content")
    entity_data["case_id"] = info.get("case_id")
    entity_data["case_cause"] = info.get("case_cause")
    entity_data["entity_list"] = info.get("entities")
    entity_data["plaintiff_list"] = plaintiff_list
    entity_data["defendant_list"] = defendant_list
    entity_data["bulletin_type"] = info.get("bulletin_type")
    entity_data["litigant_list"] = litigant_list
    entity_data["litigants"] = litigants
    return entity_data
def format_litigants(self, plaintiff_list, defendant_list, litigants):
    """Filter plaintiffs and defendants against the litigants string.

    ``litigants`` is split on ``self.seps``. A plaintiff or defendant is
    kept only if it occurs in that split result and, once converted to
    unicode, contains none of ``self.wenshu_conf.wrong_str_list``.

    Returns:
        ``(litigant_list, plaintiff_list, defendant_list)``.
    """
    litigant_list = toolsutil.my_split(litigants, self.seps)
    rejected_fragments = list(self.wenshu_conf.wrong_str_list)

    def _screen(names):
        accepted = []
        for name in names:
            if name not in litigant_list:
                continue
            text = unicode(name)
            if any(fragment in text for fragment in rejected_fragments):
                continue
            accepted.append(text)
        return accepted

    return litigant_list, _screen(plaintiff_list), _screen(defendant_list)
def get_court(self, content):
    """Return the court name found in ``content``.

    The text is split on ``self.seps`` and each non-empty segment is
    matched against ``self.court_regex``. The first match no longer than
    ``self.max_court_length`` wins. When no segment qualifies, the result
    of ``self.get_court2nd`` on the whole text is returned.
    """
    text = unicode(content)
    for segment in toolsutil.my_split(text, self.seps):
        if not segment:
            continue
        candidate = toolsutil.re_find_one(self.court_regex, unicode(segment))
        if candidate and len(candidate) <= self.max_court_length:
            return candidate
    return self.get_court2nd(text)
def format_extract_data(self, extract_data, topic_id):
    """Add amount and province information to a copy of ``extract_data``.

    Amounts are read from the ``duty`` field with spaces removed:

    * Chinese-numeral amounts (``self.money_regex_chs``) per clause, plus
      digit amounts (``self.money_regex``) over the whole text;
    * if none were found, ``self.money_regex_last`` over the whole text;
    * if still none, ``self.money_regex3``, accepted only when the match
      equals the whole text.

    Args:
        extract_data: dict with the extracted fields; not modified.
        topic_id: not used by this method.

    Returns:
        A deep copy of ``extract_data`` with ``max_money``, ``sum_money``
        (both ``0`` when no amount is found) and ``province`` set.
    """
    entity_data = copy.deepcopy(extract_data)

    # "duty" may be present but None, or not a string at all; convert to
    # text before stripping spaces instead of calling .replace() on it.
    duty = extract_data.get("duty")
    duty_text = unicode("" if duty is None else duty).replace(" ", "")

    clauses = toolsutil.my_split(duty_text, [',', ',', '。'])
    digit_hits = toolsutil.re_findall(self.money_regex, duty_text)

    money_list = []
    for clause in clauses:
        chs_hit = toolsutil.re_findone(self.money_regex_chs, unicode(clause))
        if chs_hit:
            chs_money = self.parser_tool.money_parser.trans_chs_money(chs_hit)
            money_list.append(float(chs_money[0]))

    for hit in digit_hits or []:
        digit_money = self.parser_tool.money_parser.transfer_money(hit)
        money_list.append(float(digit_money[0]))

    if not money_list:
        fallback_hits = toolsutil.re_findall(self.money_regex_last, duty_text)
        for hit in fallback_hits or []:
            digit_money = self.parser_tool.money_parser.transfer_money(hit)
            money_list.append(float(digit_money[0]))

    if not money_list:
        # Last resort: the whole field is itself a bare number.
        whole_match = toolsutil.re_findone(self.money_regex3, duty_text)
        if whole_match and whole_match == duty_text:
            money_list.append(float(whole_match))

    court = entity_data.get("court", "")
    province = self.parser_tool.province_parser.get_province(court)

    entity_data["max_money"] = max(money_list) if money_list else 0
    entity_data["sum_money"] = sum(money_list) if money_list else 0
    entity_data["province"] = province
    return entity_data
def get_court_place(self, content):
    '''Extract the hearing location from ``content``.

    Spaces are removed from the text, which is first handed to
    ``self.parser_tool.court_place_parser``. Only when that yields nothing
    are the segments of the text (split on ``self.seps``) scanned with
    ``self.court_place_regex``.

    Returns the location found, or the falsy value produced by the parser
    when nothing matches.
    '''
    content = unicode(content).replace(" ", "")
    content_list = toolsutil.my_split(content, self.seps)
    court_place = self.parser_tool.court_place_parser.get_court_place(
        content)
    if not court_place:
        for row_content in content_list:
            court_place_list = toolsutil.re_findone(
                self.court_place_regex, unicode(row_content))
            if court_place_list:
                # Take the first non-empty item of the match result
                # (presumably a tuple of regex groups - TODO confirm).
                for item in court_place_list:
                    if item:
                        court_place = item
                        break
            # A candidate shorter than self.court_place_len is accepted:
            # the configured noise strings are removed and the scan stops.
            # A longer candidate is kept but the scan goes on, so it can
            # still be returned if no later segment gives a better one.
            # NOTE(review): nesting rebuilt from a flattened source; confirm
            # that `break` leaves the segment loop, not the replace loop.
            if court_place and len(court_place) < self.court_place_len:
                for replace_str in ktgg_conf.court_place_replace_str_list:
                    court_place = court_place.replace(replace_str, '')
                break
    return court_place
def get_plaintiff_defendant_bykeyword(self, litigants_content):
    """Assign party names to plaintiffs/defendants by their role keyword.

    The names are obtained by deleting all defendant and plaintiff
    keywords from the text and splitting the rest on
    ``self.litiants_seps``. The untouched text is queried against
    ``self.plaintiff_index`` and ``self.defendant_index``. A name counts
    as a plaintiff (defendant) when the first hit's keyword immediately
    followed by the name occurs in the untouched text. Only the first hit
    of each index is used.

    Returns:
        ``(plaintiffs, defendants, litigants)`` as comma-joined strings;
        the first two are ``""`` when neither index reports a hit.
    """
    original_text = litigants_content

    stripped = litigants_content
    for keyword in ktgg_conf.defendant_keyword_list:
        stripped = stripped.replace(keyword, "")
    for keyword in ktgg_conf.plaintiff_keyword_list:
        stripped = stripped.replace(keyword, "")
    name_list = toolsutil.my_split(stripped, self.litiants_seps)
    litigants = ','.join(name_list)

    encoded_text = toolsutil.utf8_encode(original_text)
    plaintiff_hits = self.plaintiff_index.query(encoded_text)
    defendant_hits = self.defendant_index.query(encoded_text)
    if not (defendant_hits or plaintiff_hits):
        return "", "", litigants

    plaintiff_names = []
    defendant_names = []
    for raw_name in name_list:
        name = unicode(raw_name)
        if plaintiff_hits and \
                (unicode(plaintiff_hits[0][1]) + name) in original_text:
            plaintiff_names.append(name)
        if defendant_hits and \
                (unicode(defendant_hits[0][1]) + name) in original_text:
            defendant_names.append(name)
    return ','.join(plaintiff_names), ','.join(defendant_names), litigants
def get_data_from_content(self, extract_data):
    '''Build the entity record for a notice that carries free-text content.

    Fields missing from ``extract_data`` (case cause, case id, court,
    judge, court time, court place) are parsed out of
    ``extract_data["content"]``. ``case_cause`` and ``court`` are cleaned
    up when already present. The litigants text is then located and split
    into plaintiffs and defendants. ``extract_data`` is modified in place.

    Returns the result of ``self.get_entity_data`` for the enriched record.
    '''
    src_content = extract_data.get("content")
    content = unicode(src_content).replace(" ", "")
    # The segments are taken before norm_content() rewrites the text.
    content_list = toolsutil.my_split(content, self.seps)
    content = self.norm_content(content)

    # 1. Case cause: parse it when absent, otherwise strip the u"一案"
    #    suffix from the supplied value.
    case_cause_list = []
    if not extract_data.has_key("case_cause"):
        case_cause_list = self.parser_tool.case_cause_parser.get_case_causes(
            content)
        case_cause = ','.join(case_cause_list)
        extract_data["case_cause"] = case_cause
    else:
        extract_data["case_cause"] = extract_data["case_cause"].replace(
            u"一案", "")
        case_cause_list.append(extract_data["case_cause"])

    # 2. Case id.
    if not extract_data.has_key("case_id"):
        case_id = self.parser_tool.caseid_parser.get_case_id(content)
        extract_data["case_id"] = case_id

    # 3. Court: parse it when absent, otherwise keep only the part after
    #    the last colon of the supplied value.
    if not extract_data.has_key("court"):
        extract_data["court"] = self.get_court(content)
    else:
        court_list = toolsutil.my_split(extract_data.get("court", ""),
                                        [':', ':'])
        if len(court_list) > 0:
            extract_data["court"] = court_list[-1]

    # 4. Judge: first line of the raw content matching self.judge_regex,
    #    re-joined with commas; "" when no line matches.
    if not extract_data.has_key("judge"):
        judge = ""
        src_content = unicode(src_content)
        tmp_content_list = toolsutil.my_split(src_content,
                                              ['\r\n', '\r', '\n'])
        for row_content in tmp_content_list:
            judge = toolsutil.re_findone(self.judge_regex,
                                         unicode(row_content))
            if judge:
                judge_list = toolsutil.my_split(judge, [' ', ',', ':'])
                judge = ','.join(judge_list)
                break
            else:
                judge = ""
        extract_data["judge"] = judge

    # 5. Court time: weekday names are replaced by a space.
    # NOTE(review): nesting rebuilt from a flattened source. As laid out
    # here the loop always breaks after the first segment, and no value is
    # stored when content_list is empty - confirm against the original.
    if not extract_data.has_key("court_time"):
        for row_content in content_list:
            court_time = toolsutil.re_findone(self.court_time_regex,
                                              unicode(row_content))
            if court_time:
                for week in ktgg_conf.week_day_list:
                    court_time = court_time.replace(week, " ")
            else:
                court_time = ""
            extract_data["court_time"] = court_time
            break

    # 6. Court place.
    if not extract_data.has_key("court_place"):
        extract_data["court_place"] = self.get_court_place(content)

    # 7. Litigants / plaintiffs / defendants.
    # --- Locate the litigants text: regexes are tried in order and the
    #     first segment matching the first successful regex wins.
    litigants = plaintiffs = defendants = ''
    find_flag = False
    # NOTE(review): src_content is only converted to unicode in step 4; if
    # "judge" was supplied and "content" is missing this is None.replace.
    src_content = src_content.replace(" ", "")
    tmp_content_list = toolsutil.my_split(src_content, ['\r', '\n', '。'])
    for litigant_regex in self.litigant_regex_list:
        for row_content in tmp_content_list:
            litigants = toolsutil.re_findone(litigant_regex,
                                             unicode(row_content))
            if litigants:
                find_flag = True
                break
        if find_flag:
            break
        else:
            litigants = ""
    # No regex matched: fall back to the whole normalised content.
    if litigants == "":
        litigants = content

    # --- Split the litigants text into plaintiffs and defendants.
    plaintiffs, defendants, litigants = self.get_plaintiff_defendant(
        litigants, extract_data.get("case_id", ""),
        extract_data.get("court", ""), case_cause_list)
    if litigants:
        litigant_list, litigants, plaintiff_list, defendant_list = self.format_litigants(
            plaintiffs, defendants, litigants)
    else:
        # NOTE(review): the three names are bound to the same list object.
        litigant_list = plaintiff_list = defendant_list = []

    # Replace every strip string in the stored content with a space.
    for item in self.strip_list:
        extract_data['content'] = extract_data.get('content').replace(
            item, ' ')
    return self.get_entity_data(extract_data, litigant_list, litigants,
                                plaintiff_list, defendant_list)
def get_data_from_litigants(self, extract_data):
    '''Build the entity record for a notice without free-text content.

    The court name is mapped through ``self.court_kv`` and the court
    parser, and ``court_place`` is filled from the court text when empty.
    Plaintiffs and defendants are derived from the ``litigants``,
    ``plaintiffs`` and ``defendants`` strings, or from the ``*_list``
    fields when no litigants string is available. ``extract_data`` is
    modified in place.

    Returns the result of ``self.get_entity_data`` for the enriched record.
    '''
    court = unicode(extract_data.get("court", "").strip())
    court_place = unicode(extract_data.get("court_place", "").strip())

    # Replace the court name when self.court_kv has a non-empty entry.
    new_court = self.court_kv.get(court, "")
    if new_court:
        extract_data["court"] = new_court
    court = self.parser_tool.court_parser.get_court(
        extract_data.get("court", ""))
    if not court_place:
        # No explicit location: try to read one from the court text.
        extract_data[
            "court_place"] = self.parser_tool.court_place_parser.get_court_place(
                extract_data.get("court", ""))
    if court:
        extract_data["court"] = court

    plaintiffs = extract_data.get("plaintiffs", "")
    defendants = extract_data.get("defendants", "")
    litigants = extract_data.get("litigants", "")
    if litigants == "":
        # Build "plaintiffs,defendants"; discard it when either side is
        # empty, i.e. the result starts or ends with the comma.
        # NOTE(review): nesting of this check rebuilt from a flattened
        # source - confirm it belongs inside this branch.
        litigants = plaintiffs + ',' + defendants
        if litigants.startswith(',') or litigants.endswith(','):
            litigants = ""

    if litigants:
        litigants = unicode(litigants)
        for replace_str in ktgg_conf.replace_str_list:
            litigants = litigants.replace(replace_str, "")
        tmp_list = toolsutil.my_split(litigants,
                                      [';', ';', '\r\n', '\t', '\n', '\r'])
        if len(tmp_list) == 2:
            # Exactly two groups: only the first is queried against both
            # keyword indexes to decide which side it is.
            defendants_ret = self.defendant_index.query(tmp_list[0])
            plaintiffs_ret = self.plaintiff_index.query(tmp_list[0])
            if defendants_ret:
                defendants = tmp_list[0]
                plaintiffs = tmp_list[1]
            else:
                if plaintiffs_ret:
                    defendants = tmp_list[1]
                    plaintiffs = tmp_list[0]
    else:
        # No usable litigants string: fall back to the *_list fields,
        # which may arrive as lists or as separator-delimited strings.
        # NOTE(review): this `else` is paired with `if litigants:` here;
        # the flattened source does not show which `if` it belongs to.
        plaintiff_list = extract_data.get("plaintiff_list", "")
        defendant_list = extract_data.get("defendant_list", "")
        litigant_list = extract_data.get("litigant_list", "")
        if isinstance(plaintiff_list, basestring):
            plaintiff_list = toolsutil.my_split(
                extract_data.get("plaintiff_list", ""), self.litiants_seps)
            defendant_list = toolsutil.my_split(
                extract_data.get("defendant_list", ""), self.litiants_seps)
            litigant_list = toolsutil.my_split(
                extract_data.get("litigant_list", ""), self.litiants_seps)
        plaintiffs = ','.join(plaintiff_list)
        defendants = ','.join(defendant_list)
        litigants = ','.join(litigant_list)

    litigant_list, litigants, plaintiff_list, defendant_list = self.format_litigants(
        plaintiffs, defendants, litigants)
    return self.get_entity_data(extract_data, litigant_list, litigants,
                                plaintiff_list, defendant_list)
def get_parser_data(self, content, bulletin_type):
    '''Extract plaintiffs and defendants from a court bulletin text.

    Returns a dict with the keys ``plaintiff_list``, ``defendant_list``
    and ``bulletin_type`` (the value passed in).
    '''
    plaintiff_list = []
    defendant_list = []
    norm_content = unicode(content).replace(" ", "")
    content_list = toolsutil.my_split(norm_content,
                                      [',', ',', '。', '\r\n', '\t'])
    find_flag = False

    # 1. Plaintiffs: first segment matching any plaintiff regex.
    for rowcontent in content_list:
        for plaintiff_regex in self.plaintiff_regex_list:
            ret = toolsutil.re_findone(plaintiff_regex, unicode(rowcontent))
            if ret:
                plaintiff_list = toolsutil.my_split(
                    ret, self.litiants_seps)
                find_flag = True
                break
        if find_flag:
            break

    # 2. Defendants.
    if unicode(bulletin_type) in self.bulletin_type_list:
        # NOTE(review): nesting in this branch is rebuilt from a flattened
        # source; confirm the level of the pattern loop and of the
        # plaintiff/defendant swap against the original file.
        find_flag = False
        for rowcontent in content_list:
            for defendant_regex in self.defendant_regex_list:
                ret = toolsutil.re_findone(defendant_regex,
                                           unicode(rowcontent))
                if ret:
                    if u'你' in unicode(ret):
                        # The match contains u'你', so the names are read
                        # from the first segment of the text instead.
                        defendant_list = toolsutil.my_split(
                            content_list[0], self.litiants_seps)
                    else:
                        defendant_list = toolsutil.my_split(
                            ret, self.litiants_seps)
                    if plaintiff_list == []:
                        # No plaintiff was found in step 1: the names just
                        # found are stored as plaintiffs instead.
                        plaintiff_list = defendant_list
                        defendant_list = []
                    find_flag = True
                    break
            # The configured patterns are tried on the same segment and
            # overwrite defendant_list when one matches.
            for defendant_pattern in fygg_conf.defendant_pattern_list:
                ret = toolsutil.re_find_one(defendant_pattern,
                                            unicode(rowcontent))
                if ret:
                    defendant_list = toolsutil.my_split(
                        ret, self.litiants_seps)
                    find_flag = True
                    break
            if find_flag:
                break
        plaintiff_list, defendant_list = self.format_litigant(
            plaintiff_list, defendant_list,
            fygg_conf.litigant_replace_str_list)
    else:
        # Other bulletin types: the first segment that splits into exactly
        # two parts supplies the defendants from its first part; the
        # second part is not used.
        content_list = toolsutil.my_split(norm_content,
                                          ['。', '\r\n', '\t', ','])
        for rowcontent in content_list:
            tmp_list = re.split(':|:|;', rowcontent)
            if len(tmp_list) == 2:
                defendant_list = toolsutil.my_split(
                    tmp_list[0], self.litiants_seps)
                replace_str_list = fygg_conf.defendant_keyword_list + fygg_conf.plaintiff_keyword_list
                plaintiff_list, defendant_list = self.format_litigant(
                    plaintiff_list, defendant_list,
                    replace_str_list)
                break

    info = {
        "plaintiff_list": plaintiff_list,
        "defendant_list": defendant_list,
        "bulletin_type": bulletin_type,
    }
    return info