def to_ot_process_error(self, old, todat):
    """Write detected judgment-data errors into an error table.

    Looks up the raw source row that *old* was parsed from, re-runs the
    four-part split and the consistency checks, and inserts one row per
    detected error into table *todat*.  (Original note: call this method
    whenever the errors need to be written into a new database.)

    :param old: analysed judgment record; ``parent_id`` points at the
        raw source row, ``url`` decides which raw table to query.
    :param todat: destination table (model) for the error rows.
    """
    print old.id, old.url
    olds = Otraw
    print "[Analysis to %s] time: %s " % (todat, time.strftime('%y-%m-%d %H-%M-%S'))
    # openlaw records live in a different raw table (Open) than the
    # court.gov.cn ones (Otraw); both are looked up by parent_id.
    if ('openlaw' in old.url):
        self.point.set_tablename(Open)
        self.point.set_filter(filter='id = %s' % old.parent_id, limit=1)
        old_data = self.point.query()
    else:
        self.point.set_tablename(olds)
        self.point.set_filter(filter='id = %s' % old.parent_id, limit=1)
        old_data = self.point.query()
    if not old_data:
        # Raw source row is gone; nothing to check.
        return
    old_data = old_data[0]
    analy = Analyse()
    analy.text = old_data.content_all
    _header, part, content, case_sign = analy.split_to_four_parts()
    #: start detecting data errors
    ero = Judgment_checking(_header, part, content, case_sign)
    #: check the cause of action (anyou)
    ero.Checking_anyou(old.anyou)
    #: check plaintiff / defendant
    ero.Checking_people(old.plaintiff, old.defendant)
    #: check plaintiff / defendant lawyers
    ero.Checking_lawyer(old.plaintiff_lawyers, old.defendant_lawyers)
    #: check the signature block
    ero.Checking_sign()
    if ero.errors:
        # return list(set(ero.errors))
        # One row per (action, error) pair; duplicates are NOT removed
        # here (see the commented-out set() above).
        for item in ero.errors:
            new = ot_process_error()
            new.judge_id = old.id
            new.action = item[0]
            new.error = item[1]
            new.user_name = 'System'
            new.addtime = str(int(time.time()))
            point = insert_database(
                'inspection', tablename=todat, editor=new)
            point.update()
    return
def to_ot_rawdata_judgement_court_gov_cn_old(self, old_data, todat): """分析方法,开始对数据进行分析 目前只更新中间库的原被告和律师律所。 其他信息并未更新。 如有其他需求!可在中间库更新至结果库时进行进一步的分析。 """ analy = Analyse() analy.text = old_data.content_all new = Otraw() _header, part, content, case_sign = analy.split_to_four_parts() clients_attr, lawyers_attr = analy.guess_clients_lawyers( part.split('\n')) clients_attr[u'原告'] = list(set(clients_attr[u'原告'])) clients_attr[u'被告'] = list(set(clients_attr[u'被告'])) lawyers_attr[u'原告'] = list(set(lawyers_attr[u'原告'])) lawyers_attr[u'被告'] = list(set(lawyers_attr[u'被告'])) new.clients_attr = clients_attr new.lawyers_attr = lawyers_attr #plaintiff = ';'.join(u"%s:%s:%s" % client for client in clients_attr[u'原告']) #defendant = ';'.join(u"%s:%s:%s" % client for client in clients_attr[u'被告']) #plaintiff_lawyers = ';'.join(u"%s:%s" % lawyer for lawyer in lawyers_attr[u'原告']) #defendant_lawyers = ';'.join(u"%s:%s" % lawyer for lawyer in lawyers_attr[u'被告']) #new.lawyer_name = '' #new.firm_name = '' lawyers = [] frim = [] for item in lawyers_attr: for x, y in lawyers_attr[item]: if x: lawyers.append(x.strip()) if y: frim.append(y.strip()) # print new.lawyer_name, new.firm_name new.lawyer_name = ",".join(lawyers) new.firm = ",".join(frim) print old_data.id, old_data.url, new.lawyer_name, new.firm new.id = old_data.id point = insert_database('Judgmentold', tablename=todat, editor=new) point.update()
def to_ot_judgment_diffed(self, old, todat):
    """Re-analyse a judgment and write the result into the result DB.

    The target can be any table, but it must match the DB
    configuration.  This method was originally used to validate the
    analysis; it is now used to update records.

    :param old: previously analysed judgment row; ``parent_id`` points
        at the raw source row, ``url`` selects the raw table.
    :param todat: kept for the common call signature; the write below
        always targets ``ot_judge_base``.
    """
    # if old.status in ['-1', '3', '1'] or (not old.url):
    #     return
    print old.id, old.url
    olds = Otraw
    #diff = ot_judgment_diffed()
    print "[Analysis to %s] time: %s " % (todat, time.strftime('%y-%m-%d %H-%M-%S'))
    # openlaw rows come from the Open raw table, everything else from
    # Otraw; both are fetched by parent_id.
    if ('openlaw' in old.url):
        self.point.set_tablename(Open)
        self.point.set_filter(filter='id = %s' % old.parent_id, limit=1)
    else:
        self.point.set_tablename(olds)
        self.point.set_filter(filter='id = %s' % old.parent_id, limit=1)
    old_data = self.point.query()
    if not old_data:
        return
    old_data = old_data[0]
    analysis_data = self.fuzzy_analyse(old_data)
    # Dead code kept from the validation era (bare string literal used
    # as a block comment; see its first line for the original intent).
    """将此处代码注释去掉,可对验证正确性库进行写入数据
    new_plain_lawyer = {'Success': old.plaintiff_lawyers,
                        'old': plaintiff_lawyers,
                        'new': analysis_data[1][1]
                        }
    new_defen_lawyer = {'Success': old.defendant_lawyers,
                        'old': defendant_lawyers,
                        'new': analysis_data[2][1]
                        }
    plaintiff = ';'.join(u"%s:%s:%s" % client for client in old_data.clients_attr[u'原告'])
    defendant = ';'.join(u"%s:%s:%s" % client for client in old_data.clients_attr[u'被告'])
    new_plain_people = {'Success': old.plaintiff,
                        'old': plaintiff,
                        'new': analysis_data[1][0]
                        }
    new_defen_people = {'Success': old.defendant,
                        'old': defendant,
                        'new': analysis_data[2][0]
                        }
    diff.case_sign = analysis_data[0][3]
    diff.header = analysis_data[0][0]
    diff.content = analysis_data[0][2]
    diff.firsthead = analysis_data[0][1]
    diff.defendant = json.dumps(new_defen_people)
    diff.defendant_lawyers = json.dumps(new_defen_lawyer)
    diff.plaintiff = json.dumps(new_plain_people)
    diff.plaintiff_lawyers = json.dumps(new_plain_lawyer)
    diff.judgmentid = old.id
    diff.url = old.url
    point = insert_database('Judgment', tablename = todat, editor = diff) #设置添加数据
    point.insert() #添加数据
    """
    Update = ot_judge_base()
    # if (not old.chief_judge and not old.judge and not old.acting_judges ) or \
    #         (u'事务所' not in analysis_data[1][1] and u'事务所' not in analysis_data[2][1]):
    #     #: delete the record
    #     print 'Delete new Analy %s' % old.id
    #     point = insert_database('Judgment', tablename = ot_judge_base, filter = 'id = %s' % old.id)
    #     point.delete()
    #     pass
    #: refresh CASE_SIGN: re-wrap the signature block as <p> HTML
    case_sign = analysis_data[0][3].split('\n')
    Update.case_sign = '<p>' + '</p><p>'.join(case_sign) + '</p>'
    # else:
    Pules = {}
    #: refresh the cause of action (anyou)
    anyou = []
    if old.type == u'行政':
        #: administrative cases: check the title first
        anyou.extend(filter(lambda x: x in analysis_data[0][0], ACTIONS))
        if not anyou:
            #: then fall back to the first content line
            anyou.extend(filter(
                lambda x: x in "".join(analysis_data[0][2].split('\n')[:1]),
                ACTIONS))
    else:
        anyou.extend(filter(
            lambda x: x in "".join(analysis_data[0][2].split('\n')[:1]),
            ACTIONS))
        # if not anyou:
        anyou.extend(filter(lambda x: x in analysis_data[0][0], ACTIONS))
    # Longest matching candidate wins (keyed by string length).
    for item in anyou:
        Pules[len(item)] = item
    if Pules:
        anyou = Pules[max(Pules)]
    if anyou:
        Update.anyou_id = ACTION_ID[anyou.strip()]
        if ANYOU_ALIAS.has_key(anyou):
            Update.anyou = ANYOU_ALIAS[anyou]
        else:
            Update.anyou = anyou
    else:
        # No cause of action matched.
        Update.anyou = ''
        Update.anyou_id = 0
    # print Update.anyou_id
    # Area lookup: first try the court name with county/region
    # qualifiers stripped, then retry with a coarser normalisation.
    area_item = AREA.ident(old.department.replace(u'县', '').replace(u'自治区', '')
                           .replace(u'管城回族区', '管城回区').encode('gbk'))
    if area_item:
        Update.areacode = area_item['areano']
    else:
        area_item = AREA.ident(
            old.department.replace(u'市', '').replace(u'区', '').replace(u'省', '').encode('gbk'))
        if area_item:
            Update.areacode = area_item['areano']
    # print 'Update new Analysis'
    Update.defendant = analysis_data[2][0]
    Update.defendant_lawyers = analysis_data[2][1]
    Update.plaintiff = analysis_data[1][0]
    Update.plaintiff_lawyers = analysis_data[1][1]
    Update.id = old.id
    #: refresh the judge-panel information
    case_sign_key = analysis_data[3]
    Update.chief_judge = ",".join(case_sign_key[u'审判长'])
    Update.acting_judges = ",".join(case_sign_key[u'代理审判员'])
    Update.judge = ",".join(case_sign_key[u'审判员'])
    Update.clerk = ",".join(list(set(case_sign_key[u'书记员'])))
    head_key = analysis_data[4]
    Update.department = head_key['department']
    Update.case_number = head_key['case_number']
    Update.type = head_key['type']
    Update.title = head_key['title']
    #: parse the judgment (closing) date
    Update.end_date = arrow.get(
        analysis_data[6], 'Asia/Shanghai').timestamp
    #: sensitive / replaced strings
    Update.replace_data = analysis_data[5]
    #: sensitive-word check (disabled):
    #: for item in REPLACE:
    #:     if item in old_data.content_all:
    #:         Update.replace_data[item] = '****'
    Update.replace_data = json.dumps(Update.replace_data)
    #: start verifying data correctness
    ero = JudgmentCheck.Judgment_checking(
        analysis_data[0][0], analysis_data[0][1],
        analysis_data[0][2], analysis_data[0][3])
    #: verify the cause of action
    ero.Checking_anyou(Update.anyou, old.type)
    #: verify the area
    ero.Checking_area(Update.areacode, old.department)
    #: verify plaintiff / defendant
    ero.Checking_people(Update.plaintiff, Update.defendant)
    #: verify plaintiff / defendant lawyers
    ero.Checking_lawyer(Update.plaintiff_lawyers, Update.defendant_lawyers)
    #: verify the signature block.  A detailed signature check is also
    #: possible, e.g.:
    #:     keys = {'judge': old.judge}
    #:     ero.Checking_sign(keys)
    ero.Checking_sign()
    #: verify the adjudicating court
    ero.Checking_department(old.department.decode('utf8'))
    if not ero.errors:
        # All checks passed: fields that were empty (or the literal
        # u'无') on the old row and are still empty on the new one get
        # the explicit u'无' placeholder.
        for attr in old.__dict__.keys():
            if not getattr(old, attr) or str(getattr(old, attr)).strip() == u'无':
                if (not getattr(Update, attr)) or str(getattr(Update, attr)).strip() == '':
                    setattr(Update, attr, u'无')
        Update.come_from = 'Update_Judgment_Checking_Success'
        Update.base_check = 1
        Update.status = 2
    else:
        Update.come_from = 'Update_Judgment_Checking_Faild'
        Update.status = 0
        Update.base_check = 0
    point = insert_database(
        'Judgment', tablename=ot_judge_base, editor=Update)
    code = point.update()
    if code == 1062:
        #: MySQL error 1062: duplicate key — drop the duplicate row
        print "Delte From id = %s, table = %s" % (Update.id, ot_judge_base.__name__)
        point.set_filter('id = %s' % old.id)
        point.delete()
    return
def to_ot_rawdata_judgement_court_gov_cn_old(self, old, todat): new = ot_rawdata_judgement_court_gov_cn_old() new.url = old.url new.referer = old.url analy = Analyse() try: raw_html = XPath(old.source_data).execute( '//*[@id="ws"]/table')[0].to_html() except IndexError: print '[Error] Analyse: url = %s' % new.url # Request Get for item in PROXY: r = requests.get( new.url, proxies={'http': 'http:%s:59274' % item}, timeout=30) if r.ok: break if not r.ok: raise Exception,\ 'Get faild url = %s' % old.url to = old.__class__() to.id = old.id to.source_data = r.text raw_html = XPath(to.source_data).execute( '//*[@id="ws"]/table')[0].to_html() point = insert_database( 'Judgment', tablename=to.__class__, editor=new) point.update() return text = html_to_text(HTML_PARSER.unescape(raw_html)) try: text = re.sub('//W3C//DTD HTML 4.0 Transitional//EN\'>', '', text) except: pass analy.text = text new.content_all = analy.text _header, part, content, case_sign = analy.split_to_four_parts() new.clients_attr, new.lawyers_attr = analy.guess_clients_lawyers( part.split('\n')) end_date = analy.guess_end_date(case_sign) new.end_date = end_date case_sign_key = analy.guess_case_sign(case_sign.split('\n')) head_key = analy.guess_header_types(_header.split('\n')) new.content = part + content new.case_sign = case_sign new.case_number = head_key['case_number'] new.department = head_key['department'] new.type = head_key['type'] new.title = head_key['title'] new.case_type = head_key['case_type'] new.procedure = new.procedure or analy.guess_procedure(new.case_number) new.replace_data = json.dumps(analy._replace_data(part)) new.chief_judge = ",".join(case_sign_key[u'审判长']) new.acting_judges = ",".join(case_sign_key[u'代理审判员']) new.judge = ",".join(case_sign_key[u'审判员']) new.clerk = ",".join(list(set(case_sign_key[u'书记员']))) new.input_time = arrow.now().timestamp # if (not new.chief_judge and not new.judge and not new.acting_judges.strip()) or \ # (u'事务所' not in new.plaintiff_lawyers and u'事务所' not 
in new.defendant_lawyers): # return new.parent_id = old.id print 'Runing String <ot_rawdata_judgement_court_gov_cn_old> parent_id = %s , url = %s' % (old.id, old.url) point = insert_database( 'Judgment', tablename=ot_rawdata_judgement_court_gov_cn_old, editor=new) point.insert()
def insert_base(old, Update_id=0): area = Area_duct() actions_id, actions, replace, anyou_alias = anyou_replace("Judgment") new = ot_judge_base() for attr in ("content", "case_sign", "case_type", "department", "end_date"): if getattr(old, attr) is None: print >> __output__, u"【提示】%s 字段为空,请检查数据" % attr return # if old.case_type not in self.case_mode: # return for mode in case_mode: if mode in old.case_type: new.case_type = mode if not new.case_type: print >> __output__, u"【提示】文书字号为空" return if old.content == "": print >> __output__, "u你所访问的数据为空" new.content = "<p>" + "</p><p>".join(old.content.split("\n")) + "</p>" new.content_md5 = md5(new.content.encode("utf8")).hexdigest() new.case_sign = "<p>" + "</p><p>".join(old.case_sign.split("\n")) + "</p>" new.case_number = old.case_number new.type = new.case_type[:-3] #: 如果是仲裁,那属于民事 if new.type == u"仲裁": new.type = u"民事" new.title = old.title if not new.title: new.title = old.content_all.split("\n")[0] Pules = {} #: 更新案由信息 if new.type == u"行政": anyou = filter(lambda x: x in new.title, actions) if not anyou: anyou = filter(lambda x: x in old.content.split("\n")[0], actions) if anyou: anyou = anyou[0] else: for item in anyou: Pules[len(item)] = item else: anyou = filter(lambda x: x in old.content.split("\n")[0], actions) if not anyou: anyou = filter(lambda x: x in new.title, actions) for item in anyou: Pules[len(item)] = item else: anyou = anyou[0] if Pules: anyou = Pules[max(Pules)] new.anyou_id = actions_id[anyou.strip()] new.anyou = anyou # new.anyou_id = actions_id[new.anyou] new.department = old.department new.chief_judge = old.chief_judge new.judge = old.judge if old.acting_judges: new.acting_judges = old.acting_judges else: new.acting_judges = "无" new.clerk = old.clerk new.plaintiff = ";".join(u"%s:%s:%s" % client for client in old.clients_attr[u"原告"]) new.plaintiff_lawyers = ";".join(u"%s:%s" % lawyer for lawyer in old.lawyers_attr[u"原告"]) print >> __plai_people__, u"原告:%s, 律师:%s" % (new.plaintiff, 
new.plaintiff_lawyers) new.defendant = ";".join(u"%s:%s:%s" % client for client in old.clients_attr[u"被告"]) new.defendant_lawyers = ";".join(u"%s:%s" % lawyer for lawyer in old.lawyers_attr[u"被告"]) print >> __defan_people__, u"被告:%s, 律师:%s" % (new.defendant, new.defendant_lawyers) new.procedure = old.procedure new.end_date = arrow.get(old.end_date, "Asia/Shanghai").timestamp # 分析地区 area_item = area.ident(new.department.encode("gbk")) if area_item and area_item.get("staut") == "timed out": new.areacode = area_item["areano"] new.url = old.referer print >> __infomation__, old.case_sign new.replace_data = json.dumps(old.replace_data) # dic = {} # for k, v in old.replace_data.iteritems(): # if not re.match(ur".*(某|X|x|\*).*", k): # dic.update({k:v}) # new.replace_data = json.dumps(dic) new.input_time = arrow.now().timestamp if (not new.chief_judge and not new.judge and not new.acting_judges.strip()) or ( u"事务所" not in new.plaintiff_lawyers and u"事务所" not in new.defendant_lawyers ): print >> __output__, u"不存在事务所或者署名信息" return print old.id if Update_id != 0: new.id = Update_id new.url = old.url point = insert_database("Judgment", tablename=ot_judge_base, editor=new) point.update() else: point = insert_database("Judgment", tablename=ot_judge_base, editor=new) point.insert()
def web_port(case_number="", title="", depart="", datasource=None):
    """WEB API POST handler.

    POST: http://192.168.1.118/api/v1.0/judgment/
        case_number => case_number,
        title => title,
        depart => department,
        datasource => content

    Note: only ``datasource`` is used below; the other parameters are
    accepted for the API signature but ignored in this body.

    Returns:
        JSON(
            {
            "result": {
                "People": {
                    "Plain": "",
                    "defen": ""
                },
                "danger": "",
                "error": "Success",
                "sign": "",
                "status": 1
            }
        })
        Implicitly returns None when ``datasource`` is empty.
    """
    if datasource:
        new = test_old()
        analy = JudgmentProcesser()
        # Populates ``new`` in place from the raw document text.
        analy.fuzzy_analyse(new, datasource)
        try:
            # De-duplicate parties and lawyers for both sides, then
            # flatten them into the ';'-joined "a:b:c" string format.
            new.clients_attr[u"原告"] = list(set(new.clients_attr[u"原告"]))
            new.clients_attr[u"被告"] = list(set(new.clients_attr[u"被告"]))
            new.lawyers_attr[u"原告"] = list(set(new.lawyers_attr[u"原告"]))
            new.lawyers_attr[u"被告"] = list(set(new.lawyers_attr[u"被告"]))
            plaintiff = ";".join(u"%s:%s:%s" % client for client in new.clients_attr[u"原告"])
            defendant = ";".join(u"%s:%s:%s" % client for client in new.clients_attr[u"被告"])
            plaintiff_lawyers = ";".join(u"%s:%s" % lawyer for lawyer in new.lawyers_attr[u"原告"])
            defendant_lawyers = ";".join(u"%s:%s" % lawyer for lawyer in new.lawyers_attr[u"被告"])
        except:
            # NOTE(review): bare except — any analysis failure is
            # reported by echoing the raw input back as the error.
            return {"error": datasource}
        point = insert_database("Sqlextend", tablename=test_old, editor=new)  # stage the row
        try:
            point.insert()  # persist the analysed record
        except Exception, e:
            return {"error": e}
        print plaintiff
        print defendant
        print plaintiff_lawyers
        print defendant_lawyers
        return {
            "People": {u"原告": plaintiff, u"被告": defendant},
            "lawyers": {u"原告": plaintiff_lawyers, u"被告": defendant_lawyers},
            "case_number": new.case_number,
            "title": new.title,
            "sign": new.case_sign,
            "type": new.case_type,
            "error": 0,
        }
def to_ot_judgment_inspection(self, old, todat):
    """Scan a judgment's opening lines for parties, lawyers and firms,
    plus the signature block, and insert an inspection row.

    :param old: judgment record with ``<p>``-wrapped ``content`` and
        ``case_sign`` fields.
    :param todat: destination table (model) for the inspection row.
    """
    new = ot_judge_inspection()
    new.judgmentid = old.id
    # NOTE(review): ``people`` is never used below.
    people = [u'原告', u'被告', u'原告律师', u'被告律师', u'委托代理人', u'',
              u'上诉人', u'被上诉人']
    # Collect the leading paragraphs up to (and including) the first
    # line that marks the start of the case narrative.
    content = []
    for item in old.content.replace("<p>", '').split("</p>"):
        content.append(item)
        if (u'一案' in item) or (u'诉状' in item) or (u'提起公诉' in item) or \
                (u'检察院指控' in item) or (u'立案执行' in item) or (u'提起诉讼' in item) or (u'提起上诉' in item) \
                or (u'诉至本院' in item) or (u'本院受理' in item):
            break
    lawyer = {u'plai': [], u'defen': []}          # "name:firm" strings per side
    lawyer_per = {u'plai': [], u'defen': []}      # party names per side
    frims = []                                    # all firm names seen
    # State flags: which side the most recent party line introduced, so
    # a following lawyer line can be attributed to it.
    checking = {u'原告': 0, u'被告': 0}
    for item in content:
        # if re.match(u'代理人|辩护人|律师|律所|顾问', item):
        if u'代理人' in item or u'辩护人' in item or u'律师' in item:
            # Lawyer line: attribute to whichever side was last opened.
            frim = self._check_lawyer_in_line(item)
            if checking[u'原告'] == 1:
                for firm in frim:
                    lawyer['plai'].append(firm[0] + ':' + firm[1])
                    frims.append(firm[1])
                checking[u'原告'] = 0
            elif checking[u'被告'] == 1:
                for firm in frim:
                    lawyer['defen'].append(firm[0] + ':' + firm[1])
                    frims.append(firm[1])
                checking[u'被告'] = 0
        elif re.search(u'^原告|^上诉人|^申诉人|起诉人|^公诉机关|申请再审人|再审申请人|^申请人|^第.原告|^原告.:', item):
            # Plaintiff-side party line.
            checking[u'原告'] = 1
            law = self._check_client_in_line(item)
            if type(law) is list:
                for pl in law:
                    lawyer_per['plai'].append(pl[0])
            else:
                lawyer_per['plai'].append(law[0])
                # An institutional party also counts as a firm name.
                if law[1] == u'机构':
                    frims.append(law[0])
        elif re.search(u'被上诉人|被申诉人|被起诉人|^被告|被申请人|^原公诉机关|^第.被告|^被告.:', item):
            # Defendant-side party line.
            checking[u'被告'] = 1
            law = self._check_client_in_line(item)
            if type(law) is list:
                for pl in law:
                    lawyer_per['defen'].append(pl[0])
            else:
                lawyer_per['defen'].append(law[0])
                if law[1] == u'机构':
                    frims.append(law[0])
    # De-duplicate everything collected above.
    lawyer_per['plai'] = list(set(lawyer_per['plai']))
    lawyer_per['defen'] = list(set(lawyer_per['defen']))
    lawyer['plai'] = list(set(lawyer['plai']))
    lawyer['defen'] = list(set(lawyer['defen']))
    frims = list(set(frims))
    new.people = json.dumps(lawyer_per)
    new.lawyers = json.dumps(lawyer)
    new.frims = json.dumps(frims)
    # Analyse the signature block: keep only the names, with the role
    # prefixes (judge/clerk titles) stripped off.
    #new.case_sign = old.case_sign
    sp_people = [u'审判员', u'审判长', u'人民陪审员', u'书记员', u'代理审判员', u'代书记员',
                 u'见习书记员', u'代理书记员']
    #people_key = dict(zip(sp_people, ['', '', '', '', '', '', '']))
    sp_all = []
    case_sign = old.case_sign.replace('<p>', '').split('</p>')
    for item in case_sign:
        for pl in sp_people:
            if item.startswith(pl):
                #people_key[pl] = item.decode('utf8').replace(pl,'')
                sp_all.append(item.replace(pl, ''))
    new.case_sign = ",".join(sp_all)
    new.firm = ",".join(frims)
    point = insert_database(
        'Judgment', tablename=todat, editor=new)  # stage the row
    point.insert()  # persist it
    return
def Querst_request(self):
    """Search runner: for every key in ``self.keys``, page through 20
    Baidu result pages in the driven Firefox instance and store each
    result's title/url as an ``ot_baidu_search_info`` row.
    """
    point = insert_database('Sqlextend', tablename=ot_baidu_search_info)
    for key in self.keys:
        # self.firefox.get(self.url)
        for i in range(0, 20):
            # Build the paginated search URL for this key/page.
            self.set_pn(key, i)
            # Retry the page load until it succeeds, resetting the
            # browser on every failure (bare except: any driver error).
            while True:
                try:
                    self.firefox.get(self.url)
                    break
                except:
                    # self.firefox.quit()
                    self.reset_firefox()
                    continue
            data = self.firefox.page_source
            if data:
                xhtml = html.document_fromstring(data)
                # Pair each result's tool widget (carries the title in
                # its data-tools JSON) with its green source span (URL).
                content = zip(xhtml.xpath('//div[@id="content_left"]//div[@class="f13"]//div[@class="c-tools"]'),
                              xhtml.xpath('//div[@id="content_left"]//div[@class="f13"]//span[@class="g"]'))
                for title, url in content:
                    db = ot_baidu_search_info()
                    try:
                        # Preferred path: parse the data-tools JSON.
                        db.title = json.loads(
                            title.get('data-tools'))['title'].encode('utf8')
                    except:
                        try:
                            # Fallback: crude string slicing of the
                            # same attribute when it isn't valid JSON.
                            db.title = title.get(
                                'data-tools').split(':')[1].split(',')[0].replace('"', '').encode('utf8')
                        except IndexError:
                            # Leave title unset when even that fails.
                            pass
                    db.url = url.text_content().encode('utf8')
                    db.key = key
                    insert_database(
                        'Sqlextend', tablename=ot_baidu_search_info, editor=db)
                    point.set_value(db)
                    point.insert()
                """
                for item in xhtml.xpath('//div[@id="content_left"]//div[@class="f13"]'):
                    #print item.get('href'), item.text_content().encode('utf8')
                    db = ot_baidu_search_info()
                    import pdb
                    pdb.set_trace()
                    db.title = item.xpath('//h3//a')[0].title
                    db.url = item.xpath('//span[@class="g"]')[0].text_content
                    #db.url = item.get('href')
                    db.key = key
                    insert_database('Sqlextend', tablename = ot_baidu_search_info, editor = db)
                    point.set_value(db)
                    point.insert()
                """
            # Throttle between result pages.
            time.sleep(2)
    self.firefox.close()