def fuzzy_analyse(self, old_data):
    """Analyse a raw judgement record and extract its structured parts.

    Splits ``old_data.content_all`` into header / party / body / signature
    sections, then derives the parties, their lawyers, the court-sign
    metadata and the closing date.

    Returns a list::

        [(header, part, content, case_sign),
         (plaintiff, plaintiff_lawyers),
         (defendant, defendant_lawyers),
         case_sign_key, head_key, replace_data, end_time]
    """
    parser = Analyse()
    parser.text = old_data.content_all
    _header, part, content, case_sign = parser.split_to_four_parts()

    # A header with fewer than four lines is considered truncated:
    # rebuild it from the first six raw lines and strip label prefixes.
    if len(_header.split('\n')) < 4:
        _header = "\n".join(parser.text_in_lines[0:6])
        _header = re.sub(u'日期:|法院:|案号:', '', _header)

    clients_attr, lawyers_attr = parser.guess_clients_lawyers(
        part.split('\n'))
    case_sign_key = parser.guess_case_sign(case_sign.split('\n'))
    head_key = parser.guess_header_types(_header.split('\n'))

    # Drop duplicate entries for both sides, parties and lawyers alike.
    for side in (u'原告', u'被告'):
        clients_attr[side] = list(set(clients_attr[side]))
        lawyers_attr[side] = list(set(lawyers_attr[side]))

    end_time = parser.guess_end_date(case_sign)
    replace_data = parser._replace_data(part)

    # Render each side as a ';'-separated string (empty when no entries).
    plaintiff = (
        ';'.join(u"%s:%s:%s" % entry for entry in clients_attr[u'原告'])
        if clients_attr[u'原告'] else '')
    defendant = (
        ';'.join(u"%s:%s:%s" % entry for entry in clients_attr[u'被告'])
        if clients_attr[u'被告'] else '')
    plaintiff_lawyers = (
        ';'.join(u"%s:%s" % entry for entry in lawyers_attr[u'原告'])
        if lawyers_attr[u'原告'] else '')
    defendant_lawyers = (
        ';'.join(u"%s:%s" % entry for entry in lawyers_attr[u'被告'])
        if lawyers_attr[u'被告'] else '')

    return [(_header, part, content, case_sign),
            (plaintiff, plaintiff_lawyers),
            (defendant, defendant_lawyers),
            case_sign_key, head_key, replace_data, end_time]
def to_ot_rawdata_judgement_court_gov_cn_old(self, old, todat):
    """Convert a crawled record *old* into an
    ``ot_rawdata_judgement_court_gov_cn_old`` row and insert it.

    The judgement body is extracted from ``old.source_data`` via the
    ``//*[@id="ws"]/table`` XPath. If that table is missing, the page is
    re-fetched through each entry of ``PROXY`` in turn, the refreshed
    source is stored via ``insert_database(...).update()`` and the method
    returns early without building the analysed row.

    ``todat`` is accepted for interface compatibility but unused here.

    Raises:
        Exception: when no proxy yields a successful re-fetch.
    """
    new = ot_rawdata_judgement_court_gov_cn_old()
    new.url = old.url
    new.referer = old.url
    analy = Analyse()
    try:
        raw_html = XPath(old.source_data).execute(
            '//*[@id="ws"]/table')[0].to_html()
    except IndexError:
        # Stored page lacks the judgement table: re-fetch through the
        # proxy list until one proxy responds successfully.
        print('[Error] Analyse: url = %s' % new.url)
        # BUGFIX: initialise r so an empty PROXY list cannot leave it
        # unbound (NameError) at the check below.
        r = None
        for item in PROXY:
            # BUGFIX: proxy URL was 'http:%s:59274' — missing '//'.
            r = requests.get(
                new.url,
                proxies={'http': 'http://%s:59274' % item},
                timeout=30)
            if r.ok:
                break
        if r is None or not r.ok:
            # Py2-only "raise Exception, msg" rewritten portably.
            raise Exception('Get faild url = %s' % old.url)
        to = old.__class__()
        to.id = old.id
        to.source_data = r.text
        # Re-extract to validate the refreshed page; raises IndexError
        # again (unhandled) if the table is still missing.
        raw_html = XPath(to.source_data).execute(
            '//*[@id="ws"]/table')[0].to_html()
        point = insert_database(
            'Judgment', tablename=to.__class__, editor=new)
        point.update()
        # NOTE(review): returns without analysing the refreshed page —
        # presumably a later pass re-processes it; confirm intent.
        return
    text = html_to_text(HTML_PARSER.unescape(raw_html))
    try:
        # Best-effort removal of a stray DOCTYPE fragment; narrowed from
        # a bare except so KeyboardInterrupt/SystemExit pass through.
        text = re.sub('//W3C//DTD HTML 4.0 Transitional//EN\'>', '', text)
    except Exception:
        pass
    analy.text = text
    new.content_all = analy.text
    _header, part, content, case_sign = analy.split_to_four_parts()
    new.clients_attr, new.lawyers_attr = analy.guess_clients_lawyers(
        part.split('\n'))
    new.end_date = analy.guess_end_date(case_sign)
    case_sign_key = analy.guess_case_sign(case_sign.split('\n'))
    head_key = analy.guess_header_types(_header.split('\n'))
    new.content = part + content
    new.case_sign = case_sign
    new.case_number = head_key['case_number']
    new.department = head_key['department']
    new.type = head_key['type']
    new.title = head_key['title']
    new.case_type = head_key['case_type']
    new.procedure = new.procedure or analy.guess_procedure(new.case_number)
    new.replace_data = json.dumps(analy._replace_data(part))
    # Judges / clerk lists are joined into comma-separated columns;
    # clerks are additionally deduplicated.
    new.chief_judge = ",".join(case_sign_key[u'审判长'])
    new.acting_judges = ",".join(case_sign_key[u'代理审判员'])
    new.judge = ",".join(case_sign_key[u'审判员'])
    new.clerk = ",".join(list(set(case_sign_key[u'书记员'])))
    new.input_time = arrow.now().timestamp
    new.parent_id = old.id
    print('Runing String <ot_rawdata_judgement_court_gov_cn_old> '
          'parent_id = %s , url = %s' % (old.id, old.url))
    point = insert_database(
        'Judgment', tablename=ot_rawdata_judgement_court_gov_cn_old,
        editor=new)
    point.insert()