def to_ot_judge_base(self, old):
    """
    Extract fields from a crawled judgment record ``old`` into a new
    ``ot_judge_base`` object.

    Steps visible in this block:

    1. Print ``old.url`` and, depending on the module-level ``MODLE``,
       write crawl progress to ``./keyword/id.json``.
    2. Return early (``None``) when any of these holds:
       - ``old.id`` is already in ``OLD_ID``;
       - one of ``content``, ``case_sign``, ``case_type``, ``department``
         or ``end_date`` is ``None``;
       - ``old.case_type`` is not in ``self.case_mode``;
       - ``self.check_lawyer(old)[0]`` is falsy;
       - no entry of ``self.cause_of_actions`` occurs in the title or in
         ``old.content_all``.
    3. Otherwise add ``old.id`` to ``OLD_ID``, persist the whole set to
       ``./keyword/oldid.json`` and copy/derive the fields onto ``new``.

    NOTE(review): the visible block ends right after ``dic`` is built;
    how ``new`` and ``dic`` are used afterwards is not shown here.
    """
    print old.url
    # Disabled page limit, kept for reference:
    # if self.p_type and old.id > self.pages:
    #     os._exit(0)

    # Normal mode: record the current id as "max" and keep DATAS[2] as "up".
    # presumably DATAS holds the previously stored progress values -- confirm
    # against lawyers().
    if MODLE == "CourtGovCnMergeProcesserLawyer":
        data = {"max": old.id, "up": DATAS[2]}
        id_or_name(fname="./keyword/id.json", mode="write", indata=data)
    # Update mode: the current id must not exceed DATAS[1]; otherwise the
    # whole process is terminated via os._exit(0).
    if MODLE == "CourtGovCnMergeProcesserLawyerUp":
        if DATAS[1] < old.id:
            # Message (zh): the update-mode parent id is already greater than
            # the normal-mode parent id; use non-update mode for extraction.
            print "更新模式父id已大于普通模式下的父id, 请使用非更新模式进行抽取!"
            os._exit(0)
        elif DATAS[1] >= old.id:
            data = {"max": DATAS[1], "up": old.id}
            id_or_name(
                fname="./keyword/id.json", mode="write", indata=data)
    # If this id has already been extracted, return immediately.
    if old.id in OLD_ID:
        return
    # All of these source fields are required.
    for attr in ('content', 'case_sign', 'case_type', 'department', 'end_date'):
        if getattr(old, attr) is None:
            return
    # Only judgment documents of the configured types are analysed.
    if old.case_type not in self.case_mode:
        return
    # If no lawyer matches, return immediately.
    result = self.check_lawyer(old)
    if not result[0]:
        return
    else:
        # Remember the id and rewrite the full id list to disk.
        OLD_ID.add(old.id)
        data = {"id": list(OLD_ID)}
        id_or_name(fname="./keyword/oldid.json", mode="write", indata=data)
    new = ot_judge_base()
    # Wrap every line of the text in its own <p>...</p> element.
    new.content = '<p>' + '</p><p>'.join(old.content.split('\n')) + '</p>'
    # MD5 of the UTF-8 encoded HTML content.
    new.content_md5 = md5(new.content.encode('utf8')).hexdigest()
    new.case_sign = '<p>' + \
        '</p><p>'.join(old.case_sign.split('\n')) + '</p>'
    new.case_type = old.case_type
    # Look up the id of the case type; names in TYPE_ID are compared against
    # the GBK-encoded case type. If nothing matches, case_type_id is not
    # assigned here.
    for item in TYPE_ID:
        if item["name"] == new.case_type.encode("gbk"):
            new.case_type_id = item["value"]
            break
    # Drop the last three characters of the case type.
    # presumably a three-character document-kind suffix -- confirm.
    new.type = new.case_type[:-3]
    # Arbitration is classified as civil.
    if new.type == u'仲裁':
        new.type = u'民事'
    new.case_number = old.case_number
    # The title is the first line of the full text, stripped.
    new.title = re.split("\n", old.content_all)[0].strip()
    ff = False  # set to True once a cause of action is found in the title
    title = new.title
    # First remove the document-type wording from the title.
    for item in self.case_mode:
        if item in title:
            title = title.replace(item, "")
    # Prefer a cause of action that appears in the cleaned title ...
    for ca in self.cause_of_actions:
        if ca in title:
            new.anyou = ca
            ff = True
            break
    # ... and fall back to searching the full text.
    if not ff:
        for ca in self.cause_of_actions:
            if ca in old.content_all:
                new.anyou = ca
                break
        # NOTE(review): read as the for-loop's else clause (no cause of
        # action found anywhere -> give up); confirm against the original
        # indentation.
        else:
            return
    new.anyou_id = self.cause_of_action_id[new.anyou]
    new.department = old.department
    new.chief_judge = old.chief_judge
    new.judge = old.judge
    new.acting_judges = old.acting_judges
    new.clerk = old.clerk
    # Parties are serialised as ';'-separated entries; each client entry must
    # supply three values and each lawyer entry two for the format strings.
    new.plaintiff = ';'.join(
        u"%s:%s:%s" % client for client in old.clients_attr[u'原告'])
    new.plaintiff_lawyers = ';'.join(
        u"%s:%s" % lawyer for lawyer in old.lawyers_attr[u'原告'])
    new.defendant = ';'.join(
        u"%s:%s:%s" % client for client in old.clients_attr[u'被告'])
    new.defendant_lawyers = ';'.join(
        u"%s:%s" % lawyer for lawyer in old.lawyers_attr[u'被告'])
    new.procedure = old.procedure
    # NOTE(review): presumably end_date is a date/datetime and the second
    # argument is a timezone; arrow treats a second string argument as a
    # format when the first is a string -- confirm the type of end_date.
    new.end_date = arrow.get(old.end_date, 'Asia/Shanghai').timestamp
    # Resolve the region from the GBK-encoded department name.
    area = self.area_parse.ident(new.department.encode('gbk'))
    if area:
        new.areacode = area['areano']
    new.url = old.referer
    # Drop the anonymised ("so-and-so") entries. Earlier one-expression form,
    # kept for reference:
    # new.replace_data = json.dumps(
    #     {k: v for k, v in old.replace_data.iteritems() if not re.match(ur'.*(某|X|x|\*).*', k)}
    # )
    # Keep only the replace_data entries whose key does not match the
    # pattern (the pattern looks for 某, X, x or * in the key).
    dic = {}
    for k, v in old.replace_data.iteritems():
        if not re.match(ur".*(某|X|x|\*).*", k):
            dic.update({k: v})
from pybamboo import Bamboo
import json
from flcrawl.config import DB_URL
from flcrawl.processers.judgment.lawyer import id_or_name, local_lawyers, lawyers

bamboo = Bamboo()

# Role labels used for lawyers / representatives in judgment texts.
LAWYER_TYPE = (
    u"委托代理人",
    u"代理人",
    u"指定代理人",
    u"指定委托代理人",
    u"指定辩护人",
    u"辩护人",
)

# Last dotted component of the second command-line argument.
MODLE = sys.argv[2].split(".")[-1]

# Mode-dependent data returned by lawyers(); its layout is defined there.
DATAS = lawyers(mode=MODLE)

# Ids stored under the "id" key of ./keyword/oldid.json, kept as a set.
OLD_ID = set(id_or_name(fname="./keyword/oldid.json", mode="read")["id"])


def check_case_type_id():
    """
    Query ``name`` and ``value`` from the ``ot_judge_dict`` table of the
    ``judge_center`` database on host 192.168.1.216.

    Returns the second element of the pair produced by
    ``dbtool.exec_sql`` when it is non-empty and its first entry is truthy;
    otherwise returns ``None``.
    NOTE(review): presumably that element holds the result rows -- confirm
    against ``dbtool.exec_sql``.
    """
    connection = {
        "sql_host": "192.168.1.216",
        "sql_user": "******",
        "sql_db": "judge_center",
        "sql_pass": "******",
        "sql_port": 33669,
    }
    query = "select name, value from ot_judge_dict"
    _res, rows = dbtool.exec_sql(connection, query)
    # Guard clause: nothing usable came back.
    if not rows or not rows[0]:
        return None
    return rows


# Loaded once at import time.
TYPE_ID = check_case_type_id()