示例#1
0
    def to_ot_judge_base(self, old):
        """Extract fields from a crawled judgment record into an ``ot_judge_base`` object.

        Steps visible in this method:

        1. Print ``old.url`` and, depending on the module-level ``MODLE``,
           write the current id bookkeeping to ``./keyword/id.json``.
        2. Bail out (bare ``return``) when the record was already processed,
           lacks a required attribute, has a case type outside
           ``self.case_mode``, or ``self.check_lawyer`` reports no match.
        3. Record ``old.id`` in ``OLD_ID`` and rewrite ``./keyword/oldid.json``.
        4. Copy / derive the fields of a new ``ot_judge_base`` instance.

        Args:
            old: source record; this method reads ``id``, ``url``, ``content``,
                ``content_all``, ``case_sign``, ``case_type``, ``case_number``,
                ``department``, ``end_date``, ``chief_judge``, ``judge``,
                ``acting_judges``, ``clerk``, ``clients_attr``, ``lawyers_attr``,
                ``procedure``, ``referer`` and ``replace_data`` from it.

        Returns:
            ``None`` on every early exit shown here.
            NOTE(review): the end of this method is outside the visible
            source, so the value returned on success cannot be confirmed.
        """

        print old.url

        # if self.p_type and old.id > self.pages:
        #    os._exit(0)

        # Normal mode: persist the current id as "max" and keep DATAS[2] as "up".
        if MODLE == "CourtGovCnMergeProcesserLawyer":
            data = {"max": old.id, "up": DATAS[2]}
            id_or_name(fname="./keyword/id.json", mode="write", indata=data)

        # Update mode: terminate the whole process once old.id exceeds DATAS[1];
        # otherwise persist DATAS[1] as "max" and the current id as "up".
        if MODLE == "CourtGovCnMergeProcesserLawyerUp":
            if DATAS[1] < old.id:
                print "更新模式父id已大于普通模式下的父id, 请使用非更新模式进行抽取!"
                os._exit(0)
            elif DATAS[1] >= old.id:
                data = {"max": DATAS[1], "up": old.id}
                id_or_name(
                    fname="./keyword/id.json", mode="write", indata=data)

        # If this id has already been extracted, return immediately.
        if old.id in OLD_ID:
            return

        # Every one of these attributes must be present on the source record.
        for attr in ('content', 'case_sign', 'case_type', 'department', 'end_date'):
            if getattr(old, attr) is None:
                return

        # Only judgment documents of the configured types are analysed.
        if old.case_type not in self.case_mode:
            return

        # If no lawyer matched, return immediately; otherwise remember the id
        # and rewrite the processed-id file with the full set.
        result = self.check_lawyer(old)
        if not result[0]:
            return
        else:
            OLD_ID.add(old.id)
            data = {"id": list(OLD_ID)}
            id_or_name(fname="./keyword/oldid.json", mode="write", indata=data)

        new = ot_judge_base()

        # Wrap each newline-separated line of the text in <p>...</p>.
        new.content = '<p>' + '</p><p>'.join(old.content.split('\n')) + '</p>'
        new.content_md5 = md5(new.content.encode('utf8')).hexdigest()
        new.case_sign = '<p>' + \
            '</p><p>'.join(old.case_sign.split('\n')) + '</p>'

        new.case_type = old.case_type

        # Look up the id of the case type in TYPE_ID; names are compared
        # against the GBK-encoded case type.
        # NOTE(review): presumably TYPE_ID names are GBK byte strings -- confirm.
        for item in TYPE_ID:
            if item["name"] == new.case_type.encode("gbk"):
                new.case_type_id = item["value"]
                break

        # The type is the case type minus its last three characters.
        new.type = new.case_type[:-3]

        # Arbitration is filed under civil cases.
        if new.type == u'仲裁':
            new.type = u'民事'

        new.case_number = old.case_number

        # The title is the first line of the full text, stripped.
        new.title = re.split("\n", old.content_all)[0].strip()

        ff = False  # set once a cause of action is found in the title
        title = new.title
        # First remove the judgment-document type wording from the title.
        for item in self.case_mode:
            if item in title:
                title = title.replace(item, "")
        for ca in self.cause_of_actions:
            if ca in title:
                new.anyou = ca
                ff = True
                break
        if not ff:
            # Fall back to searching the full text for a cause of action.
            # NOTE(review): the ``else`` below pairs with the ``if``, not the
            # ``for``, so the method returns as soon as the FIRST cause of
            # action is absent from the full text and later entries are never
            # tried. A ``for``/``else`` was probably intended -- confirm.
            for ca in self.cause_of_actions:
                if ca in old.content_all:
                    new.anyou = ca
                    break
                else:
                    return
        new.anyou_id = self.cause_of_action_id[new.anyou]

        new.department = old.department

        new.chief_judge = old.chief_judge

        new.judge = old.judge

        new.acting_judges = old.acting_judges

        new.clerk = old.clerk

        # Parties are serialised as ';'-joined entries: three ':'-separated
        # fields per client and two per lawyer, for plaintiff and defendant.
        new.plaintiff = ';'.join(
            u"%s:%s:%s" % client for client in old.clients_attr[u'原告'])
        new.plaintiff_lawyers = ';'.join(
            u"%s:%s" % lawyer for lawyer in old.lawyers_attr[u'原告'])

        new.defendant = ';'.join(
            u"%s:%s:%s" % client for client in old.clients_attr[u'被告'])
        new.defendant_lawyers = ';'.join(
            u"%s:%s" % lawyer for lawyer in old.lawyers_attr[u'被告'])

        new.procedure = old.procedure

        new.end_date = arrow.get(old.end_date, 'Asia/Shanghai').timestamp

        # Resolve the region from the GBK-encoded department name; areacode is
        # only set when the parser returns a truthy result.
        area = self.area_parse.ident(new.department.encode('gbk'))
        if area:
            new.areacode = area['areano']

        new.url = old.referer

        # Keep only the replace_data entries whose key does NOT match the
        # placeholder pattern below.
        # NOTE(review): ``dic`` is not used within the visible part of this
        # method; its consumer is presumably in the lines that follow.
        dic = {}
        for k, v in old.replace_data.iteritems():
            if not re.match(ur".*(某|X|x|\*).*", k):
                dic.update({k: v})
示例#2
0
from pybamboo import Bamboo

import json
from flcrawl.config import DB_URL

from flcrawl.processers.judgment.lawyer import id_or_name,  local_lawyers, lawyers


# Module-wide Bamboo instance.
bamboo = Bamboo()

# Role labels recognised as lawyers (agent / appointed agent / defender variants).
LAWYER_TYPE = (
    u"委托代理人",
    u"代理人",
    u"指定代理人",
    u"指定委托代理人",
    u"指定辩护人",
    u"辩护人",
)

# Run mode: the last dot-separated component of the second command-line argument.
MODLE = sys.argv[2].rsplit(".", 1)[-1]

# Data returned by lawyers() for the selected mode.
DATAS = lawyers(mode=MODLE)

# Ids already processed, loaded from disk once and kept as a set.
OLD_ID = set(id_or_name(fname="./keyword/oldid.json", mode="read")["id"])


def check_case_type_id():
    """Fetch the case-type name/value rows from the ``judge_center`` database.

    Runs ``select name, value from ot_judge_dict`` through
    ``dbtool.exec_sql`` using the connection settings defined below.

    Returns:
        The second element of the ``exec_sql`` result when it is non-empty and
        its first item is truthy; otherwise ``None``.
    """
    query = "select name, value from ot_judge_dict"
    connection = {
        "sql_host": "192.168.1.216",
        "sql_user": "******",
        "sql_db": "judge_center",
        "sql_pass": "******",
        "sql_port": 33669,
    }

    # Only the second element of the result pair is used by this function.
    _unused, rows = dbtool.exec_sql(connection, query)
    if not rows or not rows[0]:
        return None
    return rows

# Case-type rows loaded once at import time; None when the lookup yields nothing.
TYPE_ID = check_case_type_id()