Пример #1
0
    def on_init(self):
        """Load the lookup tables and the area recogniser.

        Populates the following attributes:

        * ``self.cause_of_action_id`` -- cause-of-action name -> id. Built
          from ``ot_judge_anyou_old`` (``new_id`` column) and then updated
          from ``ot_judge_anyou`` (``id`` column), so the newer table wins
          for names present in both.
        * ``self.cause_of_actions`` -- the distinct names of
          ``ot_judge_anyou`` followed by those of ``ot_judge_anyou_old``,
          each group ordered longest name first.
        * ``self.replace_keywords`` -- ``keyword -> keyword_replace`` mapping
          read from ``ot_judge_keyword_filter``.
        * ``self.area_parse`` -- an ``IdentArea`` instance that receives one
          ``set_area`` call per ``uc_area`` row of grade 0, 1 or 2.
        """
        self.cause_of_action_id = {}
        self.cause_of_actions = []
        engine = create_engine(DB_URL, echo=False)

        with engine.connect() as conn:
            old_anyou = conn.execute(
                "select anyou_name,new_id,max(level) from ot_judge_anyou_old group by anyou_name").fetchall()

            new_anyou = conn.execute(
                "select anyou_name,id,max(level) from ot_judge_anyou group by anyou_name").fetchall()

            self.replace_keywords = dict(conn.execute(
                "select keyword,keyword_replace from ot_judge_keyword_filter").fetchall()
            )

        # Old table first, new table second: ids from the new table override.
        for rows in (old_anyou, new_anyou):
            self.cause_of_action_id.update((row[0], row[1]) for row in rows)

        # Longest names first within each group (presumably so that a
        # first-match substring search prefers the most specific name).
        # ``key=len, reverse=True`` is the stable, Python 2/3 compatible
        # equivalent of the former ``cmp`` lambda ``len(y) - len(x)``.
        for rows in (new_anyou, old_anyou):
            self.cause_of_actions.extend(
                sorted(set(row[0] for row in rows), key=len, reverse=True))

        # Area recognition module.
        # NOTE(review): hosts, ports and credentials are hard-coded here;
        # consider moving them to configuration.
        self.area_parse = IdentArea('192.168.3.234', 7779, 'area')

        dbarg = {"name": "uc_area", "sql_host": "192.168.1.216", "sql_user": "******",
                 "sql_db": "user_cloud_db", "sql_pass": "******", "sql_port": 57789}

        sql = 'select id,  province, city,country, grade from uc_area'

        # Only the second element of the returned pair is used.
        _, desc = dbtool.exec_sql(dbarg, sql)

        # Register every area under its own name together with the name of
        # its parent level: grade 2 -> (country, city), grade 1 ->
        # (city, province), grade 0 -> (province, no parent).
        for row in desc:
            grade = row['grade']
            if grade == 2:
                self.area_parse.set_area(
                    row['country'], row['id'], 2, row['city'])
            elif grade == 1:
                self.area_parse.set_area(
                    row['city'], row['id'], 1, row['province'])
            elif grade == 0:
                self.area_parse.set_area(
                    row['province'], row['id'], 0, '')
Пример #2
0
class CourtGovCnMergeProcesserLawyer(JudgmentProcesser):

    """Extract judgments into the production table for the lawyers listed in the "good lawyer" table."""

    # Source and destination tables.
    # NOTE(review): presumably consumed by the JudgmentProcesser base class --
    # confirm there; the methods visible in this class do not read them.
    from_warehouses = (ot_rawdata_judgement_court_gov_cn_old,)
    to_warehouses = (ot_judge_base,)

    # Query for the highest parent_id already written to ot_judge_base with
    # this processor's name as come_from.
    sql_ml = 'select parent_id from ot_judge_base where come_from="CourtGovCnMergeProcesserLawyer" order by parent_id desc limit 1'

    # NOTE(review): bound to the whole module-level DATAS object (indexed as
    # DATAS[0], DATAS[1], DATAS[2] by the methods of this class); how startid
    # itself is used is not visible here.
    startid = DATAS

    # Judgment document types handled by to_ot_judge_base; records whose
    # case_type is not listed here are skipped.
    case_mode = (u'民事判决书', u'民事调解书', u'仲裁裁决书', u'仲裁调解书', u'刑事判决书', u'行政判决书',
                 u"民事裁定书", u"国家赔偿决定书",  u"刑事裁定书", u"刑事附带民事判决书", u"刑事附带民事裁定书",
                 u"刑事附带民事调解书", u"刑事再审判决书", u"强制医疗决定书", u"行政裁定书", u"行政附带民事判决书",
                 u"行政附带民事调解书", u"行政赔偿调解书")

    def on_init(self):
        """Load the lookup tables and the area recogniser.

        Populates the following attributes:

        * ``self.cause_of_action_id`` -- cause-of-action name -> id. Built
          from ``ot_judge_anyou_old`` (``new_id`` column) and then updated
          from ``ot_judge_anyou`` (``id`` column), so the newer table wins
          for names present in both.
        * ``self.cause_of_actions`` -- the distinct names of
          ``ot_judge_anyou`` followed by those of ``ot_judge_anyou_old``,
          each group ordered longest name first.
        * ``self.replace_keywords`` -- ``keyword -> keyword_replace`` mapping
          read from ``ot_judge_keyword_filter``.
        * ``self.area_parse`` -- an ``IdentArea`` instance that receives one
          ``set_area`` call per ``uc_area`` row of grade 0, 1 or 2.
        """
        self.cause_of_action_id = {}
        self.cause_of_actions = []
        engine = create_engine(DB_URL, echo=False)

        with engine.connect() as conn:
            old_anyou = conn.execute(
                "select anyou_name,new_id,max(level) from ot_judge_anyou_old group by anyou_name").fetchall()

            new_anyou = conn.execute(
                "select anyou_name,id,max(level) from ot_judge_anyou group by anyou_name").fetchall()

            self.replace_keywords = dict(conn.execute(
                "select keyword,keyword_replace from ot_judge_keyword_filter").fetchall()
            )

        # Old table first, new table second: ids from the new table override.
        for rows in (old_anyou, new_anyou):
            self.cause_of_action_id.update((row[0], row[1]) for row in rows)

        # Longest names first within each group (presumably so that a
        # first-match substring search prefers the most specific name).
        # ``key=len, reverse=True`` is the stable, Python 2/3 compatible
        # equivalent of the former ``cmp`` lambda ``len(y) - len(x)``.
        for rows in (new_anyou, old_anyou):
            self.cause_of_actions.extend(
                sorted(set(row[0] for row in rows), key=len, reverse=True))

        # Area recognition module.
        # NOTE(review): hosts, ports and credentials are hard-coded here;
        # consider moving them to configuration.
        self.area_parse = IdentArea('192.168.3.234', 7779, 'area')

        dbarg = {"name": "uc_area", "sql_host": "192.168.1.216", "sql_user": "******",
                 "sql_db": "user_cloud_db", "sql_pass": "******", "sql_port": 57789}

        sql = 'select id,  province, city,country, grade from uc_area'

        # Only the second element of the returned pair is used.
        _, desc = dbtool.exec_sql(dbarg, sql)

        # Register every area under its own name together with the name of
        # its parent level: grade 2 -> (country, city), grade 1 ->
        # (city, province), grade 0 -> (province, no parent).
        for row in desc:
            grade = row['grade']
            if grade == 2:
                self.area_parse.set_area(
                    row['country'], row['id'], 2, row['city'])
            elif grade == 1:
                self.area_parse.set_area(
                    row['city'], row['id'], 1, row['province'])
            elif grade == 0:
                self.area_parse.set_area(
                    row['province'], row['id'], 0, '')

    def check_lawyer(self, old):
        """Check whether the judgment involves one of the lawyers in ``DATAS[0]``.

        Two strategies are tried in order; the first hit wins:

        1. A lawyer already parsed into ``old.lawyers_attr`` -- a mapping of
           party role to a list of tuples whose first item is the lawyer's
           name (``(name, firm)`` according to the original comment) --
           whose name is in ``DATAS[0]``.
        2. Otherwise, for each lawyer of ``DATAS[0]`` that is among the person
           names extracted from ``old.content``, every line mentioning the
           name is inspected. The name is accepted when its first occurrence
           in the line is immediately preceded or followed by ``律师``, or
           when it immediately follows the first occurrence of one of the
           ``LAWYER_TYPE`` markers in that line.

        The matched name is printed (UTF-8 encoded) before returning.

        :param old: raw judgment record; ``content`` and ``lawyers_attr``
            are read.
        :return: ``(True, name)`` for the first match, ``(False, "")`` when
            no lawyer matches.
        """
        # Names of all people mentioned in the judgment text.
        peoples = bamboo.people_name(old.content)

        # Strategy 1: lawyers already extracted per party role.
        for entries in old.lawyers_attr.values():
            # Roles without lawyers (None / empty list) contribute nothing.
            for entry in entries or ():
                candidate = entry[0]
                if candidate in DATAS[0]:
                    print(candidate.encode("utf8"))
                    return (True, candidate)

        # Strategy 2: look the wanted lawyers up in the raw text.
        marker = u"律师"
        lines = old.content.split('\n')
        for lawyer in DATAS[0]:
            # Cheap pre-check against the extracted names before scanning
            # the text line by line.
            if lawyer not in peoples:
                continue
            for line in lines:
                if lawyer not in line:
                    continue
                begin = line.index(lawyer)
                end = begin + len(lawyer)
                # "律师XX" or "XX律师"; the start is clamped so that a name
                # near the beginning of the line cannot wrap around.
                if marker in line[max(begin - 2, 0):begin] or marker in line[end:end + 2]:
                    print(lawyer.encode("utf8"))
                    return (True, lawyer)
                for item in LAWYER_TYPE:
                    if item not in line:
                        continue
                    # The name must start right after the role marker.
                    after_item = line.index(item) + len(item)
                    if lawyer in line[after_item:after_item + len(lawyer)]:
                        print(lawyer.encode("utf8"))
                        return (True, lawyer)
        return (False, "")

    def to_ot_judge_base(self, old):
        """Build an ``ot_judge_base`` row from a raw judgment, or skip it.

        Processing order:

        1. Print ``old.url`` and, depending on ``MODLE``, persist the current
           id to ``./keyword/id.json``. In update mode the process exits via
           ``os._exit(0)`` once ``old.id`` exceeds ``DATAS[1]``.
        2. Skip the record (return ``None``) when its id is already in
           ``OLD_ID``, when a required field is ``None``, when its
           ``case_type`` is not in ``self.case_mode``, or when
           ``check_lawyer`` finds no wanted lawyer.
        3. Add the id to ``OLD_ID`` and persist the set to
           ``./keyword/oldid.json``.
        4. Fill a new ``ot_judge_base`` instance. The cause of action is
           searched in the title first and then in ``old.content_all``; the
           record is skipped when none of ``self.cause_of_actions`` occurs
           in either.
        5. Unless ``MODLE`` is ``"CourtGovCnMake"``, increment the matched
           lawyer's counter through ``local_lawyers``.

        :param old: raw judgment record.
        :return: the populated ``ot_judge_base`` instance, or ``None`` when
            the record is skipped.
        """
        print(old.url)

        # Persist the extraction progress for the active mode.
        if MODLE == "CourtGovCnMergeProcesserLawyer":
            data = {"max": old.id, "up": DATAS[2]}
            id_or_name(fname="./keyword/id.json", mode="write", indata=data)
        elif MODLE == "CourtGovCnMergeProcesserLawyerUp":
            if DATAS[1] < old.id:
                # Message: the update-mode parent id has passed the
                # normal-mode parent id; use the non-update mode instead.
                print("更新模式父id已大于普通模式下的父id, 请使用非更新模式进行抽取!")
                os._exit(0)
            else:
                data = {"max": DATAS[1], "up": old.id}
                id_or_name(
                    fname="./keyword/id.json", mode="write", indata=data)

        # Already extracted earlier.
        if old.id in OLD_ID:
            return None

        required = ('content', 'case_sign', 'case_type', 'department', 'end_date')
        if any(getattr(old, attr) is None for attr in required):
            return None

        # Only the document types listed in case_mode are analysed.
        if old.case_type not in self.case_mode:
            return None

        # Skip judgments that involve none of the wanted lawyers.
        matched, lawyer_name = self.check_lawyer(old)
        if not matched:
            return None
        # NOTE(review): the id is recorded as processed before the
        # cause-of-action check below, so a record skipped there is not
        # retried on a later run -- confirm this is intended.
        OLD_ID.add(old.id)
        id_or_name(fname="./keyword/oldid.json", mode="write",
                   indata={"id": list(OLD_ID)})

        new = ot_judge_base()

        # Every source line becomes one <p> paragraph.
        new.content = '<p>' + '</p><p>'.join(old.content.split('\n')) + '</p>'
        new.content_md5 = md5(new.content.encode('utf8')).hexdigest()
        new.case_sign = '<p>' + \
            '</p><p>'.join(old.case_sign.split('\n')) + '</p>'

        new.case_type = old.case_type

        # TYPE_ID names are compared against the GBK-encoded type name.
        gbk_case_type = new.case_type.encode("gbk")
        for item in TYPE_ID:
            if item["name"] == gbk_case_type:
                new.case_type_id = item["value"]
                break

        # Drop the three-character document suffix (e.g. "判决书").
        new.type = new.case_type[:-3]

        # Arbitration is filed under civil cases.
        if new.type == u'仲裁':
            new.type = u'民事'

        new.case_number = old.case_number

        # The title is the first line of the full document.
        new.title = old.content_all.split("\n", 1)[0].strip()

        cause = self._match_cause_of_action(new.title, old.content_all)
        if cause is None:
            return None
        new.anyou = cause
        new.anyou_id = self.cause_of_action_id[cause]

        new.department = old.department
        new.chief_judge = old.chief_judge
        new.judge = old.judge
        new.acting_judges = old.acting_judges
        new.clerk = old.clerk

        # Parties and their lawyers are flattened to ';'-separated,
        # ':'-delimited strings.
        new.plaintiff = ';'.join(
            u"%s:%s:%s" % client for client in old.clients_attr[u'原告'])
        new.plaintiff_lawyers = ';'.join(
            u"%s:%s" % lawyer for lawyer in old.lawyers_attr[u'原告'])

        new.defendant = ';'.join(
            u"%s:%s:%s" % client for client in old.clients_attr[u'被告'])
        new.defendant_lawyers = ';'.join(
            u"%s:%s" % lawyer for lawyer in old.lawyers_attr[u'被告'])

        new.procedure = old.procedure

        new.end_date = arrow.get(old.end_date, 'Asia/Shanghai').timestamp

        # Resolve the region from the court name.
        area = self.area_parse.ident(new.department.encode('gbk'))
        if area:
            new.areacode = area['areano']

        new.url = old.referer

        # Drop replacement entries whose key is anonymised (contains 某, X,
        # x or *). Same pattern as before, spelled without the Python-2-only
        # ``ur`` prefix and compiled once instead of per entry.
        anonymised = re.compile(u".*(某|X|x|\\*).*")
        new.replace_data = json.dumps(
            {k: v for k, v in old.replace_data.items()
             if not anonymised.match(k)})

        new.input_time = arrow.now().timestamp
        new._MASK_SYNC_V2 = datetime.now()
        new.from_host = urlparse(old.referer).hostname

        print("OK")
        # Update the matched lawyer's judgment counter.
        if MODLE != "CourtGovCnMake":
            # NOTE(review): the statement is built by string formatting;
            # switch to bound parameters if local_lawyers supports them.
            sql_up = "update user_lawyer_main set count=count+1, update_datetime=%d where name='%s'" \
                % (arrow.now().timestamp, lawyer_name.encode("utf8"))
            local_lawyers("insert", sql_up)

        return new

    def _match_cause_of_action(self, title, content_all):
        """Return the first known cause of action in the title or the content.

        The document-type words of ``self.case_mode`` are stripped from
        *title* first. ``self.cause_of_actions`` is then scanned in order
        against the stripped title and, if nothing matches there, against
        *content_all*.

        :return: the matching cause-of-action name, or ``None`` when no
            entry of ``self.cause_of_actions`` occurs in either text.
        """
        for mode in self.case_mode:
            title = title.replace(mode, "")
        for text in (title, content_all):
            for cause in self.cause_of_actions:
                if cause in text:
                    return cause
        # Give up only after every cause has been tried against both texts.
        return None