예제 #1
0
파일: spider.py 프로젝트: cnb2cd/Spider_app
class Spider(MainSpider):
    """Crawler for hearing announcements on the site named in ``site_name``.

    ``parse`` walks the paginated announcement list and stores every parsed
    page through ``self.mysql_client``; ``parse_list`` converts one list page
    into ``BulletinCourt`` objects, fetching the detail record of every
    hearing announcement it finds.

    NOTE(review): ``SpiderException`` is instantiated but never raised in this
    class -- presumably its constructor records the failure; confirm against
    its definition.
    NOTE(review): ``mysql_client`` and ``get_total_page`` are not defined
    here; presumably they are provided by ``MainSpider`` -- confirm.
    """

    def __init__(self):
        """Set the task identity, then initialise the base spider and HTTP client."""
        self.task_id = "sichuan"
        self.site_name = "四川法院司法公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Fetch every page of the announcement list and persist the results.

        The first page is requested and stored outside any ``try`` block, so
        an error while handling it propagates to the caller.  Pages
        ``2..total_page`` are each handled inside their own ``try`` so that a
        failure on one page is reported and the crawl continues.
        """
        form = {"ah": "", "page": "1", "fydm": "51", "limit": "9", "nd": ""}

        url = "http://111.230.134.78:8081/sdgl/app/sdggsd_list"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    str(form['page'])))
        self.http.set_charset("unicode")
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, str(form['page'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    # The same form dict is reused; only the page number changes.
                    form["page"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, str(form['page'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json{},第{}页异常".format(self.site_name,
                                                     str(form['page'])),
                            self.task_id, url, self.site_name)

                except Exception:
                    # Report the failing page and move on to the next one.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)

                # The test-only early exit that limited the crawl to the first
                # two pages is disabled here, so every page is fetched.
            self.mysql_client.session_close()
        else:
            SpiderException(
                "抓取json{},第{}页异常".format(self.site_name, str(form['page'])),
                self.task_id, url, self.site_name)
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """Do nothing; this spider has no additional parse step."""
        pass

    def parse_list(self, json_data, form):
        """Build ``BulletinCourt`` objects from one page of the list response.

        Only entries whose title contains "开庭公告" are kept.  For each of
        them the detail endpoint is queried, the announcement body is written
        out with ``file_out``, and the detail fields are mapped onto the item.
        Detail fields that are absent are stored as empty strings instead of
        raising.

        :param json_data: decoded list response; its ``"data"`` entry is
            iterated.
        :param form: the request form, read only for the page number that is
            logged.
        :return: list of ``BulletinCourt`` objects for this page.
        """
        log.info("开始解析{}第{}页".format(self.site_name, (form['page'])))

        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            if "开庭公告" not in html.unescape(case["ggbt"]):
                continue

            formdata = {
                "ggsdid": "{}".format(str(case['ggsdid'])),
                "ssfy": "{}".format(str(case['fydm']))
            }
            ur = "http://111.230.134.78:8081/sdgl/app/getGgsdInfo.do"
            self.http.http_session(ur,
                                   "post",
                                   data=formdata,
                                   headers=self.headers)
            # Named ``detail`` rather than ``json`` so the stdlib module name
            # is not shadowed inside this method.
            detail = self.http.parse_json()["data"]

            # ``or ""`` keeps a missing (None) field from crashing
            # html.unescape / str.strip and aborting the whole page.
            content = html.unescape(detail.get("GGNR") or "")
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(content))

            item = dict()
            item["release_date"] = html.unescape(detail.get("CLSJ") or "")
            item["taskid"] = self.task_id
            item["title"] = html.unescape(detail.get("GGBT") or "")
            item["court_y"] = get_content(detail.get("SSFYMC"))  # court
            # Text between "法院" and "庭" in the body is used as the courtroom.
            item["court_t"] = "".join(re.findall("法院.{1,10}庭",
                                                 content)).replace(
                                                     "法院", "")
            item["court_num"] = html.unescape(
                detail.get("AH") or "")  # case number
            item["trial_cause"] = html.unescape(
                (detail.get("CBRXM") or "").strip())  # trial personnel
            item['bulletin_way'] = t_way
            item["site_name"] = self.site_name
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list
예제 #2
0
파일: spider.py 프로젝트: cnb2cd/Spider_app
class Spider(MainSpider):
    """Crawler for the hearing-notice search of the site named in ``site_name``.

    ``parse`` requests the paginated notice list and stores every parsed page
    through ``self.mysql_client``; ``parse_list`` converts one response page
    into ``BulletinCourt`` objects; ``get_total_page`` derives the page count
    from the record total in the response.

    NOTE(review): ``SpiderException`` is instantiated but never raised in this
    class -- presumably its constructor records the failure; confirm against
    its definition.
    NOTE(review): ``mysql_client`` is not defined here; presumably it is
    provided by ``MainSpider`` -- confirm.
    """

    # Records requested per page; sent as "pagesize" and used again to turn
    # the record total into a page count, so both always agree.
    _PAGE_SIZE = 10

    def __init__(self):
        """Set the task identity, then initialise the base spider and HTTP client."""
        self.task_id = "zhejiang"
        self.site_name = "浙江法院公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Fetch the notice list page by page and persist the results.

        The first page is requested and stored outside any ``try`` block, so
        an error while handling it propagates to the caller.  Later pages are
        each handled inside their own ``try``; the loop currently stops after
        its first iteration (see the ``break`` below).
        """
        form = {
            "pageno": "1",
            "pagesize": str(self._PAGE_SIZE),
            "cbfy": "全部",
            "dsr": "",
            "spz": "",
            "jarq1": "",
            "jarq2": ""
        }

        url = "http://www.zjsfgkw.cn/Notice/NoticeKTSearch"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    str(form['pageno'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, str(form['pageno'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    # The same form dict is reused; only the page number changes.
                    form["pageno"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, str(form['pageno'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json{},第{}页异常".format(self.site_name,
                                                     str(form['pageno'])),
                            self.task_id, url, self.site_name)

                except Exception:
                    # Report the failing page and move on.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)

                # Still in test mode: only the first two pages are crawled.
                # Remove this break before going live.
                break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "抓取json{},第{}页异常".format(self.site_name, str(form['pageno'])),
                self.task_id, url, self.site_name)
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """Do nothing; this spider has no additional parse step."""
        pass

    def parse_list(self, json_data, form):
        """Build ``BulletinCourt`` objects from one page of the list response.

        The raw response is written out once with ``file_out`` and every item
        of the page references that file through ``bulletin_way``.

        :param json_data: decoded response; its ``"list"`` entry is iterated.
        :param form: the request form, read only for the page number that is
            logged.
        :return: list of ``BulletinCourt`` objects for this page.
        """
        log.info("开始解析{}第{}页".format(self.site_name, (form['pageno'])))
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["list"]
        for case in case_list:
            item = dict()
            item["taskid"] = self.task_id
            item["court_y"] = get_content(case.get("FY"))  # court
            item["court_t"] = get_content(case.get("FT"))  # courtroom
            item["start_court_t"] = get_content(
                case.get("KTRQSTRING"))  # hearing date
            item["court_num"] = get_content(case.get("AH"))  # case number
            item["court_case"] = get_content(case.get("AY"))  # cause of action
            item["trial_cause"] = get_content(
                case.get("SPZ")).strip()  # trial personnel
            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            item["undertake_dep"] = get_content(case.get("CBBM"))
            item["plaintiff"] = get_content(case.get("YG")).replace("原告:", "")
            item["defendant"] = get_content(case.get("BG")).replace("被告:", "")
            item["schedule_time"] = get_content(case.get("PQRQ"))
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        """Return the number of result pages, or 0 when it cannot be determined.

        The count is the response's ``"total"`` divided by the page size and
        rounded UP, so a trailing partially filled page is included (25
        records at 10 per page are 3 pages, not 2).

        :param json_data: decoded list response containing ``"total"``.
        :return: page count as ``int``; 0 after a reported failure.
        """
        try:
            total_records = int(json_data["total"])
            # Ceiling division without floats.
            return (total_records + self._PAGE_SIZE - 1) // self._PAGE_SIZE
        except Exception:
            m = traceback.format_exc()
            # NOTE(review): the other SpiderException calls in this class pass
            # (message, task_id, url, site_name); here site_name and json_data
            # occupy the last two positions -- confirm this is intended.
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0
예제 #3
0
파일: spider.py 프로젝트: cnb2cd/Spider_app
class Spider(MainSpider):
    """Crawler for the hearing list of the site named in ``site_name``.

    ``parse`` requests the paginated list and stores every parsed page through
    ``self.mysql_client``; ``parse_list`` converts one response page into
    ``BulletinCourt`` objects; ``get_total_page`` reads the page count from
    the response.

    NOTE(review): ``SpiderException`` is instantiated but never raised in this
    class -- presumably its constructor records the failure; confirm against
    its definition.
    NOTE(review): ``mysql_client`` is not defined here; presumably it is
    provided by ``MainSpider`` -- confirm.
    """

    def __init__(self):
        """Set the task identity, then initialise the base spider and HTTP client."""
        self.task_id = "jiangxi"
        self.site_name = "江西庭审公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Fetch the hearing list page by page and persist the results.

        The query starts at today's date (``get_today_date()`` with
        `` 00:00:00`` appended).  The first page is requested and stored
        outside any ``try`` block, so an error while handling it propagates to
        the caller.  Later pages are each handled inside their own ``try``;
        the loop currently stops after its first iteration (see the ``break``
        below).
        """
        date = get_today_date()
        form = {
            'isGeneral': 'Y',
            'belongOrgId': '',
            'liveStatus': '001',
            'page.pageSize': '20',
            'page.pageNo': '1',
            'gopenCourtDate': date + ' 00:00:00',
            'page.orderBy': 'openCourtDate',
            'page.order': 'asc',
            'caseType': '',
            'searchWord': ''
        }

        url = "http://www.jxfy.gov.cn/api.do?method=ttrialliveliveinfo!listAjaxp.action"
        log.info("开始抓取==============江西庭审公开网")
        log.info("开始抓取==============江西庭审公开网,第{}页".format(
            str(form['page.pageNo'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储==============江西庭审公开网,第{}页".format(
                str(form['page.pageNo'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    # The same form dict is reused; only the page number changes.
                    form["page.pageNo"] = i
                    log.info("开始抓取==============江西庭审公开网,第{}页".format(i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储==============江西庭审公开网,第{}页".format(
                            str(form['page.pageNo'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json江西庭审公开网,第{}页异常".format(
                                str(form['page.pageNo'])), self.task_id, url,
                            self.site_name)

                except Exception:
                    # Report the failing page and move on.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Still in test mode: only the first two pages are crawled.
                # Remove this break before going live.
                break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "抓取json江西庭审公开网,第{}页异常".format(str(form['page.pageNo'])),
                self.task_id, url, self.site_name)
        # Closing log line, matching the other spiders of this project.
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """Do nothing; this spider has no additional parse step."""
        pass

    def parse_list(self, json_data, form):
        """Build ``BulletinCourt`` objects from one page of the list response.

        The raw response is written out once with ``file_out`` and every item
        of the page references that file through ``bulletin_way``.  The
        ``litigants`` text is split at "被告:" into plaintiff and defendant;
        when it cannot be split, the plaintiff is left empty and the whole
        value is stored as the defendant.

        :param json_data: decoded response; ``["message"]["result"]`` is
            iterated.
        :param form: the request form, read only for the page number that is
            logged.
        :return: list of ``BulletinCourt`` objects for this page.
        """
        log.info("开始解析江西庭审公开网第{}页".format(str(form['page.pageNo'])))
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["message"]["result"]
        for case in case_list:

            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = case.get(
                "lastBroadcastTimeString")  # release date
            item["title"] = get_content(case.get("caseName"))  # title
            item["court_y"] = get_content(case.get("belongOrgName"))  # court
            item["court_t"] = get_content(
                case.get("openCourtAddr"))  # courtroom
            item["start_court_t"] = get_content(
                case.get("openCourtDateString"))  # hearing date
            item["court_num"] = get_content(case.get("caseNo"))  # case number
            item["case_type"] = get_content(
                case.get("caseTypeString"))  # case type
            item["court_case"] = get_content(
                case.get("causePlacedOnFile"))  # cause of action
            item["trial_cause"] = get_content(
                case.get("underJustice")).strip()  # trial personnel

            litigants = case.get("litigants")
            try:
                dex = litigants.index("被告:")
                # The trailing [:-1] drops the last character before "被告:",
                # presumably a separator -- TODO confirm.
                item["plaintiff"] = litigants[:dex].replace(
                    "原告:", "")[:-1]  # plaintiff
                item["defendant"] = litigants[dex:].replace("被告:",
                                                            "")  # defendant
            except (KeyError, ValueError, AttributeError, TypeError):
                # No "被告:" marker, or litigants missing / not a string.
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit are no longer swallowed here.
                item["plaintiff"] = ""
                item["defendant"] = litigants

            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        """Return the page count reported by the response, or 0 on failure.

        :param json_data: decoded list response containing
            ``["message"]["totalPages"]``.
        :return: page count as ``int``; 0 after a reported failure.
        """
        try:
            total_page = json_data["message"]["totalPages"]
            return int(total_page)
        except Exception:
            m = traceback.format_exc()
            # NOTE(review): the other SpiderException calls in this class pass
            # (message, task_id, url, site_name); here site_name and json_data
            # occupy the last two positions -- confirm this is intended.
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0
예제 #4
0
class Spider(MainSpider):
    """Crawler for the hearing-notice list of the site named in ``site_name``.

    ``parse`` requests the paginated notice list and stores every parsed page
    through ``self.mysql_client``; ``parse_list`` converts one response page
    into ``BulletinCourt`` objects; ``get_total_page`` reads the page count
    from the response.

    NOTE(review): ``SpiderException`` is instantiated but never raised in this
    class -- presumably its constructor records the failure; confirm against
    its definition.
    NOTE(review): ``mysql_client`` is not defined here; presumably it is
    provided by ``MainSpider`` -- confirm.
    """

    def __init__(self):
        """Set the task identity, then initialise the base spider and HTTP client."""
        self.task_id = "hainan"
        self.site_name = "天涯法律网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Fetch the notice list page by page and persist the results.

        The query window runs from today's date to the same date with the
        year incremented by one.  The first page is requested and stored
        outside any ``try`` block, so an error while handling it propagates to
        the caller.  Later pages are each handled inside their own ``try``;
        the loop currently stops after its first iteration (see the ``break``
        below).
        """
        today_date = get_today_date()
        # NOTE(review): assumes the first four characters of today_date are
        # the year.  On Feb 29 this produces a date that does not exist --
        # confirm the format and whether the server tolerates that.
        next_year_today_date = str(int(today_date[0:4]) + 1) + today_date[4:]

        form = {
            "currentPageNo": "1",
            "pageSize": "10",
            "startDate": today_date,
            "endDate": next_year_today_date,
            "caseNo": "",
            "litigant": "",
            "judge": "",
            "caseDesc": "",
            "siteId": "f7afc746-8577-4cd4-a410-884027df5bab"
        }

        url = "http://www.hicourt.gov.cn/frontDesk/getNoticeList"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(
            self.site_name, str(form['currentPageNo'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, str(form['currentPageNo'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    # The same form dict is reused; only the page number changes.
                    form["currentPageNo"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, str(form['currentPageNo'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json{},第{}页异常".format(
                                self.site_name, str(form['currentPageNo'])),
                            self.task_id, url, self.site_name)

                except Exception:
                    # Report the failing page and move on.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Still in test mode: only the first two pages are crawled.
                # Remove this break before going live.
                break
            self.mysql_client.session_close()
        else:
            # The form's page key is "currentPageNo"; the previous lookup of
            # 'page.pageNo' raised KeyError instead of reporting the failure.
            SpiderException(
                "抓取json{},第{}页异常".format(
                    self.site_name, str(form['currentPageNo'])),
                self.task_id, url, self.site_name)
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """Do nothing; this spider has no additional parse step."""
        pass

    def parse_list(self, json_data, form):
        """Build ``BulletinCourt`` objects from one page of the list response.

        The raw response is written out once with ``file_out`` and every item
        of the page references that file through ``bulletin_way``.

        :param json_data: decoded response; its ``"data"`` entry is iterated.
        :param form: the request form, read only for the page number that is
            logged.
        :return: list of ``BulletinCourt`` objects for this page.
        """
        log.info("开始解析{}第{}页".format(self.site_name, (form['currentPageNo'])))
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = get_content(case.get("createDate"))
            item["court_y"] = get_content(case.get("belongOrgName"))  # court
            item["court_t"] = get_content(case.get("trialCourt"))  # courtroom
            item["start_court_t"] = get_content(
                case.get("courtTime"))  # hearing date
            item["court_num"] = get_content(case.get("caseNo"))  # case number
            item["court_case"] = get_content(
                case.get("caseDesc"))  # cause of action
            item["trial_cause"] = get_content(
                case.get("judge")).strip()  # trial personnel
            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        """Return the page count reported by the response, or 0 on failure.

        :param json_data: decoded list response containing ``"pages"``.
        :return: page count as ``int``; 0 after a reported failure.
        """
        try:
            total_page = json_data["pages"]
            return int(total_page)
        except Exception:
            m = traceback.format_exc()
            # NOTE(review): the other SpiderException calls in this class pass
            # (message, task_id, url, site_name); here site_name and json_data
            # occupy the last two positions -- confirm this is intended.
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0