Example #1
import re
import time
import traceback

from pyquery import PyQuery as pq

# Project-internal imports (MainSpider, HttpRequest, headers, log,
# SpiderException, file_out, BulletinCourt) are assumed to come from the
# surrounding crawler framework; their module paths are not shown here.

class Spider(MainSpider):

    def __init__(self):
        self.task_id = "qinghai"
        self.site_name = "青海法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers
        self.http.set_charset("gbk")

    def parse(self):

        form = {
            "p": "1",
            "LocationID": "0700000000",
            "sub": ""
        }

        url = "http://qhfy.chinacourt.org/fygg/index.php"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, (form['p'])))
            # Insert the list of objects into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit the transaction
            self.mysql_client.session_commit()
            for i in range(2, int(total_page)+1):
                try:
                    form["p"] = i
                    log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p'])))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(self.site_name, (form['p'])))
                        # Insert the list of objects into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit the transaction
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取{},第{}页异常".format(self.site_name, (
                            form['p'])), self.task_id, url, self.site_name)
                except Exception:
                    # Log the exception with its full traceback
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are fetched; remove this
                # break before going live.
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, (form['p'])), self.task_id, url, self.site_name)
        # Close the database connection
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))


    def added_parse(self):
        pass

    def parse_html(self, html):
        # Parse the HTML page

        doc = pq(html)
        # print(doc("td.td_pagebar").text())
        total_page = "".join(re.findall(r"共\s.*\s页", doc("td.td_pagebar").text())).replace(
            "共", "").replace("页", "").strip()
        lis = doc('td.td_line').items()
        object_list = list()
        for x in lis:
            if "开庭" in x.text():
                self.http.http_session("http://qhfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers)
                htm = self.http.parse_html()
                doc = pq(htm)
                content = doc
                item = dict()
                item["taskid"] = self.task_id
                item["release_date"] = "".join(re.findall("\d{4}-\d{2}-\d{2}", content("p").text()))
                item["title"] = x.text()
                t_way = self.task_id + str(time.time()) + '.txt'
                item["bulletin_way"] = t_way
                item["court_y"] = "".join(re.findall(".{2,10}人民法院", content('span.detail_content').text()))
                item["court_t"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                          ).replace("在", "").replace("公开", "").replace("依法", "")
                # item["start_court_t"] = "".join(re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x('a').attr.title))
                item["court_part"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                          ).replace("在", "").replace("公开", "").replace("依法", "")
                item["site_name"] = self.site_name
                # print(item)
                # Keep only announcements released after 2018-01-01
                if int(item["release_date"].replace("-", "")) > 20180101:
                    file_out(t_way, str(htm))
                    # Map the item dict onto a model object
                    b = BulletinCourt(**item)
                    object_list.append(b)
        # Return the object list and the total page count
        return object_list, total_page
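
The total page count above is scraped out of the pager cell with a regex. A
minimal sketch of that extraction, run against a hypothetical pager string
(the real text comes from the site's td.td_pagebar cell):

import re

pager_text = "首页 上一页 下一页 尾页 共 12 页"  # hypothetical pager text
total_page = "".join(re.findall(r"共\s.*\s页", pager_text)).replace(
    "共", "").replace("页", "").strip()
print(total_page)  # -> "12"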
Example #2
import datetime
import re
import time
import traceback

from pyquery import PyQuery as pq

# Project-internal imports (MainSpider, HttpRequest, log, SpiderException,
# file_out, BulletinCourt) are assumed to come from the surrounding crawler
# framework; their module paths are not shown in the original.

class Spider(MainSpider):

    site_name = '黑龙江法院网'

    def __init__(self, taskid):
        MainSpider.__init__(self, task_id=taskid)
        self.http = HttpRequest(taskid, self.site_name)
        self.url = 'http://www.hljcourt.gov.cn/ktgg/index.php?p={page}&st={start}&et={end}'
        self.taskid = taskid

    def parse(self):
        log.info('开始抓取黑龙江法院网第{page}页信息'.format(page='1'))
        ts = datetime.date.today()
        tm = datetime.date.today() + datetime.timedelta(days=365)
        self.http.http_session(self.url.format(page='1',
                                               start=str(ts),
                                               end=str(tm)),
                               'get',
                               headers=self.http.headers)
        self.http.set_charset('gb2312')
        r = self.http.parse_html()
        # print(r)
        log.info('解析抓取黑龙江法院网第{page}页信息'.format(page='1'))
        p_list = self.parse_list(r)
        b_list = list()
        for p in p_list:
            d_url = ''  # defined before the try so the except handler can log it
            try:
                d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p['det_url']
                log.info('开始抓取黑龙江法院网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析黑龙江法院网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.parse_info(det_mess, p)
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b = BulletinCourt(**p)
                b_list.append(b)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
            # Test-mode break, as in the other spiders: only the first
            # announcement is processed; remove before going live.
            break
        log.info('存储黑龙江法院网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total + 1):  # pages 2 through p_total, inclusive
            try:
                log.info('开始抓取黑龙江法院网第{page}页信息'.format(page=str(total)))
                self.http.http_session(self.url.format(page=str(total),
                                                       start=str(ts),
                                                       end=str(tm)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html()
                log.info('解析黑龙江法院网第{page}页信息'.format(page=str(total)))
                p_list = self.parse_list(r)
                b_list = list()
                for p in p_list:
                    d_url = ''  # defined before the try for the except handler
                    try:
                        d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p[
                            'det_url']
                        log.info('开始抓取黑龙江法院网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.http.http_session(d_url,
                                               'get',
                                               headers=self.http.headers)
                        det_mess = self.http.parse_html()
                        log.info('解析黑龙江法院网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.parse_info(det_mess, p)
                        t_way = self.taskid + str(time.time()) + '.txt'
                        file_out(t_way, p['html'])
                        p['bulletin_way'] = t_way
                        p.pop('det_url')
                        p.pop('html')
                        p['taskid'] = self.taskid
                        b = BulletinCourt(**p)
                        b_list.append(b)
                    except Exception:
                        m = traceback.format_exc()
                        SpiderException(m, self.taskid, self.site_name, d_url)

                log.info('存储黑龙江法院网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()

            except Exception:
                m = traceback.format_exc()
                SpiderException(
                    m, self.taskid, self.site_name,
                    self.url.format(end=str(tm),
                                    start=str(ts),
                                    page=str(total)))
        self.mysql_client.session_close()
        log.info('抓取黑龙江法院网结束')

    def added_parse(self):
        pass

    def parse_list(self, r):
        info_list = list()
        doc = pq(r)
        tb = doc('table tbody').children('tr').children('td')
        k = len(tb) // 5  # five <td> cells per announcement row
        for i in range(0, k):
            item = dict()
            title = tb.eq(i * 5 + 1).text()
            court_num = tb.eq(i * 5 + 2).text()
            court_part = tb.eq(i * 5 + 3).text()
            start_court_t = tb.eq(i * 5 + 4).text()
            det_url = tb.eq(i * 5 +
                            1).children('div').children('a').attr('href')
            item['title'] = title
            item['court_num'] = court_num
            item['court_part'] = court_part
            item['start_court_t'] = start_court_t
            item['det_url'] = det_url
            info_list.append(item)
        return info_list

    def parse_info(self, rs, item):
        doc = pq(rs)
        ct = doc('div.ggnr')
        h2 = ct('h2').text()
        h3 = ct('h3').text()
        p = ct('p').text()
        t1 = ct('div.text-01').text()
        t2 = ct('div.text-02').text()
        html = h2 + '\r\n' + h3 + '\r\n' + p + '\r\n' + t1 + '\r\n' + t2
        item['html'] = html
        item['court_y'] = h2

    def page_total(self, res):
        try:
            match = re.search(r'共\d*页', res)
            k = int(match.group().replace('共', '').replace('页', ''))
            return k
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
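
parse_list above walks the announcement table in strides of five <td> cells
per row. A minimal sketch of that indexing, run against a hypothetical
two-row table with the same five-column layout:

from pyquery import PyQuery as pq

sample = """
<table><tbody>
<tr><td>1</td><td><div><a href="d1.php">案件甲</a></div></td>
<td>(2018)黑01民初1号</td><td>第一法庭</td><td>2018-06-01 09:00</td></tr>
<tr><td>2</td><td><div><a href="d2.php">案件乙</a></div></td>
<td>(2018)黑01民初2号</td><td>第二法庭</td><td>2018-06-02 09:00</td></tr>
</tbody></table>
"""  # hypothetical markup mirroring the real page's column order
tb = pq(sample)('table tbody').children('tr').children('td')
for i in range(len(tb) // 5):
    title = tb.eq(i * 5 + 1).text()
    det_url = tb.eq(i * 5 + 1).children('div').children('a').attr('href')
    print(title, det_url)  # -> 案件甲 d1.php, then 案件乙 d2.php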
Example #3
import html
import re
import time
import traceback

# Project-internal imports (MainSpider, HttpRequest, headers, log,
# SpiderException, file_out, get_content, BulletinCourt) are assumed to come
# from the surrounding crawler framework.

class Spider(MainSpider):
    def __init__(self):
        self.task_id = "sichuan"
        self.site_name = "四川法院司法公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):

        form = {"ah": "", "page": "1", "fydm": "51", "limit": "9", "nd": ""}

        url = "http://111.230.134.78:8081/sdgl/app/sdggsd_list"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    str(form['page'])))
        self.http.set_charset("unicode")
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, str(form['page'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["page"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, str(form['page'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json{},第{}页异常".format(self.site_name,
                                                     str(form['page'])),
                            self.task_id, url, self.site_name)

                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)

                # Currently in test mode: only the first two pages are fetched;
                # remove the break below before going live.

                # break
        else:
            SpiderException(
                "抓取json{},第{}页异常".format(self.site_name, str(form['page'])),
                self.task_id, url, self.site_name)
        # Close the database session regardless of whether the first request succeeded
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        # Parse the fetched JSON
        log.info("开始解析{}第{}页".format(self.site_name, (form['page'])))

        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            if "开庭公告" in html.unescape(case["ggbt"]):
                item = dict()
                item["release_date"] = case["clsj"]
                formdata = {
                    "ggsdid": "{}".format(str(case['ggsdid'])),
                    "ssfy": "{}".format(str(case['fydm']))
                }
                ur = "http://111.230.134.78:8081/sdgl/app/getGgsdInfo.do"
                self.http.http_session(ur,
                                       "post",
                                       data=formdata,
                                       headers=self.headers)
                # "detail" instead of "json" so the stdlib module isn't shadowed
                detail = self.http.parse_json()["data"]
                item["taskid"] = self.task_id
                item["release_date"] = html.unescape(detail.get("CLSJ"))
                item["title"] = html.unescape(detail.get("GGBT"))
                item["court_y"] = get_content(detail.get("SSFYMC"))  # court
                content = html.unescape(detail.get("GGNR"))
                t_way = self.task_id + str(time.time()) + '.txt'
                file_out(t_way, str(content))
                item["court_t"] = "".join(re.findall("法院.{1,10}庭",
                                                     content)).replace(
                                                         "法院", "")
                item["court_num"] = html.unescape(detail.get("AH"))  # case number
                item["trial_cause"] = html.unescape(
                    detail.get("CBRXM").strip())  # presiding judge
                item['bulletin_way'] = t_way
                item["site_name"] = self.site_name
                b = BulletinCourt(**item)
                object_list.append(b)
        return object_list
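
The sdgl endpoints are assumed to return HTML-entity-escaped Chinese text,
which is why every field above passes through html.unescape. A minimal
sketch with a hypothetical escaped field value:

import html

escaped = "&#24320;&#24237;&#20844;&#21578;"  # hypothetical escaped GGBT value
print(html.unescape(escaped))  # -> 开庭公告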