示例#1
0
    def parse_html(self, html):
        # Parse the listing-table HTML into BulletinCourt objects.

        # Build a task-scoped output path and dump the raw HTML for auditing.
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(html.encode("utf8")))

        doc = pq(html)
        rows = [row for row in doc('div.doclist tr').items()]
        object_list = list()
        # Skip the header row; each remaining <tr> holds one bulletin.
        for row in rows[1:]:
            cells = [cell.text() for cell in row('td').items()]
            item = {
                "taskid": self.task_id,
                "bulletin_way": t_way,
                "court_num": cells[0],
                "court_pur": cells[1],
                "court_part": cells[2],
                "start_court_t": cells[3],
                "court_end_t": cells[4],
                "court_status": cells[5],
                "site_name": self.site_name,
            }
            # Map the item dict onto a model object.
            object_list.append(BulletinCourt(**item))
        # Return the list of mapped objects.
        return object_list
示例#2
0
 def parse_list(self, json_data, form):
     # Parse one fetched JSON page into BulletinCourt objects.
     log.info("开始解析{}第{}页".format(self.site_name, (form['pageno'])))
     # Persist the raw JSON payload for auditing.
     t_way = self.task_id + str(time.time()) + '.txt'
     file_out(t_way, str(json_data))
     object_list = list()
     for case in json_data["list"]:
         item = {
             "taskid": self.task_id,
             "court_y": get_content(case.get("FY")),  # court
             "court_t": get_content(case.get("FT")),  # courtroom
             "start_court_t": get_content(case.get("KTRQSTRING")),  # hearing date
             "court_num": get_content(case.get("AH")),  # case number
             "court_case": get_content(case.get("AY")),  # cause of action
             "trial_cause": get_content(case.get("SPZ")).strip(),  # judges
             "site_name": self.site_name,  # site name
             "bulletin_way": t_way,
             "undertake_dep": get_content(case.get("CBBM")),
             "plaintiff": get_content(case.get("YG")).replace("原告:", ""),
             "defendant": get_content(case.get("BG")).replace("被告:", ""),
             "schedule_time": get_content(case.get("PQRQ")),
         }
         object_list.append(BulletinCourt(**item))
     return object_list
示例#3
0
    def parse_html(self, html):
        # Parse the listing HTML; fetch and parse each linked detail page.
        # Returns (object_list, total_page).

        doc = pq(html)
        # Fixed: default so a listing without a "尾页" (last page) link no
        # longer raises NameError at the return statement below.
        total_page = "1"
        page_lis = doc('a').items()
        for pag in page_lis:
            if pag.text() == "尾页":
                total_page = "".join(re.findall(r"(\d*.shtml)",
                                                pag.attr.href)).replace(
                                                    ".shtml", "")
        lis = doc('div.font14 li').items()
        # Build the list of mapped objects.
        object_list = list()
        for x in lis:
            # Build the item dict for this bulletin.
            item = dict()
            item["release_date"] = x('span.right').text()
            self.http.http_session("http://hunanfy.chinacourt.org" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()
            # Build a task-scoped output path and persist the detail HTML.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.detail')
            # Hoisted: the original re-evaluated this selector per field.
            detail_text = content('div.detail_txt').text()
            item["taskid"] = self.task_id
            item["title"] = content('div.detail_bigtitle').text()
            item["court_y"] = "".join(
                re.findall(r"在.*法院", detail_text)).replace("在", "")
            item["court_t"] = "".join(re.findall(r"刑.*庭", detail_text))
            item["start_court_t"] = "".join(
                re.findall(r"本院定于\d{4}年.{1,5}日",
                           detail_text)).replace("年", "-").replace(
                               "月", "-").replace("日", "").replace("本院定于", "")
            item["court_num"] = "".join(
                re.findall(r"审理.*号", detail_text)).replace("审理", "")
            item["trial_cause"] = "".join(
                re.findall(r"合议庭成员.*\s", detail_text)).replace(
                    "合议庭成员:", "").replace("\n", "")
            item["court_part"] = "".join(
                re.findall(r"在.*法院", detail_text)).replace("在", "")
            item['site_name'] = self.site_name

            # Map the item dict onto a model object.
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
示例#4
0
 def parse(self):
     # Crawl page 1 of the Xinjiang court litigation-service site, persist
     # its records, then crawl pages 2..p_total-1 with the same steps.
     log.info('开始抓取新疆法院诉讼服务网第{page}页信息'.format(page='1'))
     self.http.http_session(self.url.format(page='1'), 'get', headers=self.http.headers)
     r = self.http.parse_html()
     log.info('解析新疆法院诉讼服务网第{page}页信息'.format(page='1'))
     p_list = self.parse_list(r)
     b_list = list()
     for p in p_list:
         d_url = p['det_url']
         log.info('开始抓取新疆法院诉讼服务网第{page},第{strip}条信息'.format(page='1', strip=str(p_list.index(p)+1)))
         self.http.http_session(d_url, 'get', headers=self.http.headers)
         det_mess = self.http.parse_html()
         log.info('解析新疆法院诉讼服务网第{page},第{strip}条信息'.format(page='1', strip=str(p_list.index(p)+1)))
         # Enrich p in place with fields parsed from the detail page.
         self.parse_info(det_mess, p)
         # Write the raw detail HTML to a task-scoped .txt file.
         t_way = self.taskid + str(time.time()) + '.txt'
         file_out(t_way, p['html'])
         p['bulletin_way'] = t_way
         # Drop keys that are not BulletinCourt constructor fields.
         p.pop('det_url')
         p.pop('html')
         p['taskid'] = self.taskid
         b = BulletinCourt(**p)
         b_list.append(b)
     log.info('存储新疆法院诉讼服务网第{page}页数据'.format(page='1'))
     self.mysql_client.session_insert_list(b_list)
     self.mysql_client.session_commit()
     p_total = self.page_total(r)
     # NOTE(review): range(2, p_total) stops at p_total-1 — confirm whether
     # the last page is intentionally skipped.
     for total in range(2, p_total):
         try:
             log.info('开始抓取新疆法院诉讼服务网第{page}页信息'.format(page=str(total)))
             self.http.http_session(self.url.format(page=str(total)), 'get', headers=self.http.headers)
             r = self.http.parse_html()
             log.info('解析重新疆法院诉讼服务网第{page}页信息'.format(page=str(total)))
             p_list = self.parse_list(r)
             b_list = list()
             for p in p_list:
                 d_url = p['det_url']
                 log.info('开始重新疆法院诉讼服务网第{page},第{strip}条信息'.format(page=str(total),
                                                                   strip=str(p_list.index(p) + 1)))
                 self.http.http_session(d_url, 'get', headers=self.http.headers)
                 det_mess = self.http.parse_html()
                 log.info('解析新疆法院诉讼服务网第{page},第{strip}条信息'.format(page=str(total),
                                                                  strip=str(p_list.index(p) + 1)))
                 self.parse_info(det_mess, p)
                 t_way = self.taskid + str(time.time()) + '.txt'
                 file_out(t_way, p['html'])
                 p['bulletin_way'] = t_way
                 p.pop('det_url')
                 p.pop('html')
                 p['taskid'] = self.taskid
                 b = BulletinCourt(**p)
                 b_list.append(b)
             log.info('存储新疆法院诉讼服务网第{page}页数据'.format(page=str(total)))
             self.mysql_client.session_insert_list(b_list)
             self.mysql_client.session_commit()
         except Exception:
             # Best-effort: record the failure and continue with next page.
             m = traceback.format_exc()
             SpiderException(m, self.taskid, self.site_name, self.url)
     self.mysql_client.session_close()
     log.info('抓取新疆法院诉讼服务网结束')
示例#5
0
    def parse_html(self, html):
        # Parse the listing HTML: derive the total page count, then fetch
        # and parse each linked detail page into a BulletinCourt object.

        doc = pq(html)
        # The last 3 characters of the pager text hold the page count.
        total_page = int(doc('a.zt_02').text()[-3:])
        lis = doc('div.text ul li a').items()
        # Build the list of mapped objects.
        object_list = list()
        for x in lis:
            # Build the item dict for this bulletin.
            item = dict()
            self.http.http_session(x.attr.href, "post", headers=self.headers)
            htm = self.http.parse_html()
            doc = pq(htm)
            # Build a task-scoped output path.
            t_way = self.task_id + str(time.time()) + '.txt'
            # Write the fetched detail HTML to the file.
            file_out(t_way, str(htm))
            content = doc('div.text')
            item["taskid"] = self.task_id
            item["release_date"] = content('h2').text()[3:13]
            item["title"] = content('h1').text()
            item["bulletin_way"] = t_way
            item["court_y"] = "".join(
                re.findall("(在.*院)",
                           content('h1').text())).replace("在", "")
            item["court_t"] = "".join(
                re.findall("(院.*庭)",
                           content('h1').text())).replace("院", "").replace(
                               "开庭", "")
            item["start_court_t"] = x.text()[:16]
            # Criminal ("刑事") cases carry no plaintiff; extract only the
            # defendant. Otherwise split plaintiff/defendant around "诉".
            if u"刑事" in item["title"]:
                item["defendant"] = "".join(
                    re.findall("(审理.*)",
                               content('p').text().replace("\xa0\xa0",
                                                           ""))).replace(
                                                               "审理", "")
            else:
                item["plaintiff"] = "".join(
                    re.findall("(审理.*诉)",
                               content('p').text().replace(
                                   "\xa0\xa0",
                                   ""))).replace("审理", "").replace("诉", "")
                item["defendant"] = "".join(
                    re.findall("(诉.*等)",
                               content('p').text().replace(
                                   "\xa0\xa0",
                                   ""))).replace("诉", "").replace("等", "")
            item['site_name'] = self.site_name
            # Map the item dict onto a model object.
            b = BulletinCourt(**item)
            object_list.append(b)
            # Return the object list and the total page count.
        return object_list, total_page
示例#6
0
    def parse_html(self, html):
        # Parse the listing HTML; fetch and parse each linked detail page.
        # Returns (object_list, total_page).

        doc = pq(html)
        # Fixed: default so a page without a "末页" (last page) link cannot
        # raise NameError at the return statement below.
        total_page = "1"
        for page in doc('a').items():
            if page.text() == "末页":
                total_page = "".join(re.findall(r"\d{1,3}", page.attr.onclick))

        lis = doc('table.tabData a').items()
        object_list = list()
        for x in lis:
            self.http.http_session(x.attr.href, "get", headers=self.headers)
            htm = self.http.parse_html()
            # Build a task-scoped output path and persist the detail HTML.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.print-box')
            # Hoisted: the original re-evaluated this selector per field.
            body_text = content('p').text()
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall(r"发表日期:20\d{2}-\d{1,2}-\d{1,2}",
                           content.text())).replace("发表日期:", "")
            item["title"] = x.attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3').text()
            item["court_t"] = "".join(
                re.findall(r"(在.*依法)", body_text)).replace(
                    "在", "").replace("依法", "")
            item["start_court_t"] = "".join(
                re.findall(r"(\d{4}年\d{2}月\d{2}日\s\d{2}时\d{2})",
                           body_text)).replace("年", "-").replace(
                               "月", "-").replace("日", "").replace("时", ":")
            item["court_num"] = "".join(
                re.findall(r"(审理.*案件)", body_text)).replace(
                    "审理", "").replace("案件", "")
            item["court_part"] = "".join(
                re.findall(r"(在.*依法)", body_text)).replace(
                    "在", "").replace("依法", "")
            item["site_name"] = self.site_name
            # Map the item dict onto a model object.
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
示例#7
0
    def parse_html(self, html):
        # Parse the listing HTML; keep only bulletins whose date is after
        # today. Returns (object_list, total_page).

        doc = pq(html)
        page_list = doc('a.zt_02').items()
        # total_page is the largest page number among pagination links,
        # defaulting to 10.
        total_page = 10
        for page in page_list:
            if int(page.text()) > total_page:
                total_page = int(page.text())
        lis = doc('div.text ul li a').items()
        object_list = list()
        for x in lis:
            item = dict()
            self.http.http_session(x.attr.href, "get", headers=self.headers)
            htm = self.http.parse_html()
            # Build a task-scoped output path and persist the detail HTML.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.text')
            item["taskid"] = self.task_id
            item["release_date"] = content('h2').text()[3:13]
            item["title"] = content('h1').text()
            item["bulletin_way"] = t_way
            item["court_y"] = "".join(
                re.findall(r"(在.*法院)",
                           content('h1').text())).replace("在", "")
            item["court_t"] = "".join(
                re.findall(r"(院.*庭)",
                           content('h1').text())).replace("院", "").replace(
                               "开庭", "")
            item["start_court_t"] = "".join(
                re.findall(r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x.text()))
            item["plaintiff"] = "".join(
                re.findall(r"(审理.*诉)",
                           content("p").text())).replace("审理",
                                                         "").replace("诉", "")
            item["site_name"] = self.site_name
            date = get_today_date()
            # Fixed: int() replaces eval() — both operands are digit strings
            # and eval on scraped (untrusted) input is unsafe.
            if int("".join(re.findall(r"\d{4}-\d{2}-\d{2}", x.text())).replace(
                    "-", "")) > int(date):
                # Second write kept from the original (file already written
                # above) to preserve behavior exactly.
                file_out(t_way, str(htm))

                # Map the item dict onto a model object.
                b = BulletinCourt(**item)
                object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
示例#8
0
    def parse_html(self, html):
        # Parse the listing HTML; keep only bulletins whose start time is
        # after today. Returns (object_list, total_page, VIEWSTATE).

        doc = pq(html)
        total_page = 10
        for page in doc('a.pagination').items():
            if page.text() == ">>":
                total_page = int("".join(re.findall(r"\d{2,3}",
                                                    page.attr.href)))
        # ASP.NET __VIEWSTATE token needed for subsequent page requests.
        VIEWSTATE = doc("div.aspNetHidden input").attr.value
        lis = doc('ul.module-case-items li').items()
        object_list = list()
        for x in lis:
            self.http.http_session("https://www.fjcourt.gov.cn" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers,
                                   verify=False)
            htm = self.http.parse_html()
            doc = pq(htm)
            # Build a task-scoped output path and persist the detail HTML.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            content = doc('div.article-wrap')
            item = dict()
            item["taskid"] = self.task_id
            item["title"] = content('p.article-hd-title').text()
            item["bulletin_way"] = t_way
            item["court_y"] = content('span.article-author').text()
            item["court_t"] = "".join(
                re.findall(r"(在.*公开)",
                           content('div.article-content').text())).replace(
                               "在", "").replace("公开", "")
            item["start_court_t"] = x('span.cir-time').text().replace(
                "[", "").replace("]", "")
            item["court_part"] = "".join(
                re.findall(r"(在.*公开)",
                           content('div.article-content').text())).replace(
                               "在", "").replace("公开", "")
            item["site_name"] = self.site_name
            pub_time = (item["start_court_t"].replace("-", ""))
            date = get_today_date()
            # Fixed: int() replaces eval() — both operands are digit strings
            # and eval on scraped (untrusted) input is unsafe.
            if int(pub_time) > int(date):
                # Map the item dict onto a model object.
                b = BulletinCourt(**item)
                object_list.append(b)
        # Return the object list, page count, and viewstate token.
        return object_list, total_page, VIEWSTATE
示例#9
0
    def parse_html(self, html):
        # Parse the listing HTML: derive the total page count, then fetch
        # and parse each linked detail page into a BulletinCourt object.

        doc = pq(html)
        # Take the largest page number among pagination links (default 10).
        total_page = 10
        for page in doc('div.turn_page a.zt_02').items():
            if int(page.text()) > total_page:
                total_page = int(page.text())
        lis = doc('ul.sswy_news li').items()
        object_list = list()
        for x in lis:
            self.http.http_session(x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()
            # Build a task-scoped output path.
            t_way = self.task_id + str(time.time()) + '.txt'
            # Write the fetched detail HTML to the file.
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.ywzw_con_inner')
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}",
                           content('p.p_source ').text()))
            item["title"] = x('a').attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3.h3_title').text()
            item["court_t"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["start_court_t"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           x('a').attr.title))
            item["court_part"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["site_name"] = self.site_name

            # Map the item dict onto a model object.
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
示例#10
0
    def parse(self):
        # Crawl the Tianjin court site page by page (up to 29 pages): fetch
        # each detail page, enrich the record, persist the HTML, and
        # batch-insert BulletinCourt objects per page.
        log.info('开始抓取天津法院网')
        ct = 1
        while ct < 30:
            log.info('开始抓取天津法院网第{page}页信息'.format(page=str(ct)))
            self.http.http_session(self.url.format(page=str(ct)), 'get', headers=self.http.headers)
            try:
                r = self.http.parse_html()
                log.info('解析天津法院网第{page}页信息'.format(page=str(ct)))
                p_list = self.parse_list(r)
                ic = self.is_c(r)
                object_list = list()
                for i in p_list:
                    try:
                        log.info('开始抓取天津法院网第{page},第{strip}条信息'.format(page=str(ct),
                                                                       strip=str(p_list.index(i)+1)))
                        d_url = 'http://tjfy.chinacourt.org' + i['det_url']
                        self.http.http_session(d_url, 'get', headers=self.http.headers)
                        rl = self.http.parse_html()

                        log.info('解析天津法院网第{page},第{strip}条信息'.format(page=str(ct),
                                                                     strip=str(p_list.index(i))))
                        self.parse_info(rl, i)
                        log.info('写出天津法院网第{page},第{strip}条信息'.format(page=str(ct),
                                                                     strip=str(p_list.index(i))))
                        # Write the raw detail HTML to a task-scoped file.
                        t_way = self.taskid + str(time.time()) + '.txt'
                        file_out(t_way, i['html'])
                        i['bulletin_way'] = t_way
                        # Drop keys that are not BulletinCourt fields.
                        i.pop('det_url')
                        i.pop('html')
                        b = BulletinCourt(**i)
                        object_list.append(b)
                    except Exception:
                        # Best-effort: record the failure, continue with
                        # the next entry.
                        m = traceback.format_exc()
                        SpiderException(m, self.taskid, self.site_name, self.url)
                # Fixed: the original passed strip=str(p_list.index(i)) here,
                # referencing loop variable i after the loop — a NameError
                # whenever p_list is empty — into a template that has no
                # {strip} placeholder anyway.
                log.info('存储天津法院网第{page}页数据'.format(page=str(ct)))
                self.mysql_client.session_insert_list(object_list)
                self.mysql_client.session_commit()
                if ic == 0:
                    break
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, self.url)
            ct += 1
        self.mysql_client.session_close()
        log.info('开始抓取天津法院网结束')
示例#11
0
    def parse_html(self, html):
        # Parse the listing-table HTML into BulletinCourt objects plus the
        # total page count; each table row is one bulletin.

        # Build a task-scoped output path.
        t_way = self.task_id + str(time.time()) + '.txt'
        # Write the raw listing HTML to the file.
        file_out(t_way, str(html))
        doc = pq(html)
        # The page count sits between "共" and "页" in the pager text;
        # slice [1:3] keeps the digits.
        total_page = "".join(
            re.findall("共.*页\s上",
                       doc('span').text().replace("\n", "")))[1:3]
        lis = doc('table.newlisttable tr').items()
        object_list = list()
        for content in lis:
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("(\(.*\))",
                           content('td').text()))[1:-1]
            item["title"] = content('a').text()
            item["bulletin_way"] = t_way
            # Rows starting with "本院定于" belong to the provincial high
            # court itself; otherwise the first 4 chars name the court.
            item["court_y"] = "湖北省高级人民法院" if content(
                'p').text()[:4] == "本院定于" else content('p').text()[:4]
            item["court_t"] = "".join(
                re.findall("(在.*判庭)",
                           content('p').text())).replace("在", "")
            item["start_court_t"] = "".join(
                re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}:\d{2})",
                           content('p').text())).replace("年", "-").replace(
                               "月", "-").replace("日", "")
            item["plaintiff"] = "".join(
                re.findall("(原告:.*;)",
                           content('p').text())).replace("原告:", "")
            item["defendant"] = "".join(
                re.findall("(被告:.*的)",
                           content('p').text())).replace("被告:",
                                                         "").replace("的", "")
            item["site_name"] = self.site_name
            # Map the item dict onto a model object.
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
示例#12
0
    def parse_html(self, html):
        # Parse the listing HTML: derive the total page count from the
        # pagination control, then fetch and parse each detail page.

        doc = pq(html)
        # The 6th pagination link's href carries the last page number.
        page = doc('div.paginationControl a').eq(5).attr.href
        total_page = "".join(re.findall("\d{1,3}", page))
        lis = doc('span.left').items()
        object_list = list()
        for x in lis:
            self.http.http_session("http://sxfy.chinacourt.org" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()

            doc = pq(htm)
            content = doc('div.detail')
            # Build a task-scoped output path.
            t_way = self.task_id + str(time.time()) + '.txt'
            # Write the detail block (not the full page) to the file.
            file_out(t_way, str(content))
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           content('div.sth_a').text()))
            item["title"] = content('div.b_title').text()
            item["bulletin_way"] = t_way
            item["court_y"] = "陕西省高级人民法院"
            item["court_t"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                               "公开", "")
            item["court_part"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                               "公开", "")
            item["site_name"] = self.site_name
            # Map the item dict onto a model object.
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
示例#13
0
    def parse_list(self, json_data, form):
        # Parse one fetched JSON page from the Jiangxi trial-public site
        # into a list of BulletinCourt objects.
        log.info("开始解析江西庭审公开网第{}页".format(str(form['page.pageNo'])))
        # Persist the raw JSON payload for auditing.
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["message"]["result"]
        for case in case_list:

            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = case.get("lastBroadcastTimeString")  # release date
            item["title"] = get_content(case.get("caseName"))  # title
            item["court_y"] = get_content(case.get("belongOrgName"))  # court
            item["court_t"] = get_content(case.get("openCourtAddr"))  # courtroom
            item["start_court_t"] = get_content(
                case.get("openCourtDateString"))  # hearing date
            item["court_num"] = get_content(case.get("caseNo"))  # case number
            item["case_type"] = get_content(case.get("caseTypeString"))  # case type
            item["court_case"] = get_content(
                case.get("causePlacedOnFile"))  # cause of action
            item["trial_cause"] = get_content(
                case.get("underJustice")).strip()  # judges

            # Split litigants into plaintiff/defendant at the "被告:" marker.
            # Fixed: the bare except is narrowed — KeyError (no "litigants"
            # key), ValueError (marker absent), AttributeError/TypeError
            # (value is None or not a string).
            try:
                dex = case["litigants"].index("被告:")
                item["plaintiff"] = case["litigants"][:dex].replace(
                    "原告:", "")[:-1]  # plaintiff
                item["defendant"] = case["litigants"][dex:].replace("被告:",
                                                                    "")  # defendant
            except (KeyError, ValueError, AttributeError, TypeError):
                item["plaintiff"] = ""
                item["defendant"] = case.get("litigants")

            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list
示例#14
0
 def parse_list(self, json_data, form):
     # Parse one fetched JSON page into BulletinCourt objects.
     log.info("开始解析{}第{}页".format(self.site_name, (form['currentPageNo'])))
     # Persist the raw JSON payload for auditing.
     t_way = self.task_id + str(time.time()) + '.txt'
     file_out(t_way, str(json_data))
     object_list = list()
     for case in json_data["data"]:
         item = {
             "taskid": self.task_id,
             "release_date": get_content(case.get("createDate")),
             "court_y": get_content(case.get("belongOrgName")),  # court
             "court_t": get_content(case.get("trialCourt")),  # courtroom
             "start_court_t": get_content(case.get("courtTime")),  # hearing date
             "court_num": get_content(case.get("caseNo")),  # case number
             "court_case": get_content(case.get("caseDesc")),  # cause of action
             "trial_cause": get_content(case.get("judge")).strip(),  # judges
             "site_name": self.site_name,  # site name
             "bulletin_way": t_way,
         }
         object_list.append(BulletinCourt(**item))
     return object_list
示例#15
0
    def parse_list(self, json_data, form):
        # Parse the fetched JSON list; for each "开庭公告" (hearing notice)
        # entry, fetch its detail record and map it to a BulletinCourt.
        log.info("开始解析{}第{}页".format(self.site_name, (form['page'])))

        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            if "开庭公告" in html.unescape(case["ggbt"]):
                item = dict()
                item["release_date"] = case["clsj"]
                formdata = {
                    "ggsdid": "{}".format(str(case['ggsdid'])),
                    "ssfy": "{}".format(str(case['fydm']))
                }
                ur = "http://111.230.134.78:8081/sdgl/app/getGgsdInfo.do"
                self.http.http_session(ur,
                                       "post",
                                       data=formdata,
                                       headers=self.headers)
                # Renamed from "json": the local shadowed the json module.
                detail = self.http.parse_json()["data"]
                item["taskid"] = self.task_id
                item["release_date"] = html.unescape(detail.get("CLSJ"))
                item["title"] = html.unescape(detail.get("GGBT"))
                item["court_y"] = get_content(detail.get("SSFYMC"))  # court
                content = html.unescape(detail.get("GGNR"))
                # Persist the bulletin body for auditing.
                t_way = self.task_id + str(time.time()) + '.txt'
                file_out(t_way, str(content))
                item["court_t"] = "".join(re.findall("法院.{1,10}庭",
                                                     content)).replace(
                                                         "法院", "")
                item["court_num"] = html.unescape(detail.get("AH"))  # case number
                item["trial_cause"] = html.unescape(
                    detail.get("CBRXM").strip())  # judges
                item['bulletin_way'] = t_way
                item["site_name"] = self.site_name
                b = BulletinCourt(**item)
                object_list.append(b)
        # Fixed: the original built object_list but never returned it,
        # unlike every sibling parse_list implementation.
        return object_list
示例#16
0
    def parse_html(self, html):
        # Parse the listing HTML; keep only "开庭" (hearing) bulletins
        # released after 2018-01-01. Returns (object_list, total_page).

        doc = pq(html)
        # Page count sits between "共" and "页" in the pager text.
        total_page = "".join(re.findall(r"共\s.*\s页", doc("td.td_pagebar").text())).replace(
            "共", "").replace("页", "").strip()
        lis = doc('td.td_line').items()
        object_list = list()
        for x in lis:
            if "开庭" in x.text():
                self.http.http_session("http://qhfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers)
                htm = self.http.parse_html()
                doc = pq(htm)
                content = doc
                item = dict()
                item["taskid"] = self.task_id
                item["release_date"] = "".join(re.findall(r"\d{4}-\d{2}-\d{2}", content("p").text()))
                item["title"] = x.text()
                t_way = self.task_id + str(time.time()) + '.txt'
                item["bulletin_way"] = t_way
                item["court_y"] = "".join(re.findall(".{2,10}人民法院", content('span.detail_content').text()))
                item["court_t"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                          ).replace("在", "").replace("公开", "").replace("依法", "")
                item["court_part"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                             ).replace("在", "").replace("公开", "").replace("依法", "")
                item["site_name"] = self.site_name
                # Fixed: int() replaces eval() — the operand is a scraped
                # digit string and eval on untrusted input is unsafe.
                if int(item["release_date"].replace("-", "")) > 20180101:
                    file_out(t_way, str(htm))
                    # Map the item dict onto a model object.
                    b = BulletinCourt(**item)
                    object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
示例#17
0
    def _parse_page_items(self, r, page_label):
        """Parse one listing page of the Jilin court site into objects.

        :param r: raw HTML of a listing page.
        :param page_label: page number as a string, used only for logging.
        :return: list of BulletinCourt objects ready for insertion.
        """
        p_list = self.parse_list(r)
        b_list = list()
        for idx, p in enumerate(p_list, start=1):
            # Pre-bind so the except branch can always report a URL, even
            # when p['det_url'] itself raises (original code hit NameError).
            d_url = ''
            try:
                d_url = p['det_url']
                log.info('开始抓取吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                    page=page_label, strip=str(idx)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                    page=page_label, strip=str(idx)))
                self.parse_info(det_mess, p)
                # Persist the raw detail HTML and keep only DB columns in p.
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b_list.append(BulletinCourt(**p))
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
        return b_list

    def parse(self):
        """Crawl the Jilin High People's Court judicial-openness site.

        Fetches page 1 to discover the "next page" URL template, stores its
        bulletins, then walks the remaining pages, committing each page's
        batch to MySQL before moving on.
        """
        log.info('开始抓取吉林省高级人民法院司法公开网第{page}页信息'.format(page='1'))
        self.http.http_session(self.url.format(page='1'),
                               'get',
                               headers=self.http.headers)
        r = self.http.parse_html()
        log.info('解析吉林省高级人民法院司法公开网第{page}页信息'.format(page='1'))
        doc = pq(r)
        skip = doc('div.turn_page').children('p').children('a.sp_next')
        # Build a URL template for pages >= 2 from page 1's "next" link.
        nurl = 'http://www.jlsfy.gov.cn' + skip.attr('href').replace('&amp;', '&')\
            .replace('pagecur=1', 'pagecur={pageno}')
        b_list = self._parse_page_items(r, '1')
        log.info('存储吉林省高级人民法院司法公开网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        # NOTE(review): range(2, p_total) never requests page p_total itself —
        # confirm page_total() semantics before changing the bound.
        for total in range(2, p_total):
            try:
                log.info('开始抓取吉林省高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
                self.http.http_session(nurl.format(pageno=str(total)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html()
                log.info('解析吉林省高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
                b_list = self._parse_page_items(r, str(total))
                log.info('存储吉林省高级人民法院司法公开网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, self.url)

        self.mysql_client.session_close()
        log.info('抓取吉林省高级人民法院司法公开网结束')
示例#18
0
    def _parse_page_items(self, r, page_label):
        """Parse one listing page of 河北法院网 into BulletinCourt objects.

        :param r: raw HTML of a listing page.
        :param page_label: page number as a string, used only for logging.
        :return: list of BulletinCourt objects ready for insertion.
        """
        p_list = self.parse_list(r)
        b_list = list()
        for idx, p in enumerate(p_list, start=1):
            # Pre-bind so the except branch can always report a URL, even
            # when p['det_url'] itself raises (original code hit NameError).
            d_url = ''
            try:
                d_url = p['det_url']
                log.info('开始抓取河北法院网第{page},第{strip}条信息'.format(
                    page=page_label, strip=str(idx)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析河北法院网第{page},第{strip}条信息'.format(
                    page=page_label, strip=str(idx)))
                self.parse_info(det_mess, p)
                # Persist the raw detail HTML and keep only DB columns in p.
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b_list.append(BulletinCourt(**p))
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
        return b_list

    def parse(self):
        """Crawl 河北法院网 bulletins page by page.

        Fetches page 1 to discover the "next page" URL template, stores its
        bulletins, then walks later pages until page_total is reached or the
        newest bulletin date on a page falls behind today's date.
        """
        log.info('开始抓取河北法院网第{page}页信息'.format(page='1'))
        self.http.http_session(self.url.format(page='1'),
                               'get',
                               headers=self.http.headers)
        r = self.http.parse_html()
        log.info('解析河北法院网第{page}页信息'.format(page='1'))
        doc = pq(r)
        skip = doc('div.turn_page').children('p').children('a.sp_next')
        # Build a URL template for pages >= 2 from page 1's "next" link.
        nurl = 'http://hbgy.hbsfgk.org' + skip.attr('href').replace(
            'pagecur=1', 'pagecur={pageno}')
        b_list = self._parse_page_items(r, '1')
        log.info('存储河北法院网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total):
            try:
                log.info('开始抓取河北法院网第{page}页信息'.format(page=str(total)))
                self.http.http_session(nurl.format(pageno=str(total)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html()
                log.info('解析河北法院网第{page}页信息'.format(page=str(total)))
                b_list = self._parse_page_items(r, str(total))
                log.info('存储河北法院网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, self.url)
            # Stop early once the page's bulletin date (get_n_t) is already
            # older than today — later pages are assumed older still.
            time0 = get_today_date()
            time1 = self.get_n_t(r)
            strftime0 = datetime.datetime.strptime(time1, "%Y-%m-%d")
            strftime1 = datetime.datetime.strptime(time0, "%Y-%m-%d")
            if strftime1 > strftime0:
                break

        self.mysql_client.session_close()
        log.info('抓取河北法院网结束')
示例#19
0
    def _parse_page_items(self, r, page_label):
        """Parse one listing page of 重庆法院公共服务网 into objects.

        :param r: raw HTML of a listing page.
        :param page_label: page number as a string, used only for logging.
        :return: list of BulletinCourt objects ready for insertion.
        """
        p_list = self.parse_list(r)
        b_list = list()
        for idx, p in enumerate(p_list, start=1):
            # Pre-bind so the except branch can always report a URL, even
            # when p['det_url'] itself raises (original code hit NameError).
            d_url = ''
            try:
                # Listing rows carry only the announcement id; build the
                # full detail URL here.
                d_url = ('http://www.cqfygzfw.com/court/gg_ggxx.shtml?gg.id='
                         + p['det_url'])
                log.info('开始抓取重庆法院公共服务网第{page},第{strip}条信息'.format(
                    page=page_label, strip=str(idx)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析重庆法院公共服务网第{page},第{strip}条信息'.format(
                    page=page_label, strip=str(idx)))
                self.parse_info(det_mess, p)
                # Persist the raw detail HTML and keep only DB columns in p.
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b_list.append(BulletinCourt(**p))
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
        return b_list

    def parse(self):
        """Crawl 重庆法院公共服务网 bulletins page by page.

        Queries the site for bulletins dated from today to one year ahead,
        storing each page's batch to MySQL before fetching the next page.
        """
        log.info('开始抓取重庆法院公共服务网第{page}页信息'.format(page='1'))
        # Query window: today .. today + 365 days.
        ts = datetime.date.today()
        tm = datetime.date.today() + datetime.timedelta(days=365)
        self.http.http_session(self.url.format(end=str(tm),
                                               start=str(ts),
                                               page='1'),
                               'get',
                               headers=self.http.headers)
        # Strip the "►" HTML entity the site embeds in listings.
        r = self.http.parse_html().replace('&#9658', '')
        log.info('解析重庆法院公共服务网第{page}页信息'.format(page='1'))
        b_list = self._parse_page_items(r, '1')
        # Fixed copy-paste bug: original logged "天津法院网" here.
        log.info('存储重庆法院公共服务网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total):
            try:
                log.info('开始抓取重庆法院公共服务网第{page}页信息'.format(page=str(total)))
                self.http.http_session(self.url.format(end=str(tm),
                                                       start=str(ts),
                                                       page=str(total)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html().replace('&#9658', '')
                log.info('解析重庆法院公共服务网第{page}页信息'.format(page=str(total)))
                b_list = self._parse_page_items(r, str(total))
                log.info('存储重庆法院公共服务网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()
            except Exception:
                m = traceback.format_exc()
                SpiderException(
                    m, self.taskid, self.site_name,
                    self.url.format(end=str(tm),
                                    start=str(ts),
                                    page=str(total)))
        self.mysql_client.session_close()
        log.info('抓取重庆法院公共服务网结束')
示例#20
0
    def _parse_page_items(self, r, page_label):
        """Parse one listing page of 黑龙江法院网 into BulletinCourt objects.

        :param r: raw HTML of a listing page.
        :param page_label: page number as a string, used only for logging.
        :return: list of BulletinCourt objects ready for insertion.
        """
        p_list = self.parse_list(r)
        b_list = list()
        for idx, p in enumerate(p_list, start=1):
            # Pre-bind so the except branch can always report a URL, even
            # when p['det_url'] itself raises (original code hit NameError).
            d_url = ''
            try:
                # Listing rows hold relative detail paths; prepend the base.
                d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p['det_url']
                log.info('开始抓取黑龙江法院网第{page},第{strip}条信息'.format(
                    page=page_label, strip=str(idx)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析黑龙江法院网第{page},第{strip}条信息'.format(
                    page=page_label, strip=str(idx)))
                self.parse_info(det_mess, p)
                # Persist the raw detail HTML and keep only DB columns in p.
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b_list.append(BulletinCourt(**p))
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
        return b_list

    def parse(self):
        """Crawl 黑龙江法院网 court-session bulletins page by page.

        Queries bulletins dated from today to one year ahead (site responds
        in gb2312), storing each page's batch to MySQL before moving on.

        Removed from the original: a debug ``print(r)`` and a leftover
        ``break`` that caused only the FIRST item of page 1 to be stored,
        inconsistent with the page->=2 loop which stored every item.
        """
        log.info('开始抓取黑龙江法院网第{page}页信息'.format(page='1'))
        # Query window: today .. today + 365 days.
        ts = datetime.date.today()
        tm = datetime.date.today() + datetime.timedelta(days=365)
        self.http.http_session(self.url.format(page='1',
                                               start=str(ts),
                                               end=str(tm)),
                               'get',
                               headers=self.http.headers)
        # The site serves gb2312-encoded pages.
        self.http.set_charset('gb2312')
        r = self.http.parse_html()
        log.info('解析黑龙江法院网第{page}页信息'.format(page='1'))
        b_list = self._parse_page_items(r, '1')
        log.info('存储黑龙江法院网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total):
            try:
                log.info('开始抓取黑龙江法院网第{page}页信息'.format(page=str(total)))
                self.http.http_session(self.url.format(page=str(total),
                                                       start=str(ts),
                                                       end=str(tm)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html()
                log.info('解析黑龙江法院网第{page}页信息'.format(page=str(total)))
                b_list = self._parse_page_items(r, str(total))
                log.info('存储黑龙江法院网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()
            except Exception:
                m = traceback.format_exc()
                SpiderException(
                    m, self.taskid, self.site_name,
                    self.url.format(end=str(tm),
                                    start=str(ts),
                                    page=str(total)))
        self.mysql_client.session_close()
        log.info('抓取黑龙江法院网结束')