예제 #1
0
    def parse_html(self, links):
        """Fetch every announcement page in ``links`` and pass it to ``parse_text``.

        For each link an absolute URL is built from the host part of
        ``self.url``, the page is downloaded through ``ktgg.request_dis`` and a
        record dict (``posttime``, ``court``, ``source``, ``url``, ``title``,
        ``province``, ``body``) is handed to ``self.parse_text`` together with
        the open database handles.

        Args:
            links: iterable of URL path strings; each one is appended to
                ``http://<host of self.url>``.

        Raises:
            IndexError: if ``self.url`` has no ``//host/`` part or a page lacks
                the publish-time / title markup the regexes look for.  The
                database connection is still closed in that case.
        """
        # Open the database connection.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                url = 'http://' + re.findall('//(.*?)/', self.url)[0] + link
                text, html = ktgg.request_dis(url)
                # An empty text means there is nothing to parse for this link.
                if text == '':
                    continue

                # Extract the common fields (key order kept as in the original).
                record = {}
                record['posttime'] = re.findall(
                    '发布时间(.*?)<', html)[0].replace(':', '').strip()
                record['court'] = '长沙市望城区人民法院'
                record['source'] = self.url
                record['url'] = url
                record['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                record['province'] = '湖南省'

                # Strip every substring listed in self.tihuan from the text.
                for unwanted in self.tihuan:
                    text = text.replace(unwanted, '')
                # Guard against an empty body: fall back to the title.
                record['body'] = text if text != '' else record['title']
                self.parse_text(record, db, cursor)
        finally:
            # Close the database even when a page fails to parse.
            ktgg.clo_mysql(db, cursor)
예제 #2
0
    def parse_html(self, links):
        """Download every page in ``links`` and pass it to ``parse_text``.

        Each link is appended to ``self.url``, fetched with ``requests`` and
        decoded as gb18030, then parsed with ``etree.HTML``.  The common fields
        are collected in a dict and handed to ``self.parse_text`` together with
        the parsed tree and the open database handles.  The method sleeps one
        second after each page.

        Args:
            links: iterable of URL suffix strings appended to ``self.url``.

        Raises:
            IndexError: if a page lacks the title or publish-time nodes the
                XPath expressions select.  The database connection is still
                closed in that case.
        """
        # Open the database connection.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                url = self.url + link
                # Retry until the request succeeds.
                # NOTE(review): the retry is unbounded and has no back-off, so
                # an unreachable host blocks here forever.  Also confirm that
                # ConnectionError is requests' exception, not the builtin.
                while True:
                    try:
                        res = requests.get(
                            url, headers=self.headers, timeout=3.05)
                        res.encoding = 'gb18030'
                        html = res.text
                    except (Timeout, ConnectionError):
                        continue
                    break

                # Extract the common fields (key order kept as in the original).
                tree = etree.HTML(html)
                record = {}
                # NOTE(review): '人名' looks like a typo for '人民'; left as is
                # because existing rows may already use this value.
                record['court'] = '资兴区人名法院'
                record['source'] = self.url
                record['url'] = url
                record['title'] = tree.xpath('//font/b/text()')[0]
                record['posttime'] = tree.xpath(
                    '//p[@align="center"][3]/text()')[0].split(':')[-1]
                record['province'] = '湖南省'
                self.parse_text(tree, record, db, cursor)
                time.sleep(1)
        finally:
            # Close the database even when a page fails to parse.
            ktgg.clo_mysql(db, cursor)
예제 #3
0
    def parse_html(self, links):
        """Fetch every announcement page in ``links`` and pass it to ``parse_text``.

        Each link is appended to the court site's base URL and downloaded
        through ``ktgg.request_dis``.  The common fields are collected in a
        dict and handed to ``self.parse_text`` together with the page text, the
        ``etree``-parsed page and the open database handles.

        Args:
            links: iterable of URL path strings appended to
                ``http://bhqfy.chinacourt.gov.cn``.

        Raises:
            IndexError: if a page lacks the publish-time / title markup the
                regexes look for.  The database connection is still closed in
                that case.
        """
        # Open the database connection.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                url = 'http://bhqfy.chinacourt.gov.cn' + link
                text, html = ktgg.request_dis(url)
                # Presumably request_dis returns 0 as text when the download
                # failed -- TODO confirm against ktgg.
                if text == 0:
                    continue

                # Extract the common fields (key order kept as in the original).
                record = {}
                record['posttime'] = re.findall(
                    '发布时间(.*?)<', html)[0].replace(':', '').strip()
                record['court'] = '北湖人民法院'
                record['source'] = self.url
                record['url'] = url
                record['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                record['province'] = '湖南省'

                # An empty page text is replaced by the title.
                if text == '':
                    text = record['title']
                tree = etree.HTML(html)
                self.parse_text(text, tree, record, db, cursor)
        finally:
            # Close the database even when a page fails to parse.
            ktgg.clo_mysql(db, cursor)
예제 #4
0
    def parse_html(self, links):
        """Download every page in ``links`` and pass its record to ``parse_text``.

        Each link is appended to the court site's base URL, fetched with
        ``requests`` and decoded as gb18030, then parsed with ``etree.HTML``.
        Pages that lack the title or publish-time nodes are skipped.  For the
        rest a dict of common fields is handed to ``self.parse_text`` together
        with the open database handles.

        Args:
            links: iterable of URL path strings appended to
                ``http://czyxfy.chinacourt.gov.cn``.
        """
        # Open the database connection.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                url = 'http://czyxfy.chinacourt.gov.cn' + link
                print(url)
                # Retry until the request succeeds.
                # NOTE(review): the retry is unbounded and has no back-off, so
                # an unreachable host blocks here forever.  Also confirm that
                # ConnectionError is requests' exception, not the builtin.
                while True:
                    try:
                        res = requests.get(
                            url, headers=self.headers, timeout=3.05)
                        res.encoding = 'gb18030'
                        html = res.text
                    except (Timeout, ConnectionError):
                        continue
                    break

                tree = etree.HTML(html)
                # Only the two lookups below can raise IndexError; a page
                # without these nodes is skipped.
                try:
                    title = tree.xpath('//p[@align="center"]//b/text()')[0]
                    posttime = tree.xpath(
                        '//p[@align="center"]/text()')[0].split(':')[1]
                except IndexError:
                    continue

                # Common fields (key order kept as in the original).
                record = {}
                # NOTE(review): '人名' looks like a typo for '人民'; left as is
                # because existing rows may already use this value.
                record['court'] = '永兴人名法院'
                record['source'] = self.url
                record['url'] = url
                record['title'] = title
                record['posttime'] = posttime
                record['province'] = '湖南省'
                self.parse_text(record, db, cursor)
        finally:
            # Close the database even when a request or parse_text fails.
            ktgg.clo_mysql(db, cursor)
예제 #5
0
    def parse_html(self, links):
        """Fetch every announcement page in ``links`` and pass it to ``parse_text``.

        Each link is appended to the court site's base URL and downloaded
        through ``ktgg.request_dis``.  The common fields are collected in a
        dict; two known unwanted announcements are skipped by title, and every
        other record is handed to ``self.parse_text`` together with the page
        text and the open database handles.

        Args:
            links: iterable of URL path strings appended to
                ``http://zzxfy.chinacourt.gov.cn``.

        Raises:
            IndexError: if a page lacks the publish-time / title markup the
                regexes look for.  The database connection is still closed in
                that case.
        """
        # Titles containing one of these fragments are dropped: one is not a
        # hearing announcement, the other has its content in table form.
        skipped_title_parts = ('保护当事人的诉讼权利', '2012年8月1日至8月31日')

        # Open the database connection.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                url = 'http://zzxfy.chinacourt.gov.cn' + link
                text, html = ktgg.request_dis(url)
                # Presumably request_dis returns 0 as text when the download
                # failed -- TODO confirm against ktgg.
                if text == 0:
                    continue

                # Extract the common fields (key order kept as in the original).
                record = {}
                record['posttime'] = re.findall(
                    '发布时间(.*?)<', html)[0].replace(':', '').strip()
                record['court'] = '湖南省渌口区人民法院'
                record['source'] = self.url
                record['url'] = url
                record['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                record['province'] = '湖南省'

                if any(part in record['title'] for part in skipped_title_parts):
                    continue
                # When the page has no text, use the title as the text.
                if text == '':
                    text = record['title']
                self.parse_text(text, record, db, cursor)
        finally:
            # Close the database even when a page fails to parse.
            ktgg.clo_mysql(db, cursor)
예제 #6
0
    def parse_html(self, links):
        """Fetch every announcement page in ``links`` and pass it to ``parse_text``.

        Each link is appended to the court site's base URL and downloaded
        through ``ktgg.request_dis``.  The common fields are collected in a
        dict and handed to ``self.parse_text`` together with the page text and
        the open database handles.  When the regex yields an empty title, the
        title is read from the ``b_title`` div's ``span`` instead.

        Args:
            links: iterable of URL path strings appended to
                ``http://hnyzfy.chinacourt.gov.cn``.

        Raises:
            IndexError: if a page lacks the publish-time / title markup being
                searched for.  The database connection is still closed in that
                case.
        """
        # Open the database connection.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                url = 'http://hnyzfy.chinacourt.gov.cn' + link
                text, html = ktgg.request_dis(url)
                # Presumably request_dis returns 0 as text when the download
                # failed -- TODO confirm against ktgg.
                if text == 0:
                    continue

                # Extract the common fields (key order kept as in the original).
                record = {}
                record['posttime'] = re.findall(
                    '发布时间(.*?)<', html)[0].replace(':', '').strip()
                record['court'] = '宜章人民法院'
                record['source'] = self.url
                record['url'] = url
                record['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                if record['title'] == '':
                    # The title is wrapped in a <span> on some pages, so the
                    # regex captures nothing; read it through XPath instead.
                    tree = etree.HTML(html)
                    record['title'] = tree.xpath(
                        '//div[@class="b_title"]/span/text()')[0]
                record['province'] = '湖南省'
                self.parse_text(text, record, db, cursor)
        finally:
            # Close the database even when a page fails to parse.
            ktgg.clo_mysql(db, cursor)
예제 #7
0
    def parse_html(self, links):
        """Download every page in ``links`` and pass its text to ``parse_text``.

        Each link is appended to the court site's base URL, fetched with
        ``requests`` and decoded as gb18030, then parsed with ``etree.HTML``.
        Pages without a ``detail_content`` span are skipped.  For the rest, the
        text fragments of that span are collected into a list, with ``\\xa0``
        removed, and handed to ``self.parse_text`` together with the common
        fields and the open database handles.  The method sleeps one second
        after each processed page.

        Args:
            links: iterable of URL path strings appended to
                ``http://zyqfy.chinacourt.gov.cn``.

        Raises:
            IndexError: if a page lacks the title or publish-time nodes the
                XPath expressions select.  The database connection is still
                closed in that case.
        """
        # Open the database connection.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                url = 'http://zyqfy.chinacourt.gov.cn' + link
                # Retry until the request succeeds.
                # NOTE(review): the retry is unbounded and has no back-off, so
                # an unreachable host blocks here forever.  Also confirm that
                # ConnectionError is requests' exception, not the builtin.
                while True:
                    try:
                        res = requests.get(
                            url, headers=self.headers, timeout=3.05)
                        res.encoding = 'gb18030'
                        html = res.text
                    except (Timeout, ConnectionError):
                        continue
                    break

                # Locate the node holding the announcement text.
                tree = etree.HTML(html)
                content = tree.xpath('//span[@class="detail_content"]')
                if not content:
                    continue
                body = content[0]

                # Extract the common fields (key order kept as in the original).
                record = {}
                # NOTE(review): '人名' looks like a typo for '人民'; left as is
                # because existing rows may already use this value.
                record['court'] = '资阳区人名法院'
                record['source'] = self.url
                record['url'] = url
                record['title'] = tree.xpath(
                    '//p[@align="center"]//b/text()')[0]
                record['posttime'] = tree.xpath(
                    '//p[@align="center"]/text()')[0].split(':')[1]
                record['province'] = '湖南省'

                # Normalise the text: first direct text node, then the text of
                # every <p> child, then of every <font> child.
                info = []
                direct_text = body.xpath('./text()')
                if direct_text:
                    info.append(direct_text[0].replace('\xa0', ''))
                info.extend(
                    piece.replace('\xa0', '')
                    for piece in body.xpath('./p/text()'))
                info.extend(
                    piece.replace('\xa0', '')
                    for piece in body.xpath('./font/text()'))
                # Fall back to the title when the span holds no text at all.
                if not info:
                    info = [record['title']]
                self.parse_text(info, record, db, cursor)
                time.sleep(1)
        finally:
            # Close the database even when a page fails to parse.
            ktgg.clo_mysql(db, cursor)
예제 #8
0
    def parse_html(self, links):
        """Fetch every announcement page in ``links`` and pass it to ``parse_text``.

        Each link is appended to the court site's base URL and downloaded
        through ``ktgg.request_dis``.  The common fields are collected in a
        dict and handed to ``self.parse_text`` together with the page text and
        the open database handles.

        Args:
            links: iterable of URL path strings appended to
                ``http://sfqfy.chinacourt.gov.cn``.

        Raises:
            IndexError: if a page lacks the publish-time / title markup the
                regexes look for.  The database connection is still closed in
                that case.
        """
        # Open the database connection.
        db, cursor = ktgg.con_mysql()
        try:
            for link in links:
                url = 'http://sfqfy.chinacourt.gov.cn' + link
                text, html = ktgg.request_dis(url)
                # An empty text means there is nothing to parse for this link.
                if text == '':
                    continue

                # Extract the common fields (key order kept as in the original).
                record = {}
                record['posttime'] = re.findall(
                    '发布时间:(.*?)<', html)[0].strip()
                record['court'] = '石峰区人民法院'
                record['source'] = self.url
                record['url'] = url
                record['title'] = re.findall("'b_title'>(.*?)<", html)[0]
                record['province'] = '湖南省'
                self.parse_text(text, record, db, cursor)
        finally:
            # Close the database even when a page fails to parse.
            ktgg.clo_mysql(db, cursor)