def parse_html(self, links):
    """Fetch each announcement page in ``links`` and pass it to ``parse_text``.

    For every link the page is requested through ``ktgg.request_dis``, the
    publish time and title are extracted from the raw HTML with regular
    expressions, every substring listed in ``self.tihuan`` is removed from
    the page text, and the resulting record is handed to
    ``self.parse_text`` together with the database handles.

    Args:
        links: Iterable of strings that are appended to the scheme and host
            taken from ``self.url`` to form the page URL.

    Raises:
        IndexError: If ``self.url`` contains no ``//host/`` part, or a
            fetched page lacks the publish-time or title marker. The
            database handles are still released in that case.
    """
    # Connect to the database.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://' + re.findall('//(.*?)/', self.url)[0] + link
            text, html = ktgg.request_dis(url)
            # An empty text means there is nothing to store for this page.
            if text == '':
                continue

            # Extract the record fields (key order kept as in the original).
            d = {
                'posttime': re.findall('发布时间(.*?)<', html)[0].replace(':', '').strip(),
                'court': '长沙市望城区人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
                'province': '湖南省',
            }

            # Strip the unwanted fragments; a dedicated loop name avoids
            # shadowing the outer link variable.
            for fragment in self.tihuan:
                text = text.replace(fragment, '')

            # Never store an empty body: fall back to the title.
            d['body'] = text if text != '' else d['title']

            self.parse_text(d, db, cursor)
    finally:
        # Close the database even when a page raised, so the handles
        # are not leaked.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Fetch each announcement page in ``links`` and pass it to ``parse_text``.

    Every link is appended to ``http://bhqfy.chinacourt.gov.cn`` and
    requested through ``ktgg.request_dis``. The publish time and title are
    extracted from the raw HTML with regular expressions, the HTML is
    parsed with ``etree.HTML``, and the text, parsed tree, record dict and
    database handles are handed to ``self.parse_text``.

    Args:
        links: Iterable of strings appended to the site root to form the
            page URL.

    Raises:
        IndexError: If a fetched page lacks the publish-time or title
            marker. The database handles are still released in that case.
    """
    # Connect to the database.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://bhqfy.chinacourt.gov.cn' + link
            text, html = ktgg.request_dis(url)
            # request_dis yields 0 as the text when the page should be
            # skipped (presumably a failed request -- confirm in ktgg).
            if text == 0:
                continue

            # Extract the record fields (key order kept as in the original).
            d = {
                'posttime': re.findall('发布时间(.*?)<', html)[0].replace(':', '').strip(),
                'court': '北湖人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
                'province': '湖南省',
            }

            # Use the title when the page has no text of its own.
            if text == '':
                text = d['title']

            # Keep the raw string and the parsed tree under separate names.
            tree = etree.HTML(html)
            self.parse_text(text, tree, d, db, cursor)
    finally:
        # Close the database even when a page raised, so the handles
        # are not leaked.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Fetch each announcement page in ``links`` and pass it to ``parse_text``.

    Every link is appended to ``http://zzxfy.chinacourt.gov.cn`` and
    requested through ``ktgg.request_dis``. The publish time and title are
    extracted from the raw HTML with regular expressions; pages whose title
    matches one of two hard-coded exclusions are skipped, and the rest are
    handed to ``self.parse_text`` with the database handles.

    Args:
        links: Iterable of strings appended to the site root to form the
            page URL.

    Raises:
        IndexError: If a fetched page lacks the publish-time or title
            marker. The database handles are still released in that case.
    """
    # Titles of two known pages that must not be stored: one is not a
    # hearing announcement, the other has its content laid out as a table.
    excluded_title_parts = ('保护当事人的诉讼权利', '2012年8月1日至8月31日')

    # Connect to the database.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://zzxfy.chinacourt.gov.cn' + link
            text, html = ktgg.request_dis(url)
            # request_dis yields 0 as the text when the page should be
            # skipped (presumably a failed request -- confirm in ktgg).
            if text == 0:
                continue

            # Extract the record fields (key order kept as in the original).
            d = {
                'posttime': re.findall('发布时间(.*?)<', html)[0].replace(':', '').strip(),
                'court': '湖南省渌口区人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
                'province': '湖南省',
            }

            # Use the title when the page has no text of its own.
            if text == '':
                text = d['title']

            # Special case: drop the two excluded pages.
            if any(part in d['title'] for part in excluded_title_parts):
                continue

            self.parse_text(text, d, db, cursor)
    finally:
        # Close the database even when a page raised, so the handles
        # are not leaked.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Fetch each announcement page in ``links`` and pass it to ``parse_text``.

    Every link is appended to ``http://hnyzfy.chinacourt.gov.cn`` and
    requested through ``ktgg.request_dis``. The publish time and title are
    extracted from the raw HTML with regular expressions; when the regex
    title is empty, the title is read from the ``<span>`` inside the
    ``b_title`` div instead. The record is then handed to
    ``self.parse_text`` with the database handles.

    Args:
        links: Iterable of strings appended to the site root to form the
            page URL.

    Raises:
        IndexError: If a fetched page lacks the publish-time or title
            marker, or the fallback XPath finds no title. The database
            handles are still released in that case.
    """
    # Connect to the database.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://hnyzfy.chinacourt.gov.cn' + link
            text, html = ktgg.request_dis(url)
            # request_dis yields 0 as the text when the page should be
            # skipped (presumably a failed request -- confirm in ktgg).
            if text == 0:
                continue

            # Extract the record fields (key order kept as in the original).
            d = {
                'posttime': re.findall('发布时间(.*?)<', html)[0].replace(':', '').strip(),
                'court': '宜章人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
            }

            # The regex captures an empty string when the title text is
            # wrapped in a nested <span>; read it through XPath instead.
            if d['title'] == '':
                tree = etree.HTML(html)
                d['title'] = tree.xpath('//div[@class="b_title"]/span/text()')[0]

            d['province'] = '湖南省'
            self.parse_text(text, d, db, cursor)
    finally:
        # Close the database even when a page raised, so the handles
        # are not leaked.
        ktgg.clo_mysql(db, cursor)
def parse_html(self, links):
    """Fetch each announcement page in ``links`` and pass it to ``parse_text``.

    Every link is appended to ``http://sfqfy.chinacourt.gov.cn`` and
    requested through ``ktgg.request_dis``. The publish time and title are
    extracted from the raw HTML with regular expressions, and the record is
    handed to ``self.parse_text`` with the database handles.

    Args:
        links: Iterable of strings appended to the site root to form the
            page URL.

    Raises:
        IndexError: If a fetched page lacks the publish-time or title
            marker. The database handles are still released in that case.
    """
    # Connect to the database.
    db, cursor = ktgg.con_mysql()
    try:
        for link in links:
            url = 'http://sfqfy.chinacourt.gov.cn' + link
            text, html = ktgg.request_dis(url)
            # An empty text means there is nothing to store for this page.
            if text == '':
                continue

            # Extract the record fields (key order kept as in the original).
            d = {
                'posttime': re.findall('发布时间:(.*?)<', html)[0].strip(),
                'court': '石峰区人民法院',
                'source': self.url,
                'url': url,
                'title': re.findall("'b_title'>(.*?)<", html)[0],
                'province': '湖南省',
            }

            self.parse_text(text, d, db, cursor)
    finally:
        # Close the database even when a page raised, so the handles
        # are not leaked.
        ktgg.clo_mysql(db, cursor)