def get_info_urls_of_reports(self, url):
    """Collect the links to the yearly report pages.

    :param url: target url
    :return: page_urls list
    """
    page = etree.HTML(get_html_text(url))
    # Report links sit in a fixed position of the page layout.
    return page.xpath('//*[@id="top_bg"]/div/div[4]/div[7]/ul/li/a/@href')
def get_info_urls_of_bulletin(url):
    """Scrape the links to individual items on a government-gazette page.

    The page embeds its links inside a <script> element as a series of
    ``opath.push("./...")`` calls, so the hrefs are recovered from the
    script text rather than from anchor tags.

    :param url: target url
    :return: info_urls list
    """
    doc = etree.HTML(get_html_text(url))
    script_text = doc.xpath('//div[@class="zx_zwgb_left"]//script/text()')[0]
    info_urls = []
    # Each fragment after a push("./ starts with the relative path,
    # terminated by the closing ") of the call.
    for fragment in script_text.split('opath.push("./')[1:]:
        info_urls.append(url + fragment.split('")')[0])
    return info_urls
def get_notification_infos(self, previous_title, url):
    """Scrape the metadata, body text and attachments of a notification page.

    :param previous_title: issue label of the bulletin the page belongs to
    :param url: info url
    :return: (base_infos, contents, attachments) where base_infos is the
             10-field metadata row, contents is the newline-joined body
             text, and attachments is a list of [name, url] pairs.
             Returns ([], '', []) when the page could not be fetched.
    """
    page_content = get_html_text(url)
    if not page_content:
        return [], '', []
    html = etree.HTML(page_content)
    # Metadata fields, in order:
    # ['期数', '索引号', '省份', '城市', '文件类型', '文号', '发布机构', '发布日期', '标题', '主题词']
    index = html.xpath('//div[@class="xx_con"]/p[1]/text()')
    aspect = html.xpath('//div[@class="xx_con"]/p[2]/text()')
    announced_by = html.xpath('//div[@class="xx_con"]/p[3]/text()')
    announced_date = html.xpath('//div[@class="xx_con"]/p[4]/text()')
    title = html.xpath('//div[@class="xx_con"]/p[5]/text()')
    document_num = html.xpath('//div[@class="xx_con"]/p[6]/text()')
    key_word = html.xpath('//div[@class="xx_con"]/p[7]/text()')
    base_infos = [[previous_title], index, ['广东省'], ['深圳市'], aspect,
                  document_num, announced_by, announced_date, title, key_word]
    # Flatten: first text node per field, or ' ' when the field is missing.
    # NOTE(review): .encode('utf-8') yields bytes mixed with the str ' '
    # placeholder — looks like Python 2 legacy; kept as-is for callers.
    base_infos = list(
        map(lambda x: x[0].encode('utf-8') if len(x) > 0 else ' ', base_infos))

    # Body text: one entry per <p>, joined with newlines.
    paragraphs = html.xpath('//div[@class="news_cont_d_wrap"]//p')
    contents = '\n'.join(p.xpath('string(.)').strip() for p in paragraphs)

    # Attachments are declared in a script block as two parallel
    # semicolon-separated variables: linkdesc (names) and linkurl (paths).
    attachments = []
    script_nodes = html.xpath('//div[@class="fjdown"]/script/text()')
    # Guard the [0]: some pages have no attachment block at all.
    script_str = script_nodes[0] if script_nodes else 'var linkdesc="";'
    if script_str.find('var linkdesc="";') == -1:
        attach_names = script_str.split('var linkdesc="')[-1].split(
            '";')[0].split(';')
        attach_urls = script_str.split('var linkurl="')[-1].split(
            '";')[0].split(';')
        suffix = url.split('/')[-1]
        # zip avoids an IndexError when the two lists differ in length
        # (the original indexed names by len(urls)).
        for attach_name, attach_path in zip(attach_names, attach_urls):
            attach_url = url.replace(suffix, attach_path.split('./')[-1])
            # Sanitize characters that are illegal in file names.
            attach_name = attach_name.replace('/', '-').replace(
                '<', '(').replace('>', ')')
            attachments.append([attach_name, attach_url])
    return base_infos, contents, attachments
def get_info_urls_of_public(self, url):
    """Scrape the links to notification documents on a government-files page.

    :param url: target url
    :return: info_urls list
    """
    doc = etree.HTML(get_html_text(url))
    raw_links = doc.xpath('//div[@class="zx_ml_list"]/ul/li/div/a/@href')
    resolved = []
    for link in raw_links:
        if 'http' in link:
            # Already absolute — keep as-is.
            resolved.append(link)
        else:
            # Relative ../.. path: graft onto the site root.
            resolved.append('http://www.sz.gov.cn' + link.split('..')[-1])
    return resolved
def get_info_urls_of_work(self, url):
    """Scrape the links and titles of notification documents on a listing page.

    :param url: target url (an index page; relative links are resolved
                against everything before 'index' in it)
    :return: (info_urls list, titles list)
    """
    doc = etree.HTML(get_html_text(url))
    hrefs = doc.xpath('//div[@class="zx_ml_list"]/ul/li/span/a/@href')
    titles = doc.xpath('//div[@class="zx_ml_list"]/ul/li/span/a//text()')
    base = url.split('index')[0]
    resolved = [
        href if 'http' in href else base + href.split('./')[-1]
        for href in hrefs
    ]
    return resolved, titles
def get_notification_infos(self, url):
    """Scrape the plain-text body of a notification page.

    :param url: info url
    :return: the body text with paragraphs joined by newlines, or an
             empty string when the page could not be fetched
    """
    page_content = get_html_text(url)
    if not page_content:
        return ''
    doc = etree.HTML(page_content)
    # One text chunk per paragraph of the editor-generated content div.
    chunks = [
        node.xpath('string(.)').strip()
        for node in doc.xpath('//div[@class="TRS_Editor"]//p')
    ]
    return '\n'.join(chunks)
def get_info_urls_of_policy(self, url):
    """Scrape the links to notification documents on a policy-reading page.

    The hrefs are not in anchor tags but inside per-item scripts such as
    ``var _url = './201907/t20190708_18040234.htm';``.

    :param url: target url
    :return: info_urls list
    """
    doc = etree.HTML(get_html_text(url))
    script_list = doc.xpath(
        '//div[@class="zx_ml_list"]/ul/li/div/script/text()')
    base = 'http://www.sz.gov.cn/cn/xxgk/zfxxgj/zcjd'
    info_urls = []
    for script in script_list:
        # Keep the leading '/' of the relative path by splitting on "'."
        relative = script.split("var _url = '.")[-1].split("';")[0]
        info_urls.append(base + relative)
    return info_urls
def get_previous_bulletin_urls(url):
    """Fetch the urls of past bulletin issues.

    :param url: target url
    :return: option_urls list, option_titles list
    """
    html_text = get_html_text(url)
    html = etree.HTML(html_text)
    # Past issues are listed in a <select name="select3"> dropdown.
    links = html.xpath('//select[@name="select3"]/option/@value')
    # First title is skipped — presumably a placeholder/current-issue
    # entry. NOTE(review): titles are sliced with [1:] but links are
    # not, yet both are indexed by the same l below; confirm the
    # intended alignment (possible off-by-one).
    option_titles = html.xpath('//select[@name="select3"]/option//text()')[1:]
    # The current page itself counts as the first issue url.
    option_urls = [url]
    for l in range(1, len(links)):
        # 2019 issues live under an extra '2019/' path segment relative
        # to the part of the url before '2019'.
        if option_titles[l].split('年')[0] == '2019':
            option_urls.append(
                url.split('2019')[0] + '2019/' + links[l].split('./')[-1])
        else:
            option_urls.append(url.split('2019')[0] + links[l].split('./')[-1])
    # print(option_urls)
    return option_urls, option_titles