Пример #1
0
 def get_info_urls_of_reports(self, url):
     """
     Collect the links to the per-year report pages listed at ``url``.

     :param url: target url
     :return: page_urls list - the href values matched in the report list,
         exactly as written in the page; empty when nothing was fetched
     """
     html_text = get_html_text(url)
     # get_html_text may hand back a falsy value (presumably on a failed
     # request); parsing it would fail, so report "no links" instead.
     if not html_text:
         return []
     html = etree.HTML(html_text)
     return html.xpath('//*[@id="top_bg"]/div/div[4]/div[7]/ul/li/a/@href')
def get_info_urls_of_bulletin(url):
    """
    Scrape the article links from a government bulletin page.

    The links are not plain anchors: they are read from an inline script,
    where each one appears as ``opath.push("./<relative path>")``.

    :param url: target url; every relative path is appended to it verbatim
    :return: info_urls list; empty when the page could not be fetched or
        carries no such script
    """
    page_content = get_html_text(url)
    if not page_content:
        return []
    html = etree.HTML(page_content)
    scripts = html.xpath('//div[@class="zx_zwgb_left"]//script/text()')
    if not scripts:
        return []
    # The text before the first push call is not a link, hence the [1:].
    pushed = scripts[0].split('opath.push("./')[1:]
    # Each chunk starts with the relative path and runs up to the closing '")'.
    return [url + chunk.split('")')[0] for chunk in pushed]
 def get_notification_infos(self, previous_title, url):
     """
     Scrape metadata, body text and attachments from a notification page.

     :param previous_title: issue label of the bulletin this page belongs to
     :param url: info url
     :return: base_infos, content, attachments
         base_infos - ten values in the order: issue, index number,
             province, city, document type, document number, issuing body,
             publish date, title, keywords. A value that is present is
             encoded with ``.encode('utf-8')``; a missing one is ``' '``.
         content - stripped paragraph texts joined with newlines
         attachments - list of ``[name, url]`` pairs
         ``([], '', [])`` is returned when the page could not be fetched.
     """
     page_content = get_html_text(url)
     if not page_content:
         return [], '', []
     html = etree.HTML(page_content)

     def field(position):
         # The metadata box holds one <p> per field, addressed by position.
         return html.xpath('//div[@class="xx_con"]/p[%d]/text()' % position)

     # Order: issue, index number, province, city, document type,
     # document number, issuing body, publish date, title, keywords.
     # Province and city are fixed values, not read from the page.
     raw_infos = [[previous_title], field(1), ['广东省'], ['深圳市'], field(2),
                  field(6), field(3), field(4), field(5), field(7)]
     base_infos = [
         values[0].encode('utf-8') if values else ' '
         for values in raw_infos
     ]

     # Body text: one line per paragraph.
     paragraphs = html.xpath('//div[@class="news_cont_d_wrap"]//p')
     contents = '\n'.join(
         paragraph.xpath('string(.)').strip() for paragraph in paragraphs)

     # Attachments are declared in an inline script as two ';'-separated
     # strings: linkdesc (display names) and linkurl (relative paths).
     attachments = []
     scripts = html.xpath('//div[@class="fjdown"]/script/text()')
     # A page without the attachment block simply has no attachments.
     script_str = scripts[0] if scripts else ''
     if script_str and 'var linkdesc="";' not in script_str:
         attach_names = script_str.split('var linkdesc="')[-1].split(
             '";')[0].split(';')
         attach_urls = script_str.split('var linkurl="')[-1].split(
             '";')[0].split(';')
         # Attachments live next to the page: keep the URL up to and
         # including its last '/', and swap only the final segment.
         # (str.replace on the segment would also hit earlier occurrences,
         # and misbehaves when the URL ends with '/'.)
         head, sep, _ = url.rpartition('/')
         page_dir = head + sep
         # zip pairs names with paths; surplus entries on either side are
         # ignored rather than raising.
         for raw_name, raw_path in zip(attach_names, attach_urls):
             attach_url = page_dir + raw_path.split('./')[-1]
             # Replace characters that are awkward in file names.
             attach_name = raw_name.replace('/', '-').replace(
                 '<', '(').replace('>', ')')
             attachments.append([attach_name, attach_url])
     return base_infos, contents, attachments
Пример #4
0
 def get_info_urls_of_public(self, url):
     """
     Scrape the links to notification documents from a document list page.

     :param url: target url
     :return: info_urls list; empty when the page could not be fetched
     """
     page_content = get_html_text(url)
     # get_html_text may hand back a falsy value (presumably on a failed
     # request); there is nothing to parse in that case.
     if not page_content:
         return []
     html = etree.HTML(page_content)
     hrefs = html.xpath('//div[@class="zx_ml_list"]/ul/li/div/a/@href')
     # An href containing 'http' is kept as is. Any other href is treated
     # as a '..'-relative path: the part after the last '..' is appended
     # to the site root.
     return [
         href if 'http' in href
         else 'http://www.sz.gov.cn' + href.split('..')[-1]
         for href in hrefs
     ]
Пример #5
0
 def get_info_urls_of_work(self, url):
     """
     Scrape the links and link texts of the documents on a list page.

     :param url: target url
     :return: urls list, titles list; two empty lists when the page could
         not be fetched
     """
     page_content = get_html_text(url)
     # get_html_text may hand back a falsy value (presumably on a failed
     # request); there is nothing to parse in that case.
     if not page_content:
         return [], []
     html = etree.HTML(page_content)
     hrefs = html.xpath('//div[@class="zx_ml_list"]/ul/li/span/a/@href')
     titles = html.xpath('//div[@class="zx_ml_list"]/ul/li/span/a//text()')
     # Relative hrefs are resolved against the part of the list URL that
     # precedes 'index'; it is the same for every link, so compute it once.
     base = url.split('index')[0]
     urls = [
         href if 'http' in href else base + href.split('./')[-1]
         for href in hrefs
     ]
     return urls, titles
Пример #6
0
 def get_notification_infos(self, url):
     """
     Fetch a notification page and return its body text.

     :param url: info url
     :return: the stripped text of every <p> below the ``TRS_Editor`` div,
         joined with newlines; ``''`` when nothing was fetched
     """
     page_content = get_html_text(url)
     if page_content:
         tree = etree.HTML(page_content)
         # string(.) flattens a paragraph, nested markup included, to text.
         return '\n'.join(
             node.xpath('string(.)').strip()
             for node in tree.xpath('//div[@class="TRS_Editor"]//p'))
     return ''
Пример #7
0
 def get_info_urls_of_policy(self, url):
     """
     Scrape the document links from a policy-interpretation list page.

     Each list item carries its link in an inline script, e.g.
     ``var _url = './201907/t20190708_18040234.htm';``.

     :param url: target url
     :return: info_urls list; empty when the page could not be fetched
     """
     page_content = get_html_text(url)
     # get_html_text may hand back a falsy value (presumably on a failed
     # request); there is nothing to parse in that case.
     if not page_content:
         return []
     html = etree.HTML(page_content)
     script_list = html.xpath(
         '//div[@class="zx_ml_list"]/ul/li/div/script/text()')
     # Take what sits between "var _url = '." and the closing "';" - the
     # leading '.' is dropped so the path starts with '/'.
     return [
         'http://www.sz.gov.cn/cn/xxgk/zfxxgj/zcjd'
         + script.split("var _url = '.")[-1].split("';")[0]
         for script in script_list
     ]
def get_previous_bulletin_urls(url):
    """
    Read the issue selector of a bulletin page and build the issue URLs.

    The first returned URL is ``url`` itself; the first option value is not
    used. Titles are the option texts with the first text node dropped.

    :param url: target url
    :return: option_urls list, option_titles list
    """
    page = etree.HTML(get_html_text(url))
    option_values = page.xpath('//select[@name="select3"]/option/@value')
    option_titles = page.xpath('//select[@name="select3"]/option//text()')[1:]
    # Everything in front of the '2019' path segment of the given URL.
    root = url.split('2019')[0]
    option_urls = [url]
    for position in range(1, len(option_values)):
        # Issues whose title starts with 2019 live under the '2019/'
        # directory; all others are resolved directly against the root.
        if option_titles[position].split('年')[0] == '2019':
            base = root + '2019/'
        else:
            base = root
        option_urls.append(base + option_values[position].split('./')[-1])
    return option_urls, option_titles