Пример #1
0
def pingxiang2():
    """Crawl the pxdpc.pingxiang.gov.cn news list (classid=15) into MySQL.

    The first listing page supplies the total page count.  Every listing page
    is then fetched, its article rows are extracted with a regex, and each
    article whose title is not yet known to ``Mysql.select_xinwen`` is
    downloaded and stored through ``Mysql.insert_xinwen_baseinfo`` and
    ``Mysql.insert_xinwen_detailinfo``.

    Returns:
        None.  Progress is reported with ``print``.
    """
    url = 'http://pxdpc.pingxiang.gov.cn/list.asp?classid=15'
    tt = requests.get(url).content.decode('utf-8')
    # Raw strings: "\d" and "\[" are regex escapes, not string escapes.
    page_counts = re.findall(r'每页20条, 1/(\d+)页', tt)
    if not page_counts:
        # Layout changed or the request was blocked: nothing to iterate over.
        print('未找到总页数')
        return
    pages = page_counts[0]
    print(f'共{pages}页')
    row_pattern = r'&nbsp;                    <a href="(.*?)" target="_blank">(.*?)</a></td>                  <td width="11%" class="font_hui12">\[(.*?)\]</td>'
    for page in range(1, int(pages) + 1):
        url1 = url + f'&p={page}'
        # Line breaks and tabs are stripped so the row regex can span markup
        # that was originally split over several lines.
        tt = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
        for href, title, raw_date in re.findall(row_pattern, tt):
            linkurl = 'http://pxdpc.pingxiang.gov.cn/' + href
            publicTime = raw_date.replace('                    ', '')
            # Check for an existing title first so that known articles are
            # not downloaded again.  ``not`` covers both a None and an empty
            # result from the helper.
            if Mysql.select_xinwen(title=title):
                print(f'第{page}页标题存在')
                continue
            # Make upload links absolute so they still resolve once the HTML
            # is stored outside the site.
            detail_res = requests.get(linkurl).content.decode('utf-8').replace(
                '/upload/', 'http://pxdpc.pingxiang.gov.cn/upload/')
            Html = etree.HTML(detail_res)
            nodes = Html.xpath("/html/body/div[5]")
            if not nodes:
                # Skip a page with an unexpected layout instead of aborting
                # the whole crawl with an IndexError.
                print(f'第{page}页未找到正文: {linkurl}')
                continue
            # Quotes are blanked out before storage, as in the original.
            # NOTE(review): presumably because the Mysql helper builds SQL by
            # string formatting -- confirm.
            infocontent = html.unescape(
                etree.tostring(nodes[0], method='html').decode()).replace("'", " ").replace('"', ' ')
            uid = uuid.uuid4()
            Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                         publicTime=publicTime, linkurl=linkurl, title=title,
                                         dataResource='', yewuType='发改委', infoType='', infoState='', isok='',
                                         isdeal='')
            Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
        print(f'第{page}页已爬完')
Пример #2
0
def _zhangjiakou_crawl():
    """Walk the zjk.gov.cn listing pages once and store every unseen article."""
    # Two listing layouts are in use; every row matched by either is stored.
    row_patterns = (
        r'"hg" href="(.*?)" target="_blank" title="(.*?)">(.*?)</a></td>                    <td width="80" class="cdate">\[(.*?)\]</td>',
        r'hg" href="(.*?)" title="(.*?)" target="_blank">(.*?)</a></td>              <td width="100" class="cdate">\[(.*?)\]</td>',
    )
    for page in range(1, 374):
        url1s = [
            f'http://www.zjk.gov.cn/syscolumn/dt/zjkyw/index_{page}.html',  # Zhangjiakou headlines
            f'http://www.zjk.gov.cn/bmgz_frame1.jsp?pages={page}',  # department work
        ]
        if page == 1:
            # The un-numbered index page does not depend on ``page``; fetch
            # it once instead of on every iteration.
            url1s.insert(0, 'http://www.zjk.gov.cn/syscolumn/dt/zjkyw/index.html')
        for url1 in url1s:
            contents1 = requests.get(
                url1, proxies=ipmax()).content.decode('utf-8').replace(
                    '\n', '').replace('\r', '').replace('\t', '')
            for pattern in row_patterns:
                # Iterate over all rows; taking only element [0] would store
                # a single article per listing page.
                for href, raw_title, _text, raw_date in re.findall(pattern, contents1):
                    linkurl = 'http://www.zjk.gov.cn' + href.strip()
                    title = raw_title.strip()
                    publicTime = raw_date.strip()
                    # ``not`` covers both a None and an empty result.
                    if Mysql.select_xinwen(title=title):
                        print('标题存在')
                        continue
                    detail_res = requests.get(linkurl).content.decode('utf-8')
                    Html = etree.HTML(detail_res)
                    # The whole document is stored, with quotes blanked out.
                    infocontent = html.unescape(
                        etree.tostring(Html, method='html').decode()).replace(
                            "'", " ").replace('"', ' ')
                    uid = uuid.uuid4()
                    # Region fields describe Zhangjiakou; the previous values
                    # (承德市 / 067000) were copied from the Chengde crawler.
                    Mysql.insert_xinwen_baseinfo(uid=uid,
                                                 regionCode='075000',
                                                 regionName='河北省',
                                                 areaRegion='张家口市',
                                                 publicTime=publicTime,
                                                 linkurl=linkurl,
                                                 title=title,
                                                 dataResource='',
                                                 yewuType='',
                                                 infoType='',
                                                 infoState='',
                                                 isok='',
                                                 isdeal='')
                    Mysql.insert_xinwen_detailinfo(uid=uid,
                                                   infocontent=infocontent)


def zhangjiakou():
    """Crawl Zhangjiakou government news into MySQL, retrying on failure.

    Any exception restarts the crawl from the first page; titles that are
    already stored are skipped by the duplicate check.  The retry is a
    bounded loop rather than unbounded recursion, so a persistent error can
    no longer exhaust the call stack.

    Returns:
        None.
    """
    max_attempts = 5
    for _ in range(max_attempts):
        try:
            _zhangjiakou_crawl()
            return
        except Exception as e:
            print('张家口\t', e)
Пример #3
0
def shijiazhuang():
    """Crawl sjz.gov.cn news columns into MySQL.

    For every configured column the page count is read from the first page,
    each paged listing is fetched, and every article whose title is not yet
    stored is downloaded and inserted through the ``Mysql`` helper.

    Returns:
        None.  Progress is reported with ``print``.
    """
    url1s = [
        # 'http://www.sjz.gov.cn/column.jsp?id=1490076462404',  # municipal headlines (disabled)
        'http://www.sjz.gov.cn/column.jsp?id=1490076534390',  # department news
        'http://www.sjz.gov.cn/column.jsp?id=1490076571666',  # district / county news
    ]
    row_pattern = r'href="(.*?)" target="_blank"  style="line-height:30px;" title="(.*?)">(.*?)</a>&nbsp;<span class="date" style="color:#898989">(.*?)</span>'
    for url1 in url1s:
        tt = requests.get(url1).content.decode('gb2312')
        page_counts = re.findall(r"title='每页显示.*记录'>共.*条(\d+)页", tt)
        if not page_counts:
            print('未找到总页数')
            continue
        for page in range(1, int(page_counts[0]) + 1):
            url = f'{url1}&current={page}'
            # Request the paged URL; fetching ``url1`` here would re-read the
            # first page on every iteration.
            contents1 = requests.get(url).content.decode('gb2312').replace(
                '\n', '').replace('\r', '').replace('\t', '')
            contents2 = re.findall(
                r'1 list_2"><ul>(.*?)/ul></div></div><div style="text-align:',
                contents1)
            if not contents2:
                # Listing container not found on this page; skip it rather
                # than abort the crawl with an IndexError.
                print('未找到列表')
                continue
            for href, title, _text, publicTime in re.findall(row_pattern, contents2[0]):
                linkurl = 'http://www.sjz.gov.cn' + href
                # ``not`` covers both a None and an empty result.
                if Mysql.select_xinwen(title=title):
                    print('标题存在')
                    continue
                detail_res = requests.get(linkurl).content.decode('gb2312')
                Html = etree.HTML(detail_res)
                nodes = Html.xpath("/html/body/div/div[2]")
                if not nodes:
                    print(f'未找到正文: {linkurl}')
                    continue
                # Quotes are blanked out before storage, as in the original.
                infocontent = html.unescape(
                    etree.tostring(nodes[0], method='html').decode()).replace(
                        "'", " ").replace('"', ' ')
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid,
                                             regionCode='050000',
                                             regionName='河北省',
                                             areaRegion='石家庄市',
                                             publicTime=publicTime,
                                             linkurl=linkurl,
                                             title=title,
                                             dataResource='',
                                             yewuType='',
                                             infoType='',
                                             infoState='',
                                             isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid,
                                               infocontent=infocontent)
Пример #4
0
def chengde():
    """Crawl chengde.gov.cn news listings into MySQL.

    Pages 1..373 of each configured column are fetched.  Every listing row
    is parsed for link, title and date, and articles whose title is not yet
    stored are downloaded and inserted through the ``Mysql`` helper.

    Returns:
        None.  Progress is reported with ``print``.
    """
    # Raw strings keep "\'" and "\[" as regex escapes.
    link_pattern = r"href=\'(.*?)\'title=\'(.*?)\'target"
    date_pattern = r'target="_blank">(.*?)</a><span class="bt-data-time"style="font-size:14px;">\[(.*?)\]'
    for page in range(1, 374):
        url1s = [
            f'http://www.chengde.gov.cn/col/col360/index.html?uid=1412&pageNum={page}',  # city headlines  1361
            # 'http://www.chengde.gov.cn/col/col361/index.html?uid=1412&pageNum={page}',    # Chengde in outside media  367
            # 'http://www.chengde.gov.cn/col/col362/index.html?uid=1412&pageNum={page}',    # Chengde in outside media  374
            # 'http://www.chengde.gov.cn/col/col364/index.html?uid=1412&pageNum={page}',    # public notices    27
        ]
        for url1 in url1s:
            contents1 = requests.get(url1).content.decode('utf-8').replace(
                '\n', '').replace('\r', '').replace('\t', '')
            for content in re.findall('pan><a (.*?)</span>', contents1):
                links = re.findall(link_pattern, content)
                dates = re.findall(date_pattern, content)
                if not links or not dates:
                    # A fragment that is not an article row; skipping it
                    # keeps one odd row from aborting the whole crawl.
                    continue
                href, title = links[0]
                publicTime = dates[0][1]
                linkurl = 'http://www.chengde.gov.cn' + href
                # ``not`` covers both a None and an empty result; checking
                # first avoids downloading articles that are already stored.
                if Mysql.select_xinwen(title=title):
                    print('标题存在')
                    continue
                detail_res = requests.get(linkurl).content.decode('utf-8')
                Html = etree.HTML(detail_res)
                # The whole document is stored, with quotes blanked out.
                infocontent = html.unescape(
                    etree.tostring(Html, method='html').decode()).replace(
                        "'", " ").replace('"', ' ')
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid,
                                             regionCode='067000',
                                             regionName='河北省',
                                             areaRegion='承德市',
                                             publicTime=publicTime,
                                             linkurl=linkurl,
                                             title=title,
                                             dataResource='',
                                             yewuType='',
                                             infoType='',
                                             infoState='',
                                             isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid,
                                               infocontent=infocontent)
Пример #5
0
def shengyw():
    """Crawl the hebei.gov.cn news column (pageId 13871225) into MySQL.

    The page count comes from the ``totalpage`` attribute of the index page.
    Each listing page is fetched, rows are parsed with a regex, and articles
    whose title is not yet stored are downloaded and inserted through the
    ``Mysql`` helper.

    Returns:
        None.  Progress is reported with ``print``.
    """
    url = 'http://www.hebei.gov.cn/hebei/13863674/13871225/index.html'
    tt = requests.get(url).content.decode('utf-8')
    # Raw strings: "\d" and "\(" are regex escapes, not string escapes.
    page_counts = re.findall(r'totalpage="(\d+)"', tt)
    if not page_counts:
        print('未找到总页数')
        return
    row_pattern = r'<a href="(.*?)" onclick="void\(0\)" target="_blank" title="(.*?)" istitle="true">(.*?)</a> <span class="date" style="font-size: 12px;color: #898989;padding-left: 5px;">(.*?)</span> </li>'
    for page in range(1, int(page_counts[0]) + 1):
        url1 = f'http://www.hebei.gov.cn/eportal/ui?pageId=13871225&currentPage={page}'
        tt = requests.get(url1).content.decode('utf-8').replace(
            '\n', '').replace('\r', '').replace('\t', '')
        for href, title, _text, publicTime in re.findall(row_pattern, tt):
            linkurl = 'http://www.hebei.gov.cn' + href
            # Duplicate check first, so stored articles are not re-downloaded.
            if Mysql.select_xinwen(title=title):
                print('标题存在')
                continue
            detail_res = requests.get(linkurl).content.decode('utf-8')
            Html = etree.HTML(detail_res)
            nodes = Html.xpath(
                '//*[@id="fadd83fc626241d9937b20353ca675eb"]/div[2]')
            if not nodes:
                # Unexpected article layout; skip instead of raising.
                print(f'未找到正文: {linkurl}')
                continue
            # Quotes are blanked out before storage, as in the original.
            infocontent = html.unescape(
                etree.tostring(nodes[0], method='html').decode()).replace(
                    "'", " ").replace('"', ' ')
            uid = uuid.uuid4()
            Mysql.insert_xinwen_baseinfo(uid=uid,
                                         regionCode='050000-075000',
                                         regionName='河北省',
                                         areaRegion='河北省',
                                         publicTime=publicTime,
                                         linkurl=linkurl,
                                         title=title,
                                         dataResource='',
                                         yewuType='',
                                         infoType='',
                                         infoState='',
                                         isok='',
                                         isdeal='')
            Mysql.insert_xinwen_detailinfo(uid=uid,
                                           infocontent=infocontent)
Пример #6
0
def guo():  # State Council news
    """Crawl the sousuo.gov.cn column 19423 listing into MySQL.

    Listing pages are numbered from 0.  The page count is read from page 0,
    then every page is fetched, rows are parsed with a regex, and articles
    whose title is not yet stored are downloaded and inserted through the
    ``Mysql`` helper.

    Returns:
        None.  Progress is reported with ``print``.
    """
    url = 'http://sousuo.gov.cn/column/19423/0.htm'
    tt = requests.get(url).content.decode('utf-8')
    page_counts = re.findall(r'共(\d+)页', tt)
    if not page_counts:
        print('未找到总页数')
        return
    row_pattern = r'<li><h4><a href="(.*?)" target="_blank">(.*?)</a><span class="date">(.*?)</span></h4></li>'
    for page in range(int(page_counts[0])):
        url1 = f'http://sousuo.gov.cn/column/19423/{page}.htm'
        tt1 = requests.get(url1).content.decode('utf-8').replace(
            '\n', '').replace('\r', '').replace('\t', '')
        # The listing already carries absolute article URLs.
        for linkurl, title, publicTime in re.findall(row_pattern, tt1):
            # Duplicate check first, so stored articles are not re-downloaded.
            if Mysql.select_xinwen(title=title):
                print('标题存在')
                continue
            detail_res = requests.get(linkurl).content.decode('utf-8')
            Html = etree.HTML(detail_res)
            nodes = Html.xpath('/html/body/div[3]/div[2]/div[1]')
            if not nodes:
                # Unexpected article layout; skip instead of raising.
                print(f'未找到正文: {linkurl}')
                continue
            # Quotes are blanked out before storage, as in the original.
            infocontent = html.unescape(
                etree.tostring(nodes[0], method='html').decode()).replace(
                    "'", " ").replace('"', ' ')
            uid = uuid.uuid4()
            Mysql.insert_xinwen_baseinfo(uid=uid,
                                         regionCode='000000',
                                         regionName='国务院',
                                         areaRegion='全国',
                                         publicTime=publicTime,
                                         linkurl=linkurl,
                                         title=title,
                                         dataResource='',
                                         yewuType='',
                                         infoType='',
                                         infoState='',
                                         isok='',
                                         isdeal='')
            Mysql.insert_xinwen_detailinfo(uid=uid,
                                           infocontent=infocontent)
Пример #7
0
def _pingxiang_crawl():
    """Walk the three jxsggzy.cn news columns once and store unseen articles."""
    row_pattern = r'<li class="ewb-list-node clearfix">                            <a href="(.*?)"  title="(.*?)" target="_blank" class="ewb-list-name">(.*?)</a>                            <span class="ewb-list-date">(.*?)</span> '
    for num in range(1, 4):
        url = f'http://www.jxsggzy.cn/web/xwzx/00700{num}/1.html'
        tt = requests.get(url).content.decode('utf-8')
        pages = re.findall(r'id="index">1/(\d+)</span>', tt)[0]
        print(f'江西省公共交易中心共{pages}页')
        for page in range(1, int(pages) + 1):
            url1 = url.replace('1.html', f'{page}.html')
            tt = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
            # Iterate over every row; starting at index 1 would silently drop
            # the first article of each page.
            for href, title, _text, publicTime in re.findall(row_pattern, tt):
                linkurl = 'http://www.jxsggzy.cn' + href
                # Duplicate check first, so stored items are neither
                # re-downloaded nor re-saved.  ``not`` covers both a None and
                # an empty result.
                if Mysql.select_xinwen(title=title):
                    print(f'{num} 标题【{title}】存在')
                    continue
                if re.findall('pdf|doc', href):
                    # Attachments are stored as an <embed> stub and the file
                    # itself is saved locally.
                    # NOTE(review): the file is saved with a ".jpg" suffix
                    # although the link is a pdf/doc -- confirm whether
                    # anything downstream depends on that name.
                    infocontent = '<embed src="' + linkurl + '" >'
                    urllib.request.urlretrieve(quote(linkurl, safe='/:?='), r'D:\lm\xinwen\江西省公共资源交易中心\\' + title + '.jpg')
                else:
                    detail_res = requests.get(linkurl).content.decode('utf-8')
                    Html = etree.HTML(detail_res)
                    # The whole document is stored, with quotes blanked out.
                    infocontent = html.unescape(etree.tostring(Html, method='html').decode()).replace("'", " ").replace(
                        '"', ' ')
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                             publicTime=publicTime, linkurl=linkurl, title=title,
                                             dataResource='', yewuType='江西省公共交易中心', infoType='', infoState='', isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
                print(f'{num} 标题【{title}】写入成功')
            print('-'*50+f'{num} 江西省公共交易中心第{page}页已写完'+'-'*50)


def pingxiang():
    """Crawl jxsggzy.cn news (columns 007001-007003) into MySQL, with retries.

    Any exception restarts the crawl from the beginning; titles that are
    already stored are skipped by the duplicate check.  The retry is a
    bounded loop rather than unbounded recursion, so a persistent error can
    no longer exhaust the call stack.

    Returns:
        None.
    """
    max_attempts = 5
    for _ in range(max_attempts):
        try:
            _pingxiang_crawl()
            return
        except Exception as e:
            print('江西省公共交易中心\t', e)
Пример #8
0
def _ganzhou_crawl():
    """Walk the configured ganzhou.gov.cn columns once and store new titles."""
    url1s = [
        'http://www.ganzhou.gov.cn/c100022/list.shtml',  # government affairs news
        'http://www.ganzhou.gov.cn/c100023/list.shtml',  # notices and announcements
        # 'http://www.ganzhou.gov.cn/c100024/list_bmqx.shtml',  # department news
        # 'http://www.ganzhou.gov.cn/c100025/list_bmqx.shtml',  # district / county news
        # 'http://www.ganzhou.gov.cn/c100026/list.shtml',  # public service tips
        # 'http://www.ganzhou.gov.cn/c100027/list.shtml',  # recommended by the central site
        # 'http://www.ganzhou.gov.cn/c100028/list.shtml',  # recommended by the provincial site
        # 'http://www.ganzhou.gov.cn/c100029/list.shtml',  # outside media
        # 'http://www.ganzhou.gov.cn/c100030/list.shtml',  # press conferences
        # 'http://www.ganzhou.gov.cn/c100032/list.shtml',  # special topics
    ]
    for url1 in url1s:
        print("程序已启动,稍等几秒")

        for page in range(1, 37):
            if page == 1:
                tt = requests.get(url1, proxies=ipmax()).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
            else:
                # Later pages are named list_<n>.shtml / list_bmqx_<n>.shtml.
                url2 = url1.replace('list.shtml', f'list_{page}.shtml').replace('bmqx.shtml', f'bmqx_{page}.shtml')
                tt = requests.get(url2).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
            contents1 = re.findall('<div class="bd">(.*?)text/javascript', tt)
            if not contents1:
                # Listing container missing; skip this page instead of
                # failing the whole crawl with an IndexError.
                print(f'赣州市第{page}页未找到列表')
                continue
            contents = re.findall('<li><a href="(.*?)" target="_blank" title=\'(.*?)\'  >(.*?)</a><span>(.*?)</span>', contents1[0])
            for content in contents:
                # WeChat articles carry an absolute URL; everything else is
                # relative to the site root.  The article body is not
                # downloaded here; only the listing metadata is stored.
                if re.search('mp.weixin', content[0]):
                    linkurl = content[0]
                else:
                    linkurl = 'http://www.ganzhou.gov.cn' + content[0]
                # NOTE(review): as written this replace is a no-op;
                # presumably it was meant to normalise a full-width colon --
                # confirm against the original file encoding.
                title = content[1].replace(':',':')
                publicTime = content[3]
                s = publicTime.replace('/', '-')
                t = int(datetime.strptime(s, '%Y-%m-%d').timestamp())
                # Cut-off epoch second; 1570896000 is 2019-10-12 16:00 UTC.
                if t < 1570896000:
                    # Stop scanning this page at the first row older than
                    # the cut-off (later pages are still visited).
                    break
                # ``not`` covers both a None and an empty result.
                if Mysql.select_xinwen(title=title):
                    print(f'标题【{title}】存在')
                    continue
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='赣州市',
                                             publicTime=publicTime, linkurl=linkurl, title=title,
                                             dataResource='', yewuType='人民政府', infoType='', infoState='', isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid, infocontent='')
                print(f'标题【{title}】写入成功')
            print('-' * 50 + f'赣州市第{page}页已完成' + '-' * 50)


def ganzhou():
    """Crawl ganzhou.gov.cn news listings into MySQL, retrying on failure.

    Any exception restarts the crawl from the beginning; titles that are
    already stored are skipped by the duplicate check.  The retry is a
    bounded loop rather than unbounded recursion, so a persistent error can
    no longer exhaust the call stack.

    Returns:
        None.
    """
    max_attempts = 5
    for _ in range(max_attempts):
        try:
            _ganzhou_crawl()
            return
        except Exception as e:
            print('赣州\t', e)
Пример #9
0
def _pingxiang1_crawl_column(driver, url1):
    """Page through one pingxiang.gov.cn column with ``driver`` and store articles."""
    driver.get(url=url1)
    aoo_11 = driver.page_source  # html
    pages = re.findall(r'总共(\d+)页', aoo_11)
    print(f'共{pages[0]}页')
    # NOTE(review): range() stops before the last page -- confirm whether
    # skipping it is intended.
    for aa in range(1, int(pages[0])):
        # NOTE(review): find_element_by_xpath raises when nothing matches, so
        # the else branch looks unreachable -- confirm before relying on it.
        if driver.find_element_by_xpath("//td[@class='font_hei12']/input[@id='CP']"):
            driver.find_element_by_xpath("//td[@class='font_hei12']/input[@id='CP']").clear()  # clear the page box
        else:
            driver.find_element_by_xpath("//input[@id='ctl00$ContentPlaceHolder1$AspNetPager1_input']").clear()  # clear the page box
        driver.find_element_by_xpath("//tr[3]/td[@class='font_hei12']/input[@id='CP']").send_keys(aa)  # type the page number
        driver.find_element_by_xpath("//tr[3]/td[@class='font_hei12']/input[2]").click()  # submit the jump

        aoo_1 = driver.page_source  # html
        html_1 = etree.HTML(aoo_1)
        list_num = html_1.xpath(f"//table/tbody/tr[1]/td[@class='font_hei14']/a")  # article links
        for i in range(1, len(list_num) + 1):  # XPath table positions are 1-based
            link = html_1.xpath(f"//table[{i}]/tbody/tr[1]/td[@class='font_hei14']/a/@href")[0].strip()  # detail url
            title = html_1.xpath(f"//table[{i}]/tbody/tr[1]/td[@class='font_hei14']/a/text()")[0].strip()  # title
            publicTime = html_1.xpath(f"//tr[2]/td[@class='borderhui']/table[{i}]/tbody/tr[1]/td[@class='font_hui12']/text()")[0].strip().replace('\n','') .replace('[','') .replace(']','') .replace('                ','')  # date
            s = publicTime.replace('/', '-')
            t = int(datetime.strptime(s, '%Y-%m-%d').timestamp())
            # Cut-off epoch second; 1570896000 is 2019-10-12 16:00 UTC.
            if t < 1570896000:
                continue
            if re.findall('xinhuan', link):
                linkurl = link
            else:
                linkurl = url1 + link[1:]  # drop the leading character of the relative link
            driver.find_element_by_xpath(f"//table[{i}]/tbody/tr[1]/td[@class='font_hei14']/a").click()
            driver.switch_to.window(driver.window_handles[-1])
            detail_res = driver.page_source
            Html = etree.HTML(detail_res)
            # Several article layouts exist; div1 is the breadcrumb / column
            # node and div2 the article body (None when no body is known).
            if Html.xpath("//table[3]/tbody/tr/td[@class='font_hui12']"):
                div1 = Html.xpath("//table[3]/tbody/tr/td[@class='font_hui12']")[0]
                div2 = Html.xpath("//table[@class='borderhui']/tbody/tr/td")[0]
            elif Html.xpath("//div/div/div[@class='news-position']"):
                div1 = Html.xpath("//div/div/div[@class='news-position']")[0]
                div2 = Html.xpath("//div/div/div[@id='p-detail']")[0]
            elif Html.xpath("//div[@class='padd']/div[@class='BreadcrumbNav']"):
                div1 = Html.xpath("//div[@class='padd']/div[@class='BreadcrumbNav']")[0]
                div2 = Html.xpath("//div[@class='article oneColumn pub_border']")[0]
            else:
                div1 = Html.xpath("//div[@class='xl-main']/div[@class='container']")[0]
                div2 = None
            # Quotes are blanked out before storage, as in the original.
            infocontent = html.unescape(etree.tostring(div1, method='html').decode()).replace("'", " ").replace('"', ' ')
            if div2 is not None:
                # Explicit check replaces a bare ``except`` that existed only
                # to survive serialising the empty placeholder.
                infocontent += html.unescape(etree.tostring(div2, method='html').decode()).replace("'", " ").replace('"', ' ')
            # NOTE(review): double quotes were already replaced by spaces
            # above, so this pattern cannot match, and str.replace treats its
            # first argument literally (not as a regex).  The relative-src
            # rewrite therefore looks dead -- confirm the intended behaviour.
            if re.findall('src="(.*?)" oldsrc=', infocontent):
                infocontent = infocontent.replace(r'src=.\./', url1 + link[1:7] + '/')
            # ``not`` covers both a None and an empty result.
            if not Mysql.select_xinwen(title=title):
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                             publicTime=publicTime, linkurl=linkurl, title=title,
                                             dataResource='', yewuType='人民政府', infoType='', infoState='', isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
                print(f'标题【{title}】写入成功')
            else:
                print(f'标题【{title}】存在')
            driver.back()  # return to the listing
            time.sleep(1)
        print('-' * 50 + f'萍乡第{aa}页已完成' + '-' * 50)


def _pingxiang1_crawl():
    """Open a headless Chrome per configured column and crawl it once."""
    url1s = [
        # 'http://www.pingxiang.gov.cn/xw/pxyw/zwyw1/',  # government headlines  25 pages
        # 'http://www.pingxiang.gov.cn/xw/pxyw/ldyl/',  # leaders' remarks  16 pages
        # 'http://www.pingxiang.gov.cn/xw/pxyw/zyhy/',  # important meetings    18 pages
        # 'http://www.pingxiang.gov.cn/xw/pxyw/zyhy_44485/',  # thematic meetings
        # 'http://www.pingxiang.gov.cn/xw/pxyw/bmdt/',    # department news
        # 'http://www.pingxiang.gov.cn/xw/pxyw/xqxw/',    # district / county news
        # 'http://www.pingxiang.gov.cn/xw/pxyw/mrzw/',    # daily government affairs
        'http://www.pingxiang.gov.cn/xw/pxyw/tpxw/',    # picture news
    ]
    for url1 in url1s:
        print("程序已启动,稍等几秒")
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # run without a visible browser window
        # chromeOptions.add_argument(f'--proxy-server={ipmax()}')
        driver = webdriver.Chrome(options=chromeOptions,  executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        try:
            _pingxiang1_crawl_column(driver, url1)
        finally:
            # Always shut the browser down; otherwise every failed attempt
            # leaves a headless Chrome process behind.
            driver.quit()


def pingxiang1():
    """Crawl pingxiang.gov.cn news with Selenium into MySQL, retrying on failure.

    Any exception restarts the crawl from the beginning; titles that are
    already stored are skipped by the duplicate check.  The retry is a
    bounded loop rather than unbounded recursion, and each browser instance
    is quit before the next attempt starts.

    Returns:
        None.
    """
    max_attempts = 5
    for _ in range(max_attempts):
        try:
            _pingxiang1_crawl()
            return
        except Exception as e:
            print('萍乡\t', e)
Пример #10
0
def _tangshan_crawl(timeout):
    """Walk every Tangshan news list page once and store unseen articles.

    Any network, parsing or database error propagates to the caller.

    Args:
        timeout: Seconds passed to every ``requests.get`` call.
    """
    site = 'http://www.tangshan.gov.cn'
    url1s = [
        site + '/zhuzhan/zhengwuxinwen/index.html',  # government news
    ]
    # Each pattern captures (href, title, publish date) for one list entry.
    item_patterns = [
        '<li><span class="title"><a href="(.*?)" target="_blank" >(.*?)</a></span><span class="date">(.*?)</span><span class="clear"></span></li>',
        '</span>                        <a href="(.*?)" target="_blank">(.*?)</a></div><div class="seclisttime fl">(.*?)</div></div>',
    ]
    for url1 in url1s:
        contents1 = requests.get(url1, timeout=timeout).content.decode(
            'utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
        # The "last page" link carries the highest page number.
        pages = int(re.findall(r'index_(\d+)\.html">尾页', contents1)[0])
        # Start at 1 so the first list page (plain index.html) is crawled too.
        for page in range(1, pages + 1):
            if page == 1:
                url = url1
            else:
                url = url1.replace('index.html', f'index_{page}.html')
            contents2 = requests.get(url, timeout=timeout).content.decode(
                'utf-8').replace('\n', '').replace('\r', '').replace('\t', '')

            for pattern in item_patterns:
                # Process every entry on the page, not just the first match.
                for href, raw_title, raw_date in re.findall(pattern, contents2):
                    title = raw_title.strip()
                    # Check for an existing title first so known articles do
                    # not cost a detail-page download.
                    select = Mysql.select_xinwen(title=title)
                    # `not select` covers both None and an empty result; the
                    # other crawlers in this file test `len(select) == 0`.
                    if select:
                        print('标题存在')
                        continue

                    linkurl = site + href.strip()
                    detail_res = requests.get(
                        linkurl, timeout=timeout).content.decode('utf-8')
                    Html = etree.HTML(detail_res)
                    # Quotes are replaced by spaces before the HTML is handed
                    # to the Mysql helper.
                    infocontent = html.unescape(
                        etree.tostring(Html, method='html').decode()).replace(
                            "'", " ").replace('"', ' ')
                    uid = uuid.uuid4()
                    Mysql.insert_xinwen_baseinfo(uid=uid,
                                                 regionCode='063000',
                                                 regionName='河北省',
                                                 areaRegion='唐山市',
                                                 publicTime=raw_date.strip(),
                                                 linkurl=linkurl,
                                                 title=title,
                                                 dataResource='',
                                                 yewuType='',
                                                 infoType='',
                                                 infoState='',
                                                 isok='',
                                                 isdeal='')
                    Mysql.insert_xinwen_detailinfo(uid=uid,
                                                   infocontent=infocontent)


def tangshan(max_attempts=3, timeout=30):
    """Crawl Tangshan government news and store articles not yet in the DB.

    The whole crawl is restarted from the first page when any step fails.
    Restarting is safe because articles are de-duplicated by title before
    insertion. The number of restarts is bounded so a persistent failure
    cannot recurse until the interpreter's recursion limit is hit.

    Args:
        max_attempts: Maximum number of full crawl attempts.
        timeout: Seconds to wait on each HTTP request before failing it.

    Returns:
        None.
    """
    for _ in range(max_attempts):
        try:
            _tangshan_crawl(timeout)
            return
        except Exception as e:
            print('唐山\t', e)
    print(f'唐山\t已重试{max_attempts}次,放弃')
Пример #11
0
def _qinhuangdao_crawl(timeout):
    """Walk every Qinhuangdao news list page once and store unseen articles.

    Any network, parsing or database error propagates to the caller.

    Args:
        timeout: Seconds passed to every ``requests.get`` call.
    """
    site = 'http://www.qhd.gov.cn/'
    url1s = [
        # site + 'front_pcsec.do?tid=A44A512C86E7FA51FEB2B9B098047A46&p=1',  # local news
        # site + 'front_pcsec.do?tid=BE16A305B662511F9C82516BD16F3C24&p=1',  # department news
        site + 'front_pcsec.do?tid=677638128C3E53D4C629F745917A4CD8&p=1',  # county/district news
    ]
    # Each pattern captures (href, title, publish date) for one list entry.
    item_patterns = [
        'fl"><a href="(.*?)" target="_blank">(.*?)</a></div><div class="seclisttime fl">(.*?)</div></div>',
        '</span>                        <a href="(.*?)" target="_blank">(.*?)</a></div><div class="seclisttime fl">(.*?)</div></div>',
    ]
    for url1 in url1s:
        contents1 = requests.get(url1, timeout=timeout).content.decode(
            'utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
        # The pager text on the first page states the total page count.
        pages = int(re.findall(r'共(\d+)页', contents1)[0])
        for page in range(1, pages + 1):
            url = url1.replace('p=1', f'p={page}')
            contents2 = requests.get(url, timeout=timeout).content.decode(
                'utf-8').replace('\n', '').replace('\r', '').replace('\t', '')

            for pattern in item_patterns:
                # Process every entry on the page, not just the first match.
                for href, raw_title, raw_date in re.findall(pattern, contents2):
                    title = raw_title.strip()
                    # Check for an existing title first so known articles do
                    # not cost a detail-page download.
                    select = Mysql.select_xinwen(title=title)
                    # `not select` covers both None and an empty result; the
                    # other crawlers in this file test `len(select) == 0`.
                    if select:
                        print('标题存在')
                        continue

                    linkurl = site + href.strip()
                    detail_res = requests.get(
                        linkurl, timeout=timeout).content.decode('utf-8')
                    Html = etree.HTML(detail_res)
                    # Quotes are replaced by spaces before the HTML is handed
                    # to the Mysql helper.
                    infocontent = html.unescape(
                        etree.tostring(Html, method='html').decode()).replace(
                            "'", " ").replace('"', ' ')
                    uid = uuid.uuid4()
                    # NOTE(review): the original stored Chengde's values
                    # ('067000' / '承德市') here, which looks like a copy-paste
                    # slip for a crawler of www.qhd.gov.cn. Confirm '066000'
                    # matches the code scheme the table expects.
                    Mysql.insert_xinwen_baseinfo(uid=uid,
                                                 regionCode='066000',
                                                 regionName='河北省',
                                                 areaRegion='秦皇岛市',
                                                 publicTime=raw_date.strip(),
                                                 linkurl=linkurl,
                                                 title=title,
                                                 dataResource='',
                                                 yewuType='',
                                                 infoType='',
                                                 infoState='',
                                                 isok='',
                                                 isdeal='')
                    Mysql.insert_xinwen_detailinfo(uid=uid,
                                                   infocontent=infocontent)


def qinhuangdao(max_attempts=3, timeout=30):
    """Crawl Qinhuangdao government news and store articles not yet in the DB.

    The whole crawl is restarted from the first page when any step fails.
    Restarting is safe because articles are de-duplicated by title before
    insertion. The number of restarts is bounded so a persistent failure
    cannot recurse until the interpreter's recursion limit is hit.

    Args:
        max_attempts: Maximum number of full crawl attempts.
        timeout: Seconds to wait on each HTTP request before failing it.

    Returns:
        None.
    """
    for _ in range(max_attempts):
        try:
            _qinhuangdao_crawl(timeout)
            return
        except Exception as e:
            print('秦皇岛\t', e)
    print(f'秦皇岛\t已重试{max_attempts}次,放弃')