Пример #1
0
def chuli(publictime,href,driver,url,title,city,xpath1):
    """Resolve a list-page ``href`` to an absolute link and store the news row.

    How the link is built depends on the shape of ``href``:

    * contains ``http``: used unchanged;
    * contains ``../``: the anchor at ``{xpath1}/a`` is clicked in ``driver``
      and the URL of the window that opens is taken; that window is then
      closed and focus returns to the list window;
    * contains ``./``: resolved against ``url`` with ``urljoin``;
    * starts with ``/``: appended to the scheme and host of ``url``;
    * anything else: appended to the scheme and host of ``url`` after a ``/``.

    The row is written through ``Mysql.insert_xw_nr`` with a fresh UUID and
    the current local time; ``pro`` is read from module scope and passed as
    ``shengfen``.  Every exception is caught and printed, so nothing
    propagates to the caller.

    Args:
        publictime: Publication date string, stored as ``fabutime``.
        href: Raw ``href`` attribute taken from the list page.
        driver: Selenium WebDriver showing the list page; only used for
            ``../`` links.
        url: URL of the list page the entry came from.
        title: Entry title, stored as ``biaoti``.
        city: City label, stored as ``dijishi``.
        xpath1: XPath of the element whose child ``a`` is the entry link.

    Returns:
        None.
    """
    # Imported here so the function does not depend on the file's import block.
    from urllib.parse import urljoin, urlsplit

    try:
        insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        if 'http' in href:
            link = href
        elif '../' in href:
            # Let the browser resolve the parent-relative link: click it and
            # read the address of the window that opens.
            driver.find_element_by_xpath(f"{xpath1}/a").click()
            b_handle = driver.current_window_handle  # handle of the list window
            s_handle = None
            for handle in driver.window_handles:  # every open window
                if handle != b_handle:
                    s_handle = handle
            # s_handle is still None when no second window exists; the switch
            # below then raises and the error is reported by the handler.
            driver.switch_to.window(s_handle)  # work in the new window
            try:
                link = driver.current_url  # URL of the detail page
            finally:
                # Always close the extra window and return to the list window,
                # even if reading the URL failed.
                driver.close()
                driver.switch_to.window(b_handle)
        elif './' in href:
            link = urljoin(url, href)
        else:
            parts = urlsplit(url)
            if not (parts.scheme and parts.netloc):
                raise ValueError(f'cannot derive the site root from {url!r}')
            site_root = f'{parts.scheme}://{parts.netloc}'
            if href[0] == '/':
                link = site_root + href
            else:
                link = site_root + '/' + href
        uid = uuid.uuid4()
        Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime, url=link,
                           biaoti=title, tianjiatime=insertDBtime, zt='0')
        print(f'--{city}-【{title}】写入成功')

    except Exception as e:
        # Best effort: a failing entry is reported and skipped.
        print('处理\t', e)
Пример #2
0
def chuli1(publictime, href, url, title, city):
    """Resolve ``href`` to an absolute link without a browser and store the row.

    How the link is built depends on the shape of ``href``:

    * contains ``http``: used unchanged;
    * contains ``./`` (which includes ``../``): resolved against ``url`` with
      ``urljoin``;
    * starts with ``/``: appended to the scheme and host of ``url``;
    * anything else: appended to the scheme and host of ``url`` after a ``/``.

    The row is written through ``Mysql.insert_xw_nr`` with a fresh UUID and
    the current local time; ``pro`` is read from module scope and passed as
    ``shengfen``.  Every exception is caught and printed, so nothing
    propagates to the caller.

    Args:
        publictime: Publication date string, stored as ``fabutime``.
        href: Raw ``href`` attribute taken from the list page.
        url: URL of the list page the entry came from.
        title: Entry title, stored as ``biaoti``.
        city: City label, stored as ``dijishi``.

    Returns:
        None.
    """
    # Imported here so the function does not depend on the file's import block.
    from urllib.parse import urljoin, urlsplit

    try:
        insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        if 'http' in href:
            link = href
        elif './' in href:
            link = urljoin(url, href)
        else:
            parts = urlsplit(url)
            if not (parts.scheme and parts.netloc):
                raise ValueError(f'cannot derive the site root from {url!r}')
            site_root = f'{parts.scheme}://{parts.netloc}'
            if href[0] == '/':
                link = site_root + href
            else:
                link = site_root + '/' + href
        uid = uuid.uuid4()
        Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime, url=link,
                           biaoti=title, tianjiatime=insertDBtime, zt='0')
        print(f'--{city}-【{title}】写入成功')

    except Exception as e:
        # Best effort: a failing entry is reported and skipped.
        print('处理\t', e)
Пример #3
0
def aletai(name):
    """Crawl the alt.gov.cn news list pages and hand unseen entries to ``chuli``.

    A headless Chrome session is started and kept in the module-level
    ``driver``.  For every list URL the rendered page is parsed with
    ``etree.HTML`` and each ``<li>`` entry is read (link, title, date).
    Entries whose title ``Mysql.select_xw_nr1`` does not return a row for, and
    whose date is not older than the module-level ``jiezhi_time``, are passed
    to ``chuli``.  The first unseen entry older than that cutoff ends the
    crawl of the current URL.

    On any exception the error is printed, the browser is shut down and the
    whole crawl is started again by calling this function recursively.

    Args:
        name: City label; used for the duplicate lookup, the start-up message
            and as ``city`` of the stored rows.

    Returns:
        None.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # no visible browser window
        # Raw string: the backslashes of the Windows path are literal characters.
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        # list URL -> number of list pages to walk
        urls = {
            'http://www.alt.gov.cn/zwxx/001003/listPage.html': 436,  # People's Government - autonomous region headlines
            'http://www.alt.gov.cn/zwxx/001001/listPage.html': 47,  # People's Government - government affairs updates
            'http://www.alt.gov.cn/zwxx/001004/listPage.html': 20,  # People's Government - township and farm updates
            'http://www.alt.gov.cn/zwxx/001005/listPage.html': 32,  # People's Government - department updates
            'http://www.alt.gov.cn/zwxx/001006/listPage.html': 5,  # People's Government - public notices
        }
        for url, pages in urls.items():
            driver.get(url)
            html_2 = etree.HTML(driver.page_source)

            xpath = "//div[@class='ewb-pl20']/ul/li"
            # Entry count of the first page; it also bounds the inner loop on
            # every later page.
            # NOTE(review): range(1, length) stops one short of the last entry,
            # and ``i == lengt`` below is only reached on a page holding fewer
            # entries than the first one - confirm whether ``+ 1`` was intended.
            length = len(html_2.xpath(xpath))
            po = 0  # non-zero once an unseen entry older than the cutoff was met
            for page in range(1, pages+1):
                html_1 = etree.HTML(driver.page_source)
                if po > 0:
                    break
                lengt = len(html_1.xpath(xpath))  # entries on the page currently shown
                for i in range(1, length):
                    xpath1 = xpath.replace('/ul/li', f'/ul/li[{i}]')
                    href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                    title = html_1.xpath(f"{xpath1}/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace(
                        '\r', '')
                    publictime = html_1.xpath(f"{xpath1}/span/text()")[0].strip().replace('/', '-')

                    # Look the title up; None is treated as "not stored yet".
                    select = Mysql.select_xw_nr1(biaoti=title,dijishi=name)

                    if select is None:
                        publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                        # jiezhi_time is defined outside this function and is
                        # compared with an epoch-seconds value here.
                        if publictime_times >= jiezhi_time:
                            chuli(publictime, href, driver, url, title, city,xpath1)
                        else:
                            po += 1
                            break
                    if i == lengt:
                        # Last entry of the current page: move on to the next
                        # page unless this page is short or is the final one.
                        if lengt >= length - 1 and page != pages:
                            try:
                                driver.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
                            except Exception:
                                try:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下页'))
                                except Exception:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下一页'))
                        break
    except Exception as e:
        print('阿勒泰\t', e)
        # quit() ends the whole browser session, not only the current window,
        # before a new session is created by the retry.
        driver.quit()
        # NOTE(review): the retry is recursive and unbounded; a failure that
        # repeats on every attempt ends in RecursionError.
        return aletai(name)
Пример #4
0
def tulufan(name):
    """Crawl the tlf.gov.cn news tables and hand unseen entries to ``chuli``.

    A headless Chrome session is started and kept in the module-level
    ``driver``.  For every list URL the rendered page is parsed with
    ``etree.HTML`` and the table rows are read (link and title from the second
    cell, date from the third).  Entries whose title ``Mysql.select_xw_nr1``
    does not return a row for, and whose date is not older than the
    module-level ``jiezhi_time``, are passed to ``chuli``.  The first unseen
    entry older than that cutoff ends the crawl of the current URL.

    On any exception the error is printed, the browser is shut down and the
    whole crawl is started again by calling this function recursively.

    Args:
        name: City label; used for the duplicate lookup, the start-up message
            and as ``city`` of the stored rows.

    Returns:
        None.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # no visible browser window
        # Raw string: the backslashes of the Windows path are literal characters.
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        # list URL -> number of list pages to walk
        urls = {
            'http://www.tlf.gov.cn/ztlm/tlfxw.htm': 35,  # People's Government - Turpan news
            'http://www.tlf.gov.cn/ztlm/gsggtz.htm': 19,  # People's Government - notices and announcements
            'http://www.tlf.gov.cn/ztlm/xsdt.htm': 16,  # People's Government - county and district updates
            'http://www.tlf.gov.cn/ztlm/bmdt.htm': 12,  # People's Government - department updates
            'http://www.tlf.gov.cn/ztlm/jnwxw.htm': 21,  # People's Government - news from inside and outside Xinjiang
        }
        for url, pages in urls.items():
            driver.get(url)
            html_2 = etree.HTML(driver.page_source)

            xpath="//table[@class='winstyle11251']/tbody/tr"
            # Row count of the first page; it also bounds the inner loop on
            # every later page.
            # NOTE(review): range(1, length) stops one short of the last row,
            # and ``i == lengt`` below is only reached on a page holding fewer
            # rows than the first one - confirm whether ``+ 1`` was intended.
            length = len(html_2.xpath(xpath))
            po = 0  # non-zero once an unseen entry older than the cutoff was met
            for page in range(1, pages+1):
                html_1 = etree.HTML(driver.page_source)
                if po > 0:
                    break
                lengt = len(html_1.xpath(xpath))  # rows on the page currently shown
                for i in range(1, length):
                    if 'www' in url and i%5==0:
                        # Every fifth row is skipped - presumably a separator
                        # row without a link; TODO confirm against the page.
                        continue
                    xpath1 = xpath+f'[{i}]'

                    href = html_1.xpath(f"{xpath1}/td[2]/a/@href")[0].strip()
                    title = html_1.xpath(f"{xpath1}/td[2]/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace(
                        '\r', '')
                    publictime = html_1.xpath(f"{xpath1}/td[3]/span/text()")[0].strip().replace('/', '-')

                    # Look the title up; None is treated as "not stored yet".
                    select = Mysql.select_xw_nr1(biaoti=title,dijishi=name)

                    if select is None:
                        publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                        # jiezhi_time is defined outside this function and is
                        # compared with an epoch-seconds value here.
                        if publictime_times >= jiezhi_time:
                            chuli(publictime, href, driver, url, title, city,xpath1)
                        else:
                            po += 1
                            break
                    if i == lengt:
                        # Last row of the current page: move on to the next
                        # page unless this page is short or is the final one.
                        if lengt >= length - 1 and page != pages:
                            try:
                                driver.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
                            except Exception:
                                try:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下页'))
                                except Exception:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下一页'))
                        break
    except Exception as e:
        print('吐鲁番\t', e)
        # quit() ends the whole browser session, not only the current window,
        # before a new session is created by the retry.
        driver.quit()
        # NOTE(review): the retry is recursive and unbounded; a failure that
        # repeats on every attempt ends in RecursionError.
        return tulufan(name)
Пример #5
0
def wulumuqui(name):
    """Crawl the Urumqi news lists and hand unseen entries to ``chuli``.

    Two page layouts are handled: the ``zwfw.xinjiang.gov.cn`` list and the
    ``www.urumqi.gov.cn`` lists.  A headless Chrome session is started and
    kept in the module-level ``driver``.  For every list URL the rendered page
    is parsed with ``etree.HTML`` and each entry is read (link, title, date).
    Entries whose title ``Mysql.select_xw_nr1`` does not return a row for, and
    whose date is not older than the module-level ``jiezhi_time``, are passed
    to ``chuli``.  The first unseen entry older than that cutoff ends the
    crawl of the current URL.

    On any exception the error is printed, the browser is shut down and the
    whole crawl is started again by calling this function recursively.

    Args:
        name: City label; used for the duplicate lookup, the start-up message
            and as ``city`` of the stored rows.

    Returns:
        None.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # no visible browser window
        # Raw string: the backslashes of the Windows path are literal characters.
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        # list URL -> number of list pages to walk
        urls = {
            'http://zwfw.xinjiang.gov.cn/xinjiangggzy/zwgk/002004/tradingCommon.html': 2,  # Public Resources Center - notices and announcements
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=10005': 86,  # People's Government - Urumqi headlines
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=12034': 59,  # People's Government - autonomous region headlines
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=12115': 61,  # People's Government - notices and announcements
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=10006': 2,  # People's Government - policy interpretation
        }
        for url, pages in urls.items():
            driver.get(url)
            html_2 = etree.HTML(driver.page_source)
            if 'zwfw' in url:
                # Entries are addressed as li[2] .. li[count + 1] on this layout.
                xpath="//div[@class='ewb-colu-bd']/div/ul/li/div"
                length = len(html_2.xpath(xpath)) + 2
                ii=2
            else:
                # Entries are addressed as li[1] .. li[count] on this layout.
                xpath = "//ul[@class='commonList_dot am-padding-top-sm am-padding-bottom-0 commonList_dot_Listnews']/li"
                length = len(html_2.xpath(xpath)) + 1
                ii = 1
            po = 0  # non-zero once an unseen entry older than the cutoff was met
            for page in range(1, pages+1):
                html_1 = etree.HTML(driver.page_source)
                if po > 0:
                    break
                lengt = len(html_1.xpath(xpath))  # entries on the page currently shown
                for i in range(ii, length):
                    if 'www' in url and i%6==0:
                        # Every sixth <li> is skipped - presumably a separator
                        # without a link; TODO confirm against the page.
                        continue

                    # Insert the entry index into whichever list XPath is in use.
                    xpath1 = xpath.replace('/ul/li', f'/ul/li[{i}]').replace(']/li', f']/li[{i}]')
                    if 'zwfw' in url:
                        href = html_1.xpath(f"{xpath1}/div/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/div/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace(
                            '\r', '')
                        publictime = html_1.xpath(xpath1+"/span/text()")[0].strip().replace('/', '-').replace('年', '-').replace('月', '-').replace('日', '')
                    else:
                        href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace(
                            '\r', '')
                        publictime = html_1.xpath(f"{xpath1}/span/text()")[0].strip().replace('/', '-')

                    # Look the title up; None is treated as "not stored yet".
                    select = Mysql.select_xw_nr1(biaoti=title,dijishi=name)

                    if select is None:
                        publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                        # jiezhi_time is defined outside this function and is
                        # compared with an epoch-seconds value here.
                        if publictime_times >= jiezhi_time:
                            chuli(publictime, href, driver, url, title, city,xpath1)
                        else:
                            po += 1
                            break
                    if i == lengt:
                        # NOTE(review): on the zwfw layout ``length`` is
                        # count + 2, so ``lengt >= length - 1`` cannot hold on a
                        # page as long as the first one and the next-page click
                        # is skipped - confirm.
                        if lengt >= length - 1 and page != pages:
                            try:
                                driver.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
                            except Exception:
                                try:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下页'))
                                except Exception:
                                    driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下一页'))
                        break
    except Exception as e:
        print('乌鲁木齐\t', e)
        # quit() ends the whole browser session, not only the current window,
        # before a new session is created by the retry.
        driver.quit()
        # NOTE(review): the retry is recursive and unbounded; a failure that
        # repeats on every attempt ends in RecursionError.
        return wulumuqui(name)
Пример #6
0
def sansha(name):
    """Crawl the Sansha news lists and hand unseen entries to ``chuli``.

    Two page layouts are handled: the table layout of ``zw.hainan.gov.cn`` and
    the ``list_1`` layout used for the remaining URLs.  A headless Chrome
    session is started and kept in the module-level ``driver``.  For every
    list URL the rendered page is parsed with ``etree.HTML`` and each entry is
    read (link, title, date).  Entries whose title ``Mysql.select_xw_nr1``
    does not return a row for, and whose date is not older than the
    module-level ``jiezhi_time``, are passed to ``chuli``.  The first unseen
    entry older than that cutoff ends the crawl of the current URL.

    On any exception the error is printed, the browser is shut down and the
    whole crawl is started again by calling this function recursively.

    Args:
        name: City label; used for the duplicate lookup, the start-up message
            and as ``city`` of the stored rows.

    Returns:
        None.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # no visible browser window
        # Raw string: the backslashes of the Windows path are literal characters.
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        # list URL -> number of list pages to walk
        urls = {
            'http://zw.hainan.gov.cn/ggzy/ssggzy/GGtzgg/index.jhtml': 1,  # Public Resources Center - notices and announcements
            'http://zw.hainan.gov.cn/ggzy/ssggzy/xwdt1/index.jhtml': 1,  # Public Resources Center - news updates
            'http://www.sansha.gov.cn/sansha/sysdt/nlist2_new.shtml': 42,  # People's Government - Sansha updates
            'http://www.sansha.gov.cn/sansha/zwfwxxgs/nlist2.shtml': 2,  # People's Government - Sansha information disclosure
            'http://www.hainan.gov.cn/hainan/zxjd/list3.shtml': 2,  # People's Government - latest interpretations
        }
        for url, pages in urls.items():
            driver.get(url)
            html_2 = etree.HTML(driver.page_source)
            # Decide on the host prefix: a plain 'zw' substring test would also
            # match the path segment 'zwfwxxgs' of a www.sansha.gov.cn URL.
            is_zw_site = '//zw.' in url
            if is_zw_site:
                # NOTE(review): this XPath pins tr[1], so none of the
                # replacements below insert the row index - confirm that only
                # the first row is wanted.
                xpath = "//table[@class='newtable']/tbody/tr[1]/td[2]/a"
            else:
                xpath = "//div[@class='list_1']/ul/li"
            length = len(html_2.xpath(xpath)) + 1  # loop bound: entries 1 .. count
            po = 0  # non-zero once an unseen entry older than the cutoff was met
            for page in range(1, pages+1):
                html_1 = etree.HTML(driver.page_source)
                if po > 0:
                    break
                lengt = len(html_1.xpath(xpath))  # entries on the page currently shown
                for i in range(1, length):
                    # Insert the entry index into whichever list XPath is in use.
                    xpath1 = xpath.replace('/div/div/div/a', f'/div/div[{i}]/div/a').replace('tr/td[', f'tr[{i}]/td[').replace('ul/li', f'ul/li[{i}]')
                    if is_zw_site:
                        # xpath1 addresses the <a> itself; the date sits in the
                        # third cell of the same row.
                        href = html_1.xpath(f"{xpath1}/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/text()")[0].strip().replace('\n', '').replace('\t', '').replace( '\r', '')
                        publictime = html_1.xpath(xpath1.replace('[2]/a', "[3]") + "/text()")[0].strip().replace('\n', '').replace('/', '-').replace('发布时间:', '')
                    else:
                        href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/a//text()")[0].strip().replace('\n', '').replace('\t','').replace(
                            '\r', '')
                        publictime = html_1.xpath(f"{xpath1}/em/text()")[0].strip().replace('\n', '').replace('[', '').replace(']','').replace('日', '').replace('/', '-')

                    # Look the title up; None is treated as "not stored yet".
                    select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)

                    if select is None:
                        publictime_times = int(time.mktime(time.strptime(publictime.replace('[', '').replace(']',''), "%Y-%m-%d")))
                        # jiezhi_time is defined outside this function and is
                        # compared with an epoch-seconds value here.
                        if publictime_times >= jiezhi_time:
                            chuli(publictime, href, driver, url, title, city, xpath1)
                        else:
                            po += 1
                            break
                    if i == lengt:
                        # Last entry of the current page: move on to the next
                        # page unless this page is short or is the final one.
                        if lengt >= length - 1 and page != pages:
                            try:
                                driver.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
                            except Exception:
                                try:
                                    driver.execute_script("arguments[0].click();",
                                                          driver.find_element_by_link_text('下页'))
                                except Exception:
                                    driver.execute_script("arguments[0].click();",
                                                          driver.find_element_by_link_text('下一页'))
                        break
    except Exception as e:
        print('三沙\t', e)
        # quit() ends the whole browser session, not only the current window,
        # before a new session is created by the retry.
        driver.quit()
        # NOTE(review): the retry is recursive and unbounded; a failure that
        # repeats on every attempt ends in RecursionError.
        return sansha(name)
Пример #7
0
def haikou(name):
    """Crawl the Haikou news lists and hand unseen entries to ``chuli``.

    Four page layouts are handled, selected by substring of the list URL
    (``ggzy``, ``drc``, ``hkjsj`` and everything else).  A headless Chrome
    session is started and kept in the module-level ``driver``.  For every
    list URL the rendered page is parsed with ``etree.HTML`` and each entry is
    read (link, title, date).  Entries whose title ``Mysql.select_xw_nr1``
    does not return a row for, and whose date is not older than the
    module-level ``jiezhi_time``, are passed to ``chuli``.  The first unseen
    entry older than that cutoff ends the crawl of the current URL.

    On any exception the error is printed, the browser is shut down and the
    whole crawl is started again by calling this function recursively.

    Args:
        name: City label; used for the duplicate lookup, the start-up message
            and as ``city`` of the stored rows.

    Returns:
        None.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # no visible browser window
        # Raw string: the backslashes of the Windows path are literal characters.
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        # list URL -> number of list pages to walk
        urls = {
            'http://ggzy.haikou.gov.cn/ywdt/zwdt/02/2_201603-002_1.html': 3,  # Public Resources Center - government affairs updates
            'http://ggzy.haikou.gov.cn/xxgk/gsgg/02/3_201603-005_1.html': 2,  # Public Resources Center - public notices
            'http://ggzy.haikou.gov.cn/xxgk/zcwj/02/3_201603-010_1.html': 1,  # Public Resources Center - policy documents
            'http://ggzy.haikou.gov.cn/jdhy/zxjd/02/4_201603-045_1.html': 1,  # Public Resources Center - latest interpretations
            'http://www.haikou.gov.cn/xxgk/szfbjxxgk/zcfg/szfxzgfxwj/': 3,  # People's Government - municipal normative documents
            'http://www.haikou.gov.cn/xxgk/szfbjxxgk/zcfg/bmxzgfxwj/': 1,  # People's Government - departmental normative documents
            'http://www.haikou.gov.cn/zfdt/xbzwdt/gqrd/': 36,  # People's Government - government updates >> district updates
            'http://www.haikou.gov.cn/zfdt/xbzwdt/bmdt/': 22,  # People's Government - government updates >> department updates
            'http://www.haikou.gov.cn/xxgk/szfbjxxgk/ggtz/': 19,  # People's Government - public notices
            'http://www.haikou.gov.cn/tzhk/zcjy/': 1,  # People's Government - policy opportunities
            'http://drc.haikou.gov.cn/ywdt/gzdt/': 11,  # Development and Reform Commission - work updates
            'http://drc.haikou.gov.cn/xxxgk/gsgg/': 2,  # Development and Reform Commission - public notices
            'http://hkjsj.haikou.gov.cn/xxgk1/gsgg/': 5,  # Housing and Construction Bureau - public notices
            'http://hkjsj.haikou.gov.cn/xxgk1/zcwj/bmwj/': 1,  # Housing and Construction Bureau - department documents
            'http://hkjsj.haikou.gov.cn/jdhy/zxjd/': 1,  # Housing and Construction Bureau - latest interpretations
        }
        for url, pages in urls.items():
            driver.get(url)
            html_2 = etree.HTML(driver.page_source)
            if 'ggzy' in url:
                xpath = "//div[@id='list_div']/div/div/a"

            elif 'drc' in url:
                xpath = "//div[@class='con-right']/div/div/a"

            elif 'hkjsj' in url:
                xpath = "//div[@class='con-right fr']/div/div/a"

            else:
                xpath = "//div[@class='list-c']/ul/li/p[1]/a"
            length = len(html_2.xpath(xpath)) + 1  # loop bound: entries 1 .. count
            po = 0  # non-zero once an unseen entry older than the cutoff was met
            for page in range(1, pages+1):
                html_1 = etree.HTML(driver.page_source)
                if po > 0:
                    break
                lengt = len(html_1.xpath(xpath))  # entries on the page currently shown
                for i in range(1, length):
                    # Insert the entry index into the list XPath.
                    # NOTE(review): none of the three patterns occurs in the
                    # ggzy / drc / hkjsj XPaths, so for those sites xpath1 is
                    # the same for every i and ``[0]`` below always reads the
                    # first entry - confirm against the pages.
                    xpath1 = xpath.replace('/div/div/div/a', f'/div/div[{i}]/div/a').replace('tr/td[', f'tr[{i}]/td[').replace('ul/li', f'ul/li[{i}]')
                    if 'ggzy' in url or  'www' in url or  'drc' in url  or  'hkjsj' in url :
                        # xpath1 addresses the <a> itself; the date is read
                        # from a neighbouring element that differs per site.
                        href = html_1.xpath(f"{xpath1}/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/text()")[0].strip().replace('\n', '').replace('\t', '').replace( '\r', '')
                        if 'www' in url:
                            publictime = html_1.xpath(xpath1.replace('[1]/a', "[2]/span") + "/text()")[0].strip().replace('\n', '').replace('/', '-').replace('发布时间:', '')
                        elif 'hkjsj' in url:
                            publictime = html_1.xpath(xpath1.replace('div/a', "table/tbody/tr/td[1]") + "/text()")[0].strip().replace('\n', '').replace('/', '-').replace('发布时间:', '')
                        else:
                            publictime = html_1.xpath(xpath1.replace('div/a', "div/span") + "/text()")[0].strip().replace('\n', '').replace('/', '-').replace('发布时间:', '')
                    else:
                        href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/a//text()")[0].strip().replace('\n', '').replace('\t','').replace(
                            '\r', '')
                        publictime = html_1.xpath(f"{xpath1}/em/text()")[0].strip().replace('\n', '').replace('[', '').replace(']','').replace(
                            '日', '').replace('/', '-')

                    # Look the title up; None is treated as "not stored yet".
                    select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)

                    if select is None:
                        publictime_times = int(time.mktime(time.strptime(publictime.replace(' ', '').replace('[', '').replace(']',''), "%Y-%m-%d")))
                        # jiezhi_time is defined outside this function and is
                        # compared with an epoch-seconds value here.
                        if publictime_times >= jiezhi_time:
                            chuli(publictime, href, driver, url, title, city, xpath1)
                        else:
                            po += 1
                            break
                    if i == lengt:
                        # Last entry of the current page: move on to the next
                        # page unless this page is short or is the final one.
                        if lengt >= length - 1 and page != pages:
                            try:
                                driver.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
                            except Exception:
                                try:
                                    driver.execute_script("arguments[0].click();",
                                                          driver.find_element_by_link_text('下页'))
                                except Exception:
                                    driver.execute_script("arguments[0].click();",
                                                          driver.find_element_by_link_text('下一页'))
                        break
    except Exception as e:
        print('海口\t', e)
        # quit() ends the whole browser session, not only the current window,
        # before a new session is created by the retry.
        driver.quit()
        # NOTE(review): the retry is recursive and unbounded; a failure that
        # repeats on every attempt ends in RecursionError.
        return haikou(name)
Пример #8
0
def hainan(name):
    """Crawl the Hainan province news lists and hand unseen entries to ``chuli``.

    Four page layouts are handled: the table layout of ``zw.hainan.gov.cn``,
    the ``Fivelist`` layout of the ``plan`` URLs, the ``con-right`` layout of
    the ``zjt`` URLs and the ``cen-div-1`` layout for everything else.  A
    headless Chrome session is started and kept in the module-level
    ``driver``.  For every list URL the rendered page is parsed with
    ``etree.HTML`` and each entry is read (link, title, date).  Entries whose
    title ``Mysql.select_xw_nr1`` does not return a row for, and whose date is
    not older than the module-level ``jiezhi_time``, are passed to ``chuli``.
    The first unseen entry older than that cutoff ends the crawl of the
    current URL.

    On any exception the error is printed, the browser is shut down and the
    whole crawl is started again by calling this function recursively.

    Args:
        name: City label; used for the duplicate lookup, the start-up message
            and as ``city`` of the stored rows.

    Returns:
        None.
    """
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # no visible browser window
        # Raw string: the backslashes of the Windows path are literal characters.
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        # list URL -> number of list pages to walk
        urls = {
            'http://zw.hainan.gov.cn/ggzy/ggzy/tzgg/index.jhtml': 3,  # Public Resources Center - notices and announcements
            'http://zw.hainan.gov.cn/ggzy/ggzy/xwdt/index.jhtml': 1,  # Public Resources Center - news updates
            'https://www.hainan.gov.cn/hainan/0101/list3_1.shtml': 16,  # People's Government - public notices
            'https://www.hainan.gov.cn/common/search/2f11247a5c2c49eeb3cdbb00a8178bc7?_isAgg=false&_pageSize=12&_template=hainan&_channelName=&page=1': 314,  # People's Government - government affairs updates
            'https://www.hainan.gov.cn/common/search/55acf8539596d25624059980986aaa78?_isAgg=false&_pageSize=12&_template=hainan&_channelName=&page=1': 326,  # People's Government - Hainan Today
            'https://www.hainan.gov.cn/common/search/82cdb5b25e514a1bba6429aef621ce6c?_isAgg=false&_pageSize=12&_template=hainan&sort=publishedTime&_channelName=&page=1': 189,  # People's Government - provincial government headlines
            'https://www.hainan.gov.cn/hainan/zxjd/list3.shtml': 2,  # People's Government - policy interpretation > latest interpretations
            'http://plan.hainan.gov.cn/sfgw/zwdt/list3.shtml': 5,  # Development and Reform Commission - headlines > government affairs updates
            'http://plan.hainan.gov.cn/sfgw/gzdt/list3.shtml': 20,  # Development and Reform Commission - work updates
            'http://plan.hainan.gov.cn/sfgw/zxdt/list3.shtml': 50,  # Development and Reform Commission - latest updates
            'http://zjt.hainan.gov.cn/szjt/zwdt/tablist.shtml': 36,  # Housing and Construction Department - government affairs updates
            'http://zjt.hainan.gov.cn/szjt/sxxx/iframelist_sx.shtml': 15,  # Housing and Construction Department - city and county information
        }
        for url, pages in urls.items():
            driver.get(url)
            html_2 = etree.HTML(driver.page_source)
            # Decide on the host prefix: a plain 'zw' substring test would also
            # match the '/zwdt/' path segment of the plan.* and zjt.* URLs.
            is_zw_site = '//zw.' in url
            if is_zw_site:
                xpath = "//table[@class='newtable']/tbody/tr/td[2]/a"
            elif 'plan' in url:
                xpath = "//div[@class='Fivelist']/ul/li"
            elif 'zjt' in url:
                xpath = "//div[@class='con-right']/div/div/a"
            else:
                xpath = "//div[@class='cen-div-1 mar-t']/div/div/div/a"
            length = len(html_2.xpath(xpath)) + 1  # loop bound: entries 1 .. count
            po = 0  # non-zero once an unseen entry older than the cutoff was met
            for page in range(1, pages+1):
                html_1 = etree.HTML(driver.page_source)
                if po > 0:
                    break
                lengt = len(html_1.xpath(xpath))  # entries on the page currently shown
                for i in range(1, length):
                    if 'plan' in url and i%6==0:
                        # Every sixth <li> is skipped - presumably a separator
                        # without a link; TODO confirm against the page.
                        continue

                    # Insert the entry index into whichever list XPath is in
                    # use (div list, table rows or <ul>/<li> list).
                    # NOTE(review): none of the patterns occurs in the zjt
                    # XPath, so for those URLs xpath1 is the same for every i
                    # and ``[0]`` below always reads the first entry - confirm.
                    xpath1 = xpath.replace('/div/div/div/a', f'/div/div[{i}]/div/a').replace('tr/td[', f'tr[{i}]/td[').replace('ul/li', f'ul/li[{i}]')
                    if is_zw_site or 'www' in url or 'zjt' in url:
                        # xpath1 addresses the <a> itself; the date is read
                        # from a neighbouring element that differs per site.
                        href = html_1.xpath(f"{xpath1}/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/text()")[0].strip().replace('\n', '').replace('\t', '').replace(
                            '\r', '')
                        if is_zw_site:
                            publictime = html_1.xpath(xpath1.replace('[2]/a', "[3]") + "/text()")[0].strip().replace('/', '-').replace('\n', '')
                        else:
                            publictime = html_1.xpath(xpath1.replace('div/a', "table/tbody/tr/td") + "/text()")[0].strip().replace('\n', '').replace('/', '-').replace('发布时间:                    ', '')
                    else:
                        href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/a//text()")[0].strip().replace('\n', '').replace('\t','').replace(
                            '\r', '')
                        publictime = html_1.xpath(f"{xpath1}/em/text()")[0].strip().replace('\n', '').replace('[', '').replace(']','').replace(
                            '日', '').replace('/', '-')

                    # Look the title up; None is treated as "not stored yet".
                    select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)

                    if select is None:
                        publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                        # jiezhi_time is defined outside this function and is
                        # compared with an epoch-seconds value here.
                        if publictime_times >= jiezhi_time:
                            chuli(publictime, href, driver, url, title, city, xpath1)
                        else:
                            po += 1
                            break
                    if i == lengt:
                        # Last entry of the current page: move on to the next
                        # page unless this page is short or is the final one.
                        if lengt >= length - 1 and page != pages:
                            try:
                                driver.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
                            except Exception:
                                try:
                                    driver.execute_script("arguments[0].click();",
                                                          driver.find_element_by_link_text('下页'))
                                except Exception:
                                    driver.execute_script("arguments[0].click();",
                                                          driver.find_element_by_link_text('下一页'))
                        break
    except Exception as e:
        print('海南\t', e)
        # quit() ends the whole browser session, not only the current window,
        # before a new session is created by the retry.
        driver.quit()
        # NOTE(review): the retry is recursive and unbounded; a failure that
        # repeats on every attempt ends in RecursionError.
        return hainan(name)