def chuli(publictime, href, driver, url, title, city, xpath1):
    """Resolve a list entry's link to an absolute URL and store the entry.

    The link is derived from ``href`` as follows:

    * contains ``http``  -> used unchanged;
    * contains ``../``   -> the entry is clicked and the URL of the window
      that opens is used (the popup is closed again afterwards);
    * contains ``./``    -> ``url`` with ``href`` (minus ``./``) appended;
    * starts with ``/``  -> scheme and host of ``url`` followed by ``href``;
    * anything else      -> scheme and host of ``url``, ``/`` and ``href``.

    The row is then inserted through ``Mysql.insert_xw_nr`` with a fresh
    UUID, the module-level ``pro`` value and the current local time.

    Any exception is caught and printed, so a single bad entry never stops
    the caller's crawl; nothing is inserted in that case.

    Args:
        publictime: Publication date string stored in ``fabutime``.
        href: Raw ``href`` attribute of the entry's anchor.
        driver: Selenium WebDriver showing the list page (only used for
            ``../`` links).
        url: URL of the list page the entry was found on.
        title: Headline text stored in ``biaoti``.
        city: City name stored in ``dijishi``.
        xpath1: XPath of the entry's row; ``/a`` is appended to click it.

    Returns:
        None.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urlsplit

    def site_root():
        # Scheme + host of the list page. Unlike the previous
        # ``http(.*?)\.cn`` regex this also works for .com/.org hosts and
        # for hosts that merely contain ".cn" somewhere in their name.
        parts = urlsplit(url)
        return f'{parts.scheme}://{parts.netloc}'

    try:
        insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        if not href:
            # Previously surfaced as an accidental IndexError on href[0].
            raise ValueError('empty href')
        if 'http' in href:
            link = href
        elif '../' in href:
            driver.find_element_by_xpath(f"{xpath1}/a").click()
            b_handle = driver.current_window_handle  # handle of the list window
            # Every window except the list window; the last one is the popup.
            new_handles = [h for h in driver.window_handles if h != b_handle]
            if not new_handles:
                raise RuntimeError('clicking the entry did not open a new window')
            driver.switch_to.window(new_handles[-1])
            try:
                link = driver.current_url  # URL of the detail page
            finally:
                # Always close the popup and return to the list window so the
                # caller keeps working on the page it expects.
                driver.close()
                driver.switch_to.window(b_handle)
        elif './' in href:
            link = url + href.replace('./', '')
        elif href.startswith('/'):
            link = site_root() + href
        else:
            link = site_root() + '/' + href
        uid = uuid.uuid4()
        Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime, url=link,
                           biaoti=title, tianjiatime=insertDBtime, zt='0')
        print(f'--{city}-【{title}】写入成功')
    except Exception as e:
        # Best-effort boundary: report and let the caller continue.
        print('处理\t', e)
def chuli1(publictime, href, url, title, city):
    """Resolve a list entry's link without a browser and store the entry.

    The link is derived from ``href`` as follows:

    * contains ``http``  -> used unchanged;
    * contains ``./``    -> ``url`` with ``href`` (minus ``./``) appended;
    * starts with ``/``  -> scheme and host of ``url`` followed by ``href``;
    * anything else      -> scheme and host of ``url``, ``/`` and ``href``.

    The row is then inserted through ``Mysql.insert_xw_nr`` with a fresh
    UUID, the module-level ``pro`` value and the current local time.

    Any exception is caught and printed; nothing is inserted in that case.

    Args:
        publictime: Publication date string stored in ``fabutime``.
        href: Raw ``href`` attribute of the entry's anchor.
        url: URL of the list page the entry was found on.
        title: Headline text stored in ``biaoti``.
        city: City name stored in ``dijishi``.

    Returns:
        None.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urlsplit

    def site_root():
        # Scheme + host of the list page. The previous ``http(.*?)\.cn``
        # regex raised IndexError for any site that is not under .cn.
        parts = urlsplit(url)
        return f'{parts.scheme}://{parts.netloc}'

    try:
        insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        if not href:
            # Previously surfaced as an accidental IndexError on href[0].
            raise ValueError('empty href')
        if 'http' in href:
            link = href
        elif './' in href:
            link = url + href.replace('./', '')
        elif href.startswith('/'):
            link = site_root() + href
        else:
            link = site_root() + '/' + href
        uid = uuid.uuid4()
        Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime, url=link,
                           biaoti=title, tianjiatime=insertDBtime, zt='0')
        print(f'--{city}-【{title}】写入成功')
    except Exception as e:
        # Best-effort boundary: report and let the caller continue.
        print('处理\t', e)
def aletai(name):
    """Scrape the alt.gov.cn news lists and store headlines that are not yet saved.

    Starts a headless Chrome session (published through the module-level
    ``driver``), walks the list pages configured in ``urls`` and passes every
    headline whose title is not found by ``Mysql.select_xw_nr1`` and whose
    date is not older than the module-level ``jiezhi_time`` to ``chuli``.
    The first headline older than the cutoff stops the crawl of that list.

    If anything fails after the browser has started, the error is printed,
    the browser is quit and the whole crawl restarts from the first list
    (retrying indefinitely, as before, but in a loop instead of by
    recursion). On success the browser session is left open, as in the
    original code.

    NOTE(review): the file's indentation was lost. The nesting used here
    (cutoff ``else`` paired with the date check, paging check evaluated for
    every row) is a reconstruction and should be confirmed.

    Args:
        name: City name used for log output, the duplicate lookup and the
            ``dijishi`` column.

    Returns:
        None.

    Raises:
        Exception: Whatever ``webdriver.Chrome`` raised when the browser
            could not be started at all (retrying cannot help in that case).
    """
    global driver

    xpath = "//div[@class='ewb-pl20']/ul/li"
    urls = {
        'http://www.alt.gov.cn/zwxx/001003/listPage.html': 436,  # People's Government: autonomous-region headlines
        'http://www.alt.gov.cn/zwxx/001001/listPage.html': 47,  # People's Government: government affairs updates
        'http://www.alt.gov.cn/zwxx/001004/listPage.html': 20,  # People's Government: township updates
        'http://www.alt.gov.cn/zwxx/001005/listPage.html': 32,  # People's Government: department updates
        'http://www.alt.gov.cn/zwxx/001006/listPage.html': 5,  # People's Government: public notices
    }

    def click_next_page(browser):
        # Try each pager control the crawled sites are known to use.
        try:
            browser.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
        except Exception:
            try:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下页'))
            except Exception:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下一页'))

    while True:
        browser = None
        try:
            city = name
            print(f"{name}程序已启动,稍等几秒")
            chromeOptions = webdriver.ChromeOptions()
            chromeOptions.add_experimental_option('w3c', False)
            chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
            chromeOptions.add_argument('--headless')  # run Chrome without a window
            # Raw string: the backslashes are path separators, not escapes.
            browser = webdriver.Chrome(
                options=chromeOptions,
                executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
            driver = browser
            driver.maximize_window()
            for url, pages in urls.items():
                driver.get(url)
                html_2 = etree.HTML(driver.page_source)
                # NOTE(review): sibling scrapers use len(...) + 1 here. Without
                # it the last row is never visited and ``i == lengt`` cannot be
                # true on a full page, so the pager is never clicked — confirm.
                length = len(html_2.xpath(xpath))
                po = 0  # becomes > 0 once a headline older than the cutoff is seen
                for page in range(1, pages + 1):
                    html_1 = etree.HTML(driver.page_source)
                    if po > 0:
                        break
                    lengt = len(html_1.xpath(xpath))  # rows on the page being parsed
                    for i in range(1, length):
                        xpath1 = xpath.replace('/ul/li', f'/ul/li[{i}]')
                        href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/a//text()")[0].strip() \
                            .replace('\n', '').replace('\t', '').replace('\r', '')
                        publictime = html_1.xpath(f"{xpath1}/span/text()")[0].strip().replace('/', '-')
                        select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # is the title stored already?
                        if select is None:
                            publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                            if publictime_times >= jiezhi_time:
                                chuli(publictime, href, driver, url, title, city, xpath1)
                            else:
                                po += 1
                                break
                        if i == lengt:
                            # Last row of this page: move on unless this is a
                            # short (final) page or the configured last page.
                            if lengt >= length - 1 and page != pages:
                                click_next_page(driver)
                            break
            return
        except Exception as e:
            print('阿勒泰\t', e)
            if browser is None:
                # Chrome never started; there is nothing to clean up or retry.
                raise
            try:
                browser.quit()  # also ends the chromedriver process
            except Exception as quit_error:
                print('阿勒泰\t', quit_error)
def tulufan(name):
    """Scrape the tlf.gov.cn news lists and store headlines that are not yet saved.

    Starts a headless Chrome session (published through the module-level
    ``driver``), walks the list pages configured in ``urls`` and passes every
    headline whose title is not found by ``Mysql.select_xw_nr1`` and whose
    date is not older than the module-level ``jiezhi_time`` to ``chuli``.
    Every fifth table row is skipped. The first headline older than the
    cutoff stops the crawl of that list.

    If anything fails after the browser has started, the error is printed,
    the browser is quit and the whole crawl restarts from the first list
    (retrying indefinitely, as before, but in a loop instead of by
    recursion). On success the browser session is left open, as in the
    original code.

    NOTE(review): the file's indentation was lost. The nesting used here
    (skipped rows bypass the paging check, cutoff ``else`` paired with the
    date check) is a reconstruction and should be confirmed.

    Args:
        name: City name used for log output, the duplicate lookup and the
            ``dijishi`` column.

    Returns:
        None.

    Raises:
        Exception: Whatever ``webdriver.Chrome`` raised when the browser
            could not be started at all (retrying cannot help in that case).
    """
    global driver

    xpath = "//table[@class='winstyle11251']/tbody/tr"
    urls = {
        'http://www.tlf.gov.cn/ztlm/tlfxw.htm': 35,  # People's Government: Turpan news
        'http://www.tlf.gov.cn/ztlm/gsggtz.htm': 19,  # People's Government: notices and announcements
        'http://www.tlf.gov.cn/ztlm/xsdt.htm': 16,  # People's Government: county/district updates
        'http://www.tlf.gov.cn/ztlm/bmdt.htm': 12,  # People's Government: department updates
        'http://www.tlf.gov.cn/ztlm/jnwxw.htm': 21,  # People's Government: news from inside and outside Xinjiang
    }

    def click_next_page(browser):
        # Try each pager control the crawled sites are known to use.
        try:
            browser.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
        except Exception:
            try:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下页'))
            except Exception:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下一页'))

    while True:
        browser = None
        try:
            city = name
            print(f"{name}程序已启动,稍等几秒")
            chromeOptions = webdriver.ChromeOptions()
            chromeOptions.add_experimental_option('w3c', False)
            chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
            chromeOptions.add_argument('--headless')  # run Chrome without a window
            # Raw string: the backslashes are path separators, not escapes.
            browser = webdriver.Chrome(
                options=chromeOptions,
                executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
            driver = browser
            driver.maximize_window()
            for url, pages in urls.items():
                driver.get(url)
                html_2 = etree.HTML(driver.page_source)
                # NOTE(review): sibling scrapers use len(...) + 1 here. Without
                # it ``i == lengt`` cannot be true on a full page, so the pager
                # is never clicked — confirm whether that is intended.
                length = len(html_2.xpath(xpath))
                po = 0  # becomes > 0 once a headline older than the cutoff is seen
                for page in range(1, pages + 1):
                    html_1 = etree.HTML(driver.page_source)
                    if po > 0:
                        break
                    lengt = len(html_1.xpath(xpath))  # rows on the page being parsed
                    for i in range(1, length):
                        if 'www' in url and i % 5 == 0:
                            # Every fifth row is skipped (presumably a separator row).
                            continue
                        xpath1 = xpath + f'[{i}]'
                        href = html_1.xpath(f"{xpath1}/td[2]/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/td[2]/a//text()")[0].strip() \
                            .replace('\n', '').replace('\t', '').replace('\r', '')
                        publictime = html_1.xpath(f"{xpath1}/td[3]/span/text()")[0].strip().replace('/', '-')
                        select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # is the title stored already?
                        if select is None:
                            publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                            if publictime_times >= jiezhi_time:
                                chuli(publictime, href, driver, url, title, city, xpath1)
                            else:
                                po += 1
                                break
                        if i == lengt:
                            # Last row of this page: move on unless this is a
                            # short (final) page or the configured last page.
                            if lengt >= length - 1 and page != pages:
                                click_next_page(driver)
                            break
            return
        except Exception as e:
            print('吐鲁番\t', e)
            if browser is None:
                # Chrome never started; there is nothing to clean up or retry.
                raise
            try:
                browser.quit()  # also ends the chromedriver process
            except Exception as quit_error:
                print('吐鲁番\t', quit_error)
def wulumuqui(name):
    """Scrape the Urumqi-related news lists and store headlines that are not yet saved.

    Starts a headless Chrome session (published through the module-level
    ``driver``), walks the list pages configured in ``urls`` and passes every
    headline whose title is not found by ``Mysql.select_xw_nr1`` and whose
    date is not older than the module-level ``jiezhi_time`` to ``chuli``.
    Two page layouts are handled: the ``zwfw`` site (rows start at index 2)
    and the ``www.urumqi.gov.cn`` lists (every sixth row is skipped). The
    first headline older than the cutoff stops the crawl of that list.

    If anything fails after the browser has started, the error is printed,
    the browser is quit and the whole crawl restarts from the first list
    (retrying indefinitely, as before, but in a loop instead of by
    recursion). On success the browser session is left open, as in the
    original code.

    NOTE(review): the file's indentation was lost. The nesting used here
    (skipped rows bypass the paging check, cutoff ``else`` paired with the
    date check) is a reconstruction and should be confirmed.

    Args:
        name: City name used for log output, the duplicate lookup and the
            ``dijishi`` column.

    Returns:
        None.

    Raises:
        Exception: Whatever ``webdriver.Chrome`` raised when the browser
            could not be started at all (retrying cannot help in that case).
    """
    global driver

    urls = {
        'http://zwfw.xinjiang.gov.cn/xinjiangggzy/zwgk/002004/tradingCommon.html': 2,  # Public Resources Center: notices
        'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=10005': 86,  # People's Government: Urumqi headlines
        'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=12034': 59,  # People's Government: autonomous-region headlines
        'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=12115': 61,  # People's Government: notices
        'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=10006': 2,  # People's Government: policy interpretation
    }

    def click_next_page(browser):
        # Try each pager control the crawled sites are known to use.
        try:
            browser.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
        except Exception:
            try:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下页'))
            except Exception:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下一页'))

    while True:
        browser = None
        try:
            city = name
            print(f"{name}程序已启动,稍等几秒")
            chromeOptions = webdriver.ChromeOptions()
            chromeOptions.add_experimental_option('w3c', False)
            chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
            chromeOptions.add_argument('--headless')  # run Chrome without a window
            # Raw string: the backslashes are path separators, not escapes.
            browser = webdriver.Chrome(
                options=chromeOptions,
                executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
            driver = browser
            driver.maximize_window()
            for url, pages in urls.items():
                driver.get(url)
                html_2 = etree.HTML(driver.page_source)
                is_zwfw = 'zwfw' in url
                if is_zwfw:
                    xpath = "//div[@class='ewb-colu-bd']/div/ul/li/div"
                    length = len(html_2.xpath(xpath)) + 2
                    ii = 2  # first row index visited on this layout
                else:
                    xpath = "//ul[@class='commonList_dot am-padding-top-sm am-padding-bottom-0 commonList_dot_Listnews']/li"
                    length = len(html_2.xpath(xpath)) + 1
                    ii = 1
                po = 0  # becomes > 0 once a headline older than the cutoff is seen
                for page in range(1, pages + 1):
                    html_1 = etree.HTML(driver.page_source)
                    if po > 0:
                        break
                    lengt = len(html_1.xpath(xpath))  # rows on the page being parsed
                    for i in range(ii, length):
                        if 'www' in url and i % 6 == 0:
                            # Every sixth row is skipped (presumably a separator row).
                            continue
                        # Only one of the two replacements matches, depending on the layout.
                        xpath1 = xpath.replace('/ul/li', f'/ul/li[{i}]').replace(']/li', f']/li[{i}]')
                        if is_zwfw:
                            href = html_1.xpath(f"{xpath1}/div/a/@href")[0].strip()
                            title = html_1.xpath(f"{xpath1}/div/a//text()")[0].strip() \
                                .replace('\n', '').replace('\t', '').replace('\r', '')
                            publictime = html_1.xpath(xpath1 + "/span/text()")[0].strip() \
                                .replace('/', '-').replace('年', '-').replace('月', '-').replace('日', '')
                        else:
                            href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                            title = html_1.xpath(f"{xpath1}/a//text()")[0].strip() \
                                .replace('\n', '').replace('\t', '').replace('\r', '')
                            publictime = html_1.xpath(f"{xpath1}/span/text()")[0].strip().replace('/', '-')
                        select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # is the title stored already?
                        if select is None:
                            publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                            if publictime_times >= jiezhi_time:
                                chuli(publictime, href, driver, url, title, city, xpath1)
                            else:
                                po += 1
                                break
                        if i == lengt:
                            # Last row of this page: move on unless this is a
                            # short (final) page or the configured last page.
                            if lengt >= length - 1 and page != pages:
                                click_next_page(driver)
                            break
            return
        except Exception as e:
            print('乌鲁木齐\t', e)
            if browser is None:
                # Chrome never started; there is nothing to clean up or retry.
                raise
            try:
                browser.quit()  # also ends the chromedriver process
            except Exception as quit_error:
                print('乌鲁木齐\t', quit_error)
def sansha(name):
    """Scrape the Sansha-related news lists and store headlines that are not yet saved.

    Starts a headless Chrome session (published through the module-level
    ``driver``), walks the list pages configured in ``urls`` and passes every
    headline whose title is not found by ``Mysql.select_xw_nr1`` and whose
    date is not older than the module-level ``jiezhi_time`` to ``chuli``.
    URLs containing ``zw`` are parsed with the table layout, all others with
    the ``list_1`` layout. The first headline older than the cutoff stops
    the crawl of that list.

    If anything fails after the browser has started, the error is printed,
    the browser is quit and the whole crawl restarts from the first list
    (retrying indefinitely, as before, but in a loop instead of by
    recursion). On success the browser session is left open, as in the
    original code.

    NOTE(review): the file's indentation was lost. The nesting used here
    (cutoff ``else`` paired with the date check, paging check evaluated for
    every row) is a reconstruction and should be confirmed.

    Args:
        name: City name used for log output, the duplicate lookup and the
            ``dijishi`` column.

    Returns:
        None.

    Raises:
        Exception: Whatever ``webdriver.Chrome`` raised when the browser
            could not be started at all (retrying cannot help in that case).
    """
    global driver

    urls = {
        'http://zw.hainan.gov.cn/ggzy/ssggzy/GGtzgg/index.jhtml': 1,  # Public Resources Center: notices
        'http://zw.hainan.gov.cn/ggzy/ssggzy/xwdt1/index.jhtml': 1,  # Public Resources Center: news updates
        'http://www.sansha.gov.cn/sansha/sysdt/nlist2_new.shtml': 42,  # People's Government: Sansha updates
        'http://www.sansha.gov.cn/sansha/zwfwxxgs/nlist2.shtml': 2,  # People's Government: Sansha information disclosure
        'http://www.hainan.gov.cn/hainan/zxjd/list3.shtml': 2,  # People's Government: latest interpretations
    }

    def click_next_page(browser):
        # Try each pager control the crawled sites are known to use.
        try:
            browser.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
        except Exception:
            try:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下页'))
            except Exception:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下一页'))

    while True:
        browser = None
        try:
            city = name
            print(f"{name}程序已启动,稍等几秒")
            chromeOptions = webdriver.ChromeOptions()
            chromeOptions.add_experimental_option('w3c', False)
            chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
            chromeOptions.add_argument('--headless')  # run Chrome without a window
            # Raw string: the backslashes are path separators, not escapes.
            browser = webdriver.Chrome(
                options=chromeOptions,
                executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
            driver = browser
            driver.maximize_window()
            for url, pages in urls.items():
                driver.get(url)
                html_2 = etree.HTML(driver.page_source)
                # NOTE(review): this substring test also matches the path
                # segment '/zwfwxxgs/' of the www.sansha.gov.cn list, which is
                # therefore parsed with the table layout — confirm intent.
                is_table_layout = 'zw' in url
                if is_table_layout:
                    # NOTE(review): 'tr[1]/td[' is not matched by the
                    # 'tr/td[' replacement below, so this path is never
                    # indexed per row — confirm against the live page.
                    xpath = "//table[@class='newtable']/tbody/tr[1]/td[2]/a"
                else:
                    xpath = "//div[@class='list_1']/ul/li"
                length = len(html_2.xpath(xpath)) + 1
                po = 0  # becomes > 0 once a headline older than the cutoff is seen
                for page in range(1, pages + 1):
                    html_1 = etree.HTML(driver.page_source)
                    if po > 0:
                        break
                    lengt = len(html_1.xpath(xpath))  # rows on the page being parsed
                    for i in range(1, length):
                        xpath1 = xpath.replace('/div/div/div/a', f'/div/div[{i}]/div/a') \
                            .replace('tr/td[', f'tr[{i}]/td[').replace('ul/li', f'ul/li[{i}]')
                        if is_table_layout:
                            href = html_1.xpath(f"{xpath1}/@href")[0].strip()
                            title = html_1.xpath(f"{xpath1}/text()")[0].strip() \
                                .replace('\n', '').replace('\t', '').replace('\r', '')
                            # The date sits in the cell after the link cell.
                            publictime = html_1.xpath(xpath1.replace('[2]/a', "[3]") + "/text()")[0].strip() \
                                .replace('\n', '').replace('/', '-').replace('发布时间:', '')
                        else:
                            href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                            title = html_1.xpath(f"{xpath1}/a//text()")[0].strip() \
                                .replace('\n', '').replace('\t', '').replace('\r', '')
                            publictime = html_1.xpath(f"{xpath1}/em/text()")[0].strip() \
                                .replace('\n', '').replace('[', '').replace(']', '').replace('日', '').replace('/', '-')
                        select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # is the title stored already?
                        if select is None:
                            publictime_times = int(time.mktime(time.strptime(
                                publictime.replace('[', '').replace(']', ''), "%Y-%m-%d")))
                            if publictime_times >= jiezhi_time:
                                chuli(publictime, href, driver, url, title, city, xpath1)
                            else:
                                po += 1
                                break
                        if i == lengt:
                            # Last row of this page: move on unless this is a
                            # short (final) page or the configured last page.
                            if lengt >= length - 1 and page != pages:
                                click_next_page(driver)
                            break
            return
        except Exception as e:
            print('三沙\t', e)
            if browser is None:
                # Chrome never started; there is nothing to clean up or retry.
                raise
            try:
                browser.quit()  # also ends the chromedriver process
            except Exception as quit_error:
                print('三沙\t', quit_error)
def haikou(name):
    """Scrape the Haikou news lists and store headlines that are not yet saved.

    Starts a headless Chrome session (published through the module-level
    ``driver``), walks the list pages configured in ``urls`` and passes every
    headline whose title is not found by ``Mysql.select_xw_nr1`` and whose
    date is not older than the module-level ``jiezhi_time`` to ``chuli``.
    The list XPath and the location of the date depend on which Haikou
    sub-site (``ggzy``, ``drc``, ``hkjsj`` or ``www``) the URL belongs to.
    The first headline older than the cutoff stops the crawl of that list.

    If anything fails after the browser has started, the error is printed,
    the browser is quit and the whole crawl restarts from the first list
    (retrying indefinitely, as before, but in a loop instead of by
    recursion). On success the browser session is left open, as in the
    original code.

    NOTE(review): the file's indentation was lost. The nesting used here
    (cutoff ``else`` paired with the date check, paging check evaluated for
    every row) is a reconstruction and should be confirmed.

    Args:
        name: City name used for log output, the duplicate lookup and the
            ``dijishi`` column.

    Returns:
        None.

    Raises:
        Exception: Whatever ``webdriver.Chrome`` raised when the browser
            could not be started at all (retrying cannot help in that case).
    """
    global driver

    urls = {
        'http://ggzy.haikou.gov.cn/ywdt/zwdt/02/2_201603-002_1.html': 3,  # Public Resources Center: government affairs updates
        'http://ggzy.haikou.gov.cn/xxgk/gsgg/02/3_201603-005_1.html': 2,  # Public Resources Center: public notices
        'http://ggzy.haikou.gov.cn/xxgk/zcwj/02/3_201603-010_1.html': 1,  # Public Resources Center: policy documents
        'http://ggzy.haikou.gov.cn/jdhy/zxjd/02/4_201603-045_1.html': 1,  # Public Resources Center: latest interpretations
        'http://www.haikou.gov.cn/xxgk/szfbjxxgk/zcfg/szfxzgfxwj/': 3,  # People's Government: municipal normative documents
        'http://www.haikou.gov.cn/xxgk/szfbjxxgk/zcfg/bmxzgfxwj/': 1,  # People's Government: departmental normative documents
        'http://www.haikou.gov.cn/zfdt/xbzwdt/gqrd/': 36,  # People's Government: government updates > district updates
        'http://www.haikou.gov.cn/zfdt/xbzwdt/bmdt/': 22,  # People's Government: government updates > department updates
        'http://www.haikou.gov.cn/xxgk/szfbjxxgk/ggtz/': 19,  # People's Government: public notices
        'http://www.haikou.gov.cn/tzhk/zcjy/': 1,  # People's Government: policy opportunities
        'http://drc.haikou.gov.cn/ywdt/gzdt/': 11,  # Development and Reform Commission: work updates
        'http://drc.haikou.gov.cn/xxxgk/gsgg/': 2,  # Development and Reform Commission: public notices
        'http://hkjsj.haikou.gov.cn/xxgk1/gsgg/': 5,  # Housing and Construction Bureau: public notices
        'http://hkjsj.haikou.gov.cn/xxgk1/zcwj/bmwj/': 1,  # Housing and Construction Bureau: departmental documents
        'http://hkjsj.haikou.gov.cn/jdhy/zxjd/': 1,  # Housing and Construction Bureau: latest interpretations
    }

    def click_next_page(browser):
        # Try each pager control the crawled sites are known to use.
        try:
            browser.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
        except Exception:
            try:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下页'))
            except Exception:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下一页'))

    while True:
        browser = None
        try:
            city = name
            print(f"{name}程序已启动,稍等几秒")
            chromeOptions = webdriver.ChromeOptions()
            chromeOptions.add_experimental_option('w3c', False)
            chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
            chromeOptions.add_argument('--headless')  # run Chrome without a window
            # Raw string: the backslashes are path separators, not escapes.
            browser = webdriver.Chrome(
                options=chromeOptions,
                executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
            driver = browser
            driver.maximize_window()
            for url, pages in urls.items():
                driver.get(url)
                html_2 = etree.HTML(driver.page_source)
                # NOTE(review): for the three '/div/div/a' layouts none of the
                # replacements used to build ``xpath1`` matches, so every
                # iteration reads the first anchor on the page — confirm
                # against the live pages before changing.
                if 'ggzy' in url:
                    xpath = "//div[@id='list_div']/div/div/a"
                elif 'drc' in url:
                    xpath = "//div[@class='con-right']/div/div/a"
                elif 'hkjsj' in url:
                    xpath = "//div[@class='con-right fr']/div/div/a"
                else:
                    xpath = "//div[@class='list-c']/ul/li/p[1]/a"
                length = len(html_2.xpath(xpath)) + 1
                po = 0  # becomes > 0 once a headline older than the cutoff is seen
                for page in range(1, pages + 1):
                    html_1 = etree.HTML(driver.page_source)
                    if po > 0:
                        break
                    lengt = len(html_1.xpath(xpath))  # rows on the page being parsed
                    for i in range(1, length):
                        xpath1 = xpath.replace('/div/div/div/a', f'/div/div[{i}]/div/a') \
                            .replace('tr/td[', f'tr[{i}]/td[').replace('ul/li', f'ul/li[{i}]')
                        # Every configured URL contains 'ggzy', 'www', 'drc' or
                        # 'hkjsj', so ``xpath1`` always addresses the anchor
                        # itself; the former li/em fallback was unreachable.
                        href = html_1.xpath(f"{xpath1}/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/text()")[0].strip() \
                            .replace('\n', '').replace('\t', '').replace('\r', '')
                        if 'www' in url:
                            date_xpath = xpath1.replace('[1]/a', "[2]/span")
                        elif 'hkjsj' in url:
                            date_xpath = xpath1.replace('div/a', "table/tbody/tr/td[1]")
                        else:
                            date_xpath = xpath1.replace('div/a', "div/span")
                        publictime = html_1.xpath(date_xpath + "/text()")[0].strip() \
                            .replace('\n', '').replace('/', '-').replace('发布时间:', '')
                        select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # is the title stored already?
                        if select is None:
                            publictime_times = int(time.mktime(time.strptime(
                                publictime.replace(' ', '').replace('[', '').replace(']', ''), "%Y-%m-%d")))
                            if publictime_times >= jiezhi_time:
                                chuli(publictime, href, driver, url, title, city, xpath1)
                            else:
                                po += 1
                                break
                        if i == lengt:
                            # Last row of this page: move on unless this is a
                            # short (final) page or the configured last page.
                            if lengt >= length - 1 and page != pages:
                                click_next_page(driver)
                            break
            return
        except Exception as e:
            print('海口\t', e)
            if browser is None:
                # Chrome never started; there is nothing to clean up or retry.
                raise
            try:
                browser.quit()  # also ends the chromedriver process
            except Exception as quit_error:
                print('海口\t', quit_error)
def hainan(name):
    """Scrape the Hainan provincial news lists and store headlines that are not yet saved.

    Starts a headless Chrome session (published through the module-level
    ``driver``), walks the list pages configured in ``urls`` and passes every
    headline whose title is not found by ``Mysql.select_xw_nr1`` and whose
    date is not older than the module-level ``jiezhi_time`` to ``chuli``.
    The list XPath and the location of the date depend on substrings of the
    URL (``zw``, ``plan``, ``zjt``, ``www``). The first headline older than
    the cutoff stops the crawl of that list.

    If anything fails after the browser has started, the error is printed,
    the browser is quit and the whole crawl restarts from the first list
    (retrying indefinitely, as before, but in a loop instead of by
    recursion). On success the browser session is left open, as in the
    original code.

    NOTE(review): the file's indentation was lost. The nesting used here
    (skipped rows bypass the paging check, cutoff ``else`` paired with the
    date check) is a reconstruction and should be confirmed.

    Args:
        name: City name used for log output, the duplicate lookup and the
            ``dijishi`` column.

    Returns:
        None.

    Raises:
        Exception: Whatever ``webdriver.Chrome`` raised when the browser
            could not be started at all (retrying cannot help in that case).
    """
    global driver

    urls = {
        'http://zw.hainan.gov.cn/ggzy/ggzy/tzgg/index.jhtml': 3,  # Public Resources Center: notices
        'http://zw.hainan.gov.cn/ggzy/ggzy/xwdt/index.jhtml': 1,  # Public Resources Center: news updates
        'https://www.hainan.gov.cn/hainan/0101/list3_1.shtml': 16,  # People's Government: public notices
        'https://www.hainan.gov.cn/common/search/2f11247a5c2c49eeb3cdbb00a8178bc7?_isAgg=false&_pageSize=12&_template=hainan&_channelName=&page=1': 314,  # People's Government: government affairs updates
        'https://www.hainan.gov.cn/common/search/55acf8539596d25624059980986aaa78?_isAgg=false&_pageSize=12&_template=hainan&_channelName=&page=1': 326,  # People's Government: Hainan today
        'https://www.hainan.gov.cn/common/search/82cdb5b25e514a1bba6429aef621ce6c?_isAgg=false&_pageSize=12&_template=hainan&sort=publishedTime&_channelName=&page=1': 189,  # People's Government: provincial government headlines
        'https://www.hainan.gov.cn/hainan/zxjd/list3.shtml': 2,  # People's Government: policy interpretation > latest interpretations
        'http://plan.hainan.gov.cn/sfgw/zwdt/list3.shtml': 5,  # Development and Reform Commission: headlines > government affairs updates
        'http://plan.hainan.gov.cn/sfgw/gzdt/list3.shtml': 20,  # Development and Reform Commission: work updates
        'http://plan.hainan.gov.cn/sfgw/zxdt/list3.shtml': 50,  # Development and Reform Commission: latest updates
        'http://zjt.hainan.gov.cn/szjt/zwdt/tablist.shtml': 36,  # Housing and Construction Department: government affairs updates
        'http://zjt.hainan.gov.cn/szjt/sxxx/iframelist_sx.shtml': 15,  # Housing and Construction Department: city/county information
    }

    def click_next_page(browser):
        # Try each pager control the crawled sites are known to use.
        try:
            browser.find_element_by_xpath("//a[@class='default_pgBtn default_pgNext']").click()
        except Exception:
            try:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下页'))
            except Exception:
                browser.execute_script("arguments[0].click();", browser.find_element_by_link_text('下一页'))

    while True:
        browser = None
        try:
            city = name
            print(f"{name}程序已启动,稍等几秒")
            chromeOptions = webdriver.ChromeOptions()
            chromeOptions.add_experimental_option('w3c', False)
            chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
            chromeOptions.add_argument('--headless')  # run Chrome without a window
            # Raw string: the backslashes are path separators, not escapes.
            browser = webdriver.Chrome(
                options=chromeOptions,
                executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
            driver = browser
            driver.maximize_window()
            for url, pages in urls.items():
                driver.get(url)
                html_2 = etree.HTML(driver.page_source)
                # NOTE(review): 'zw' is a substring test, so it also matches
                # the '/zwdt/' path segment of the plan.* and zjt.* lists,
                # which are then parsed with the table layout — confirm intent.
                has_zw = 'zw' in url
                is_plan = 'plan' in url
                if has_zw:
                    xpath = "//table[@class='newtable']/tbody/tr/td[2]/a"
                elif is_plan:
                    xpath = "//div[@class='Fivelist']/ul/li"
                elif 'zjt' in url:
                    xpath = "//div[@class='con-right']/div/div/a"
                else:
                    xpath = "//div[@class='cen-div-1 mar-t']/div/div/div/a"
                anchor_layout = has_zw or 'www' in url or 'zjt' in url
                length = len(html_2.xpath(xpath)) + 1
                po = 0  # becomes > 0 once a headline older than the cutoff is seen
                for page in range(1, pages + 1):
                    html_1 = etree.HTML(driver.page_source)
                    if po > 0:
                        break
                    lengt = len(html_1.xpath(xpath))  # rows on the page being parsed
                    for i in range(1, length):
                        if is_plan and i % 6 == 0:
                            # Every sixth row is skipped (presumably a separator row).
                            continue
                        # NOTE(review): unlike the sibling scrapers there is no
                        # 'ul/li' replacement here, and the 'con-right' path
                        # matches neither pattern, so those two layouts are
                        # never indexed per row — confirm against the pages.
                        xpath1 = xpath.replace('/div/div/div/a', f'/div/div[{i}]/div/a') \
                            .replace('tr/td[', f'tr[{i}]/td[')
                        if anchor_layout:
                            href = html_1.xpath(f"{xpath1}/@href")[0].strip()
                            title = html_1.xpath(f"{xpath1}/text()")[0].strip() \
                                .replace('\n', '').replace('\t', '').replace('\r', '')
                            if has_zw:
                                # The date sits in the cell after the link cell.
                                publictime = html_1.xpath(xpath1.replace('[2]/a', "[3]") + "/text()")[0].strip() \
                                    .replace('/', '-').replace('\n', '')
                            else:
                                publictime = html_1.xpath(
                                    xpath1.replace('div/a', "table/tbody/tr/td") + "/text()")[0].strip() \
                                    .replace('\n', '').replace('/', '-').replace('发布时间: ', '')
                        else:
                            href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                            title = html_1.xpath(f"{xpath1}/a//text()")[0].strip() \
                                .replace('\n', '').replace('\t', '').replace('\r', '')
                            publictime = html_1.xpath(f"{xpath1}/em/text()")[0].strip() \
                                .replace('\n', '').replace('[', '').replace(']', '').replace('日', '').replace('/', '-')
                        select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # is the title stored already?
                        if select is None:
                            publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                            if publictime_times >= jiezhi_time:
                                chuli(publictime, href, driver, url, title, city, xpath1)
                            else:
                                po += 1
                                break
                        if i == lengt:
                            # Last row of this page: move on unless this is a
                            # short (final) page or the configured last page.
                            if lengt >= length - 1 and page != pages:
                                click_next_page(driver)
                            break
            return
        except Exception as e:
            print('海南\t', e)
            if browser is None:
                # Chrome never started; there is nothing to clean up or retry.
                raise
            try:
                browser.quit()  # also ends the chromedriver process
            except Exception as quit_error:
                print('海南\t', quit_error)