def pingxiang2():
    # Scrape the Pingxiang DPC (Development & Reform Commission) news list
    # (classid=15) and insert previously-unseen articles into the MySQL store.
    url='http://pxdpc.pingxiang.gov.cn/list.asp?classid=15'
    tt = requests.get(url).content.decode('utf-8')
    # Total page count, parsed from text like "每页20条, 1/37页" -> "37"
    pages = re.findall('每页20条, 1/(\d+)页', tt)[0]
    print(f'共{pages}页')
    for page in range(1, int(pages) + 1):
        url1=url+f'&p={page}'
        # Flatten the listing HTML so the regexes can match across line breaks
        tt = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
        # Each match: (relative detail href, title, [date])
        contents = re.findall(' <a href="(.*?)" target="_blank">(.*?)</a></td> <td width="11%" class="font_hui12">\[(.*?)\]</td>', tt)
        for content in contents:
            linkurl = 'http://pxdpc.pingxiang.gov.cn/' + content[0]
            # Rewrite relative /upload/ asset paths to absolute URLs before parsing
            detail_res = requests.get(linkurl).content.decode('utf-8').replace('/upload/','http://pxdpc.pingxiang.gov.cn/upload/')
            Html = etree.HTML(detail_res)
            # qufen = '发改委'+Html.xpath("//table[1]/tbody/tr/td[@class='font_hui12']/a[3]")[0]  # current column
            div1 = Html.xpath("/html/body/div[5]")[0]  # article body container
            # Quotes are squashed to spaces, presumably so the HTML is safe to
            # embed in a SQL string -- verify Mysql.* does proper escaping
            infocontent = html.unescape(etree.tostring(div1, method='html').decode()).replace("'", " ").replace('"', ' ')  # html
            title = content[1]
            publicTime = content[2].replace(' ','')
            select = Mysql.select_xinwen(title=title)  # check whether the title already exists
            if len(select)==0:
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                             publicTime=publicTime, linkurl=linkurl, title=title,
                                             dataResource='', yewuType='发改委', infoType='', infoState='',
                                             isok='', isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
            else:
                print(f'第{page}页标题存在')
        print(f'第{page}页已爬完')
def zhangjiakou():
    """Scrape Zhangjiakou municipal news (city headlines + departmental work
    columns) and insert previously-unseen articles into the MySQL store.

    Retries the whole crawl from the beginning whenever any request/parse
    error occurs. The original restarted via unbounded recursion
    (``return zhangjiakou()``), which eventually overflows the stack; a
    ``while`` loop achieves the same retry semantics safely.
    """
    while True:
        try:
            for page in range(1, 374):
                url1s = [
                    f'http://www.zjk.gov.cn/syscolumn/dt/zjkyw/index_{page}.html',  # Zhangjiakou headlines
                    f'http://www.zjk.gov.cn/bmgz_frame1.jsp?pages={page}',  # departmental work
                ]
                if page == 1:
                    # The column's un-numbered landing page; fetch it once
                    # instead of on every page iteration as the original did.
                    url1s.insert(0, 'http://www.zjk.gov.cn/syscolumn/dt/zjkyw/index.html')
                for url1 in url1s:
                    contents1 = requests.get(
                        url1, proxies=ipmax()).content.decode('utf-8').replace(
                            '\n', '').replace('\r', '').replace('\t', '')
                    # The two columns use different list markup, hence two patterns.
                    contents = [
                        re.findall(
                            '"hg" href="(.*?)" target="_blank" title="(.*?)">(.*?)</a></td> <td width="80" class="cdate">\[(.*?)\]</td>',
                            contents1),
                        re.findall(
                            'hg" href="(.*?)" title="(.*?)" target="_blank">(.*?)</a></td> <td width="100" class="cdate">\[(.*?)\]</td>',
                            contents1),
                    ]
                    # Domain prefix ('zjk') extracted once per listing URL.
                    uu = re.findall('www.(.*?).gov', url1)[0]
                    for matched in contents:
                        # Process every article on the page -- the original
                        # only kept the first match of each pattern.
                        for content in matched:
                            linkurl = f'http://www.{uu}.gov.cn' + content[0].strip()
                            detail_res = requests.get(linkurl).content.decode('utf-8')
                            Html = etree.HTML(detail_res)
                            # Whole detail page stored; quotes squashed to spaces.
                            infocontent = html.unescape(
                                etree.tostring(Html, method='html').decode()).replace(
                                    "'", " ").replace('"', ' ')
                            title = content[1].strip()
                            publicTime = content[3].strip()
                            select = Mysql.select_xinwen(title=title)  # already stored?
                            # `not select` covers both None and an empty list.
                            # The original compared `select == None`, which is
                            # always False when a list is returned (the sibling
                            # scrapers use len(select) == 0), so nothing was
                            # ever inserted.
                            if not select:
                                uid = uuid.uuid4()
                                # Fixed copy-paste data: this is Zhangjiakou
                                # (075000), not Chengde (067000/承德市).
                                Mysql.insert_xinwen_baseinfo(uid=uid,
                                                             regionCode='075000',
                                                             regionName='河北省',
                                                             areaRegion='张家口市',
                                                             publicTime=publicTime,
                                                             linkurl=linkurl,
                                                             title=title,
                                                             dataResource='',
                                                             yewuType='',
                                                             infoType='',
                                                             infoState='',
                                                             isok='',
                                                             isdeal='')
                                Mysql.insert_xinwen_detailinfo(
                                    uid=uid, infocontent=infocontent)
                            else:
                                print('标题存在')
            return  # full crawl completed without error
        except Exception as e:
            # Log and restart (label fixed: this is Zhangjiakou, not Bengbu).
            print('张家口\t', e)
def shijiazhuang():
    """Scrape Shijiazhuang municipal news columns (departmental + county/district)
    and insert previously-unseen articles into the MySQL store.
    """
    url1s = [
        # 'http://www.sjz.gov.cn/column.jsp?id=1490076462404',  # municipal headlines
        'http://www.sjz.gov.cn/column.jsp?id=1490076534390',  # departmental news
        'http://www.sjz.gov.cn/column.jsp?id=1490076571666',  # county/district news
    ]
    for url1 in url1s:
        tt = requests.get(url1).content.decode('gb2312')
        # Page count parsed from the paginator widget.
        pages = re.findall("title='每页显示.*记录'>共.*条(\d+)页", tt)[0]
        for page in range(1, int(pages) + 1):
            # '&current' had been mojibake'd to '¤t' (the HTML entity
            # '&curren;') in the source -- restored here so pagination works.
            url = f'{url1}&current={page}'
            # Fetch the paginated URL. The original fetched url1 here,
            # re-reading page 1 on every iteration so later pages were
            # never scraped.
            contents1 = requests.get(url).content.decode('gb2312').replace(
                '\n', '').replace('\r', '').replace('\t', '')
            # Isolate the list container, then pull (href, title, text, date).
            contents2 = re.findall(
                '1 list_2"><ul>(.*?)/ul></div></div><div style="text-align:',
                contents1)
            contents = re.findall(
                'href="(.*?)" target="_blank" style="line-height:30px;" title="(.*?)">(.*?)</a> <span class="date" style="color:#898989">(.*?)</span>',
                contents2[0])
            for content in contents:
                linkurl = 'http://www.sjz.gov.cn' + content[0]
                detail_res = requests.get(linkurl).content.decode('gb2312')
                Html = etree.HTML(detail_res)
                div = Html.xpath("/html/body/div/div[2]")[0]  # article body
                infocontent = html.unescape(
                    etree.tostring(div, method='html').decode()).replace(
                        "'", " ").replace('"', ' ')
                title = content[1]
                publicTime = content[3]
                select = Mysql.select_xinwen(title=title)  # already stored?
                # `not select` covers both None and an empty list; the original
                # compared `select == None`, which is always False for a list
                # return, so no article was ever inserted.
                if not select:
                    uid = uuid.uuid4()
                    Mysql.insert_xinwen_baseinfo(uid=uid,
                                                 regionCode='050000',
                                                 regionName='河北省',
                                                 areaRegion='石家庄市',
                                                 publicTime=publicTime,
                                                 linkurl=linkurl,
                                                 title=title,
                                                 dataResource='',
                                                 yewuType='',
                                                 infoType='',
                                                 infoState='',
                                                 isok='',
                                                 isdeal='')
                    Mysql.insert_xinwen_detailinfo(uid=uid,
                                                   infocontent=infocontent)
                else:
                    print('标题存在')
    print('gg')
def chengde():
    """Scrape the Chengde municipal news column ("本市要闻") and insert
    previously-unseen articles into the MySQL store.

    The page count (374) is hard-coded; commented-out URLs are sibling
    columns with their page counts noted inline.
    """
    for page in range(1, 374):
        url1s = [
            f'http://www.chengde.gov.cn/col/col360/index.html?uid=1412&pageNum={page}',  # city headlines, 1361 pages
            # 'http://www.chengde.gov.cn/col/col361/index.html?uid=1412&pageNum={page}',  # outside media on Chengde, 367
            # 'http://www.chengde.gov.cn/col/col362/index.html?uid=1412&pageNum={page}',  # outside media on Chengde, 374
            # 'http://www.chengde.gov.cn/col/col364/index.html?uid=1412&pageNum={page}',  # public notices, 27
        ]
        for url1 in url1s:
            contents1 = requests.get(url1).content.decode('utf-8').replace(
                '\n', '').replace('\r', '').replace('\t', '')
            # Each <span><a ...> fragment holds one list entry.
            contents = re.findall('pan><a (.*?)</span>', contents1)
            for content in contents:
                # co: (relative href, title); co1: (anchor text, date)
                co = re.findall("href=\\'(.*?)\\'title=\\'(.*?)\\'target",
                                content)[0]
                co1 = re.findall(
                    'target="_blank">(.*?)</a><span class="bt-data-time"style="font-size:14px;">\[(.*?)\]',
                    content)[0]
                linkurl = 'http://www.chengde.gov.cn' + co[0]
                detail_res = requests.get(linkurl).content.decode('utf-8')
                Html = etree.HTML(detail_res)
                # div = Html.xpath("/html/body/div/div[2]")[0]
                # Whole detail page stored; quotes squashed to spaces.
                infocontent = html.unescape(
                    etree.tostring(Html, method='html').decode()).replace(
                        "'", " ").replace('"', ' ')
                title = co[1]
                publicTime = co1[1]
                select = Mysql.select_xinwen(title=title)  # already stored?
                # `not select` covers both None and an empty list; the original
                # compared `select == None`, which is always False for a list
                # return, so no article was ever inserted.
                if not select:
                    uid = uuid.uuid4()
                    Mysql.insert_xinwen_baseinfo(uid=uid,
                                                 regionCode='067000',
                                                 regionName='河北省',
                                                 areaRegion='承德市',
                                                 publicTime=publicTime,
                                                 linkurl=linkurl,
                                                 title=title,
                                                 dataResource='',
                                                 yewuType='',
                                                 infoType='',
                                                 infoState='',
                                                 isok='',
                                                 isdeal='')
                    Mysql.insert_xinwen_detailinfo(uid=uid,
                                                   infocontent=infocontent)
                else:
                    print('标题存在')
def shengyw():
    """Scrape Hebei provincial government headline news and insert
    previously-unseen articles into the MySQL store.
    """
    url = 'http://www.hebei.gov.cn/hebei/13863674/13871225/index.html'
    tt = requests.get(url).content.decode('utf-8')
    # Total page count from the paginator's totalpage attribute.
    pages = re.findall('totalpage="(\d+)"', tt)[0]
    for page in range(1, int(pages) + 1):
        # '&currentPage' had been mojibake'd to '¤tPage' (the HTML entity
        # '&curren;') in the source -- restored here so pagination works.
        url1 = f'http://www.hebei.gov.cn/eportal/ui?pageId=13871225&currentPage={page}'
        tt = requests.get(url1).content.decode('utf-8').replace(
            '\n', '').replace('\r', '').replace('\t', '')
        # Each match: (relative href, title attr, anchor text, date)
        contents = re.findall(
            '<a href="(.*?)" onclick="void\(0\)" target="_blank" title="(.*?)" istitle="true">(.*?)</a> <span class="date" style="font-size: 12px;color: #898989;padding-left: 5px;">(.*?)</span> </li>',
            tt)
        for content in contents:
            linkurl = 'http://www.hebei.gov.cn' + content[0]
            detail_res = requests.get(linkurl).content.decode('utf-8')
            Html = etree.HTML(detail_res)
            # Article body lives under this site-generated container id.
            div = Html.xpath(
                '//*[@id="fadd83fc626241d9937b20353ca675eb"]/div[2]')[0]
            infocontent = html.unescape(
                etree.tostring(div, method='html').decode()).replace(
                    "'", " ").replace('"', ' ')
            title = content[1]
            publicTime = content[3]
            select = Mysql.select_xinwen(title=title)  # already stored?
            if len(select) == 0:
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid,
                                             regionCode='050000-075000',
                                             regionName='河北省',
                                             areaRegion='河北省',
                                             publicTime=publicTime,
                                             linkurl=linkurl,
                                             title=title,
                                             dataResource='',
                                             yewuType='',
                                             infoType='',
                                             infoState='',
                                             isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid,
                                               infocontent=infocontent)
            else:
                print('标题存在')
def guo():  # State Council news
    """Crawl the State Council news column on sousuo.gov.cn and persist
    every article whose title is not already in the database.
    """
    index_url = 'http://sousuo.gov.cn/column/19423/0.htm'
    first_page = requests.get(index_url).content.decode('utf-8')
    total_pages = int(re.findall('共(\d+)页', first_page)[0])
    for page_no in range(total_pages):
        page_url = f'http://sousuo.gov.cn/column/19423/{page_no}.htm'
        raw = requests.get(page_url).content.decode('utf-8')
        # Flatten whitespace so the list regex matches across line breaks.
        cleaned = raw.replace('\n', '').replace('\r', '').replace('\t', '')
        items = re.findall(
            '<li><h4><a href="(.*?)" target="_blank">(.*?)</a><span class="date">(.*?)</span></h4></li>',
            cleaned)
        for href, item_title, item_date in items:
            linkurl = href  # hrefs in this column are already absolute
            detail_html = requests.get(linkurl).content.decode('utf-8')
            tree = etree.HTML(detail_html)
            body_div = tree.xpath('/html/body/div[3]/div[2]/div[1]')[0]
            rendered = etree.tostring(body_div, method='html').decode()
            # Quotes are squashed to spaces before storage.
            infocontent = html.unescape(rendered).replace("'", " ").replace('"', ' ')
            existing = Mysql.select_xinwen(title=item_title)  # already stored?
            if len(existing) == 0:
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid,
                                             regionCode='000000',
                                             regionName='国务院',
                                             areaRegion='全国',
                                             publicTime=item_date,
                                             linkurl=linkurl,
                                             title=item_title,
                                             dataResource='',
                                             yewuType='',
                                             infoType='',
                                             infoState='',
                                             isok='',
                                             isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
            else:
                print('标题存在')
def pingxiang():
    # Scrape the Jiangxi Provincial Public Resources Trading Center news
    # columns (sub-columns 007001..007003) and store articles not seen before.
    # On any error it logs and restarts itself recursively.
    try:
        for num in range(1,4):
            url=f'http://www.jxsggzy.cn/web/xwzx/00700{num}/1.html'
            tt = requests.get(url).content.decode('utf-8')
            # Page count from the paginator, e.g. 'id="index">1/12</span>'
            pages = re.findall('id="index">1/(\d+)</span>', tt)[0]
            print(f'江西省公共交易中心共{pages}页')
            for page in range(1, int(pages) + 1):
                url1=url.replace('1.html',f'{page}.html')
                tt = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
                # Each match: (href, title attr, anchor text, date)
                contents = re.findall('<li class="ewb-list-node clearfix"> <a href="(.*?)" title="(.*?)" target="_blank" class="ewb-list-name">(.*?)</a> <span class="ewb-list-date">(.*?)</span> ', tt)
                # NOTE(review): iteration starts at index 1, so the first item
                # of every page is skipped -- confirm this is intentional.
                for con in range(1,len(contents)):
                    content=contents[con]
                    title = content[1]
                    publicTime = content[3]
                    linkurl = 'http://www.jxsggzy.cn' + content[0]
                    if re.findall('pdf|doc',content[0]):
                        # Attachment article: store an <embed> tag and download
                        # a local copy. NOTE(review): the file is saved with a
                        # .jpg extension regardless of its real type (pdf/doc),
                        # and the Windows path is hard-coded -- verify.
                        infocontent='<embed src="'+linkurl+'" >'
                        urllib.request.urlretrieve(quote(linkurl, safe='/:?='), r'D:\lm\xinwen\江西省公共资源交易中心\\' + title + '.jpg')
                    else:
                        detail_res = requests.get(linkurl).content.decode('utf-8')
                        Html = etree.HTML(detail_res)
                        # Column label from the breadcrumb (computed but unused
                        # beyond this scope).
                        qufen='江西省公共交易中心'+Html.xpath("//p[@class='ewb-location-content']/span/text()")[0]
                        infocontent = html.unescape(etree.tostring(Html, method='html').decode()).replace("'", " ").replace( '"', ' ')  # html
                    select = Mysql.select_xinwen(title=title)  # check whether the title already exists
                    if len(select)==0:
                        uid = uuid.uuid4()
                        Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                                     publicTime=publicTime, linkurl=linkurl, title=title,
                                                     dataResource='', yewuType='江西省公共交易中心', infoType='', infoState='',
                                                     isok='', isdeal='')
                        Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
                        print(f'{num} 标题【{title}】写入成功')
                    else:
                        print(f'{num} 标题【{title}】存在')
                print('-'*50+f'{num} 江西省公共交易中心第{page}页已写完'+'-'*50)
    except Exception as e:
        # NOTE(review): unbounded recursive retry -- may overflow the stack on
        # persistent failures; the log label says Bengbu, not Pingxiang.
        print('蚌埠\t', e)
        return pingxiang()
def ganzhou():
    # Scrape Ganzhou municipal news columns and record new article metadata.
    # NOTE(review): the detail-page fetch is commented out, so records are
    # inserted with an EMPTY infocontent -- confirm this is intentional.
    # On any error it logs and restarts itself recursively.
    try:
        url1s=[
            'http://www.ganzhou.gov.cn/c100022/list.shtml',  # government news
            'http://www.ganzhou.gov.cn/c100023/list.shtml',  # notices
            # 'http://www.ganzhou.gov.cn/c100024/list_bmqx.shtml',  # departmental news
            # 'http://www.ganzhou.gov.cn/c100025/list_bmqx.shtml',  # county/district news
            # 'http://www.ganzhou.gov.cn/c100026/list.shtml',  # public-service tips
            # 'http://www.ganzhou.gov.cn/c100027/list.shtml',  # central-media picks
            # 'http://www.ganzhou.gov.cn/c100028/list.shtml',  # provincial-media picks
            # 'http://www.ganzhou.gov.cn/c100029/list.shtml',  # outside media
            # 'http://www.ganzhou.gov.cn/c100030/list.shtml',  # press conferences
            # 'http://www.ganzhou.gov.cn/c100032/list.shtml',  # special topics
        ]
        for url1 in url1s:
            print("程序已启动,稍等几秒")
            for page in range(1,37):
                if page==1:
                    # First page uses the bare list URL (through a proxy).
                    tt = requests.get(url1,proxies=ipmax()).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
                else:
                    # Subsequent pages append _{page} to the list file name.
                    url2=url1.replace('list.shtml',f'list_{page}.shtml').replace('bmqx.shtml',f'bmqx_{page}.shtml')
                    tt = requests.get(url2).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
                # Isolate the list container, then pull (href, title, text, date).
                contents1 = re.findall('<div class="bd">(.*?)text/javascript', tt)
                contents = re.findall('<li><a href="(.*?)" target="_blank" title=\'(.*?)\' >(.*?)</a><span>(.*?)</span>',contents1[0])
                for content in contents:
                    if re.findall('mp.weixin',content[0]):
                        # WeChat articles link off-site with an absolute URL.
                        linkurl=content[0]
                        # detail_res = requests.get(linkurl,proxies=ipmax()).content.decode('utf-8')
                        # Html = etree.HTML(detail_res)
                        # div = Html.xpath("//div[@id='page-content']")[0]
                        # infocontent = html.unescape(etree.tostring(div, method='html').decode()).replace("'", " ").replace('"', ' ')
                    else:
                        linkurl = 'http://www.ganzhou.gov.cn' + content[0]
                        # detail_res = requests.get(linkurl,proxies=ipmax()).content.decode('utf-8')
                        # Html = etree.HTML(detail_res)
                        # div = Html.xpath('/html/body/div[4]')[0]
                        # infocontent = html.unescape(etree.tostring(div, method='html').decode()).replace("'", " ").replace('"', ' ')
                    # Normalize full-width colon in titles.
                    title = content[1].replace(':',':')
                    publicTime = content[3]
                    s = publicTime.replace('/', '-')
                    t = int(datetime.strptime(s, '%Y-%m-%d').timestamp())
                    # Only keep articles dated on/after the cutoff epoch
                    # 1570896000 (2019-10-13); older ones stop this page.
                    if t >= 1570896000:
                        select = Mysql.select_xinwen(title=title)  # check whether the title already exists
                        if len(select) == 0:
                            uid = uuid.uuid4()
                            Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='赣州市',
                                                         publicTime=publicTime, linkurl=linkurl, title=title,
                                                         dataResource='', yewuType='人民政府', infoType='', infoState='',
                                                         isok='', isdeal='')
                            Mysql.insert_xinwen_detailinfo(uid=uid, infocontent='')
                            print(f'标题【{title}】写入成功')
                        else:
                            print(f'标题【{title}】存在')
                    else:
                        # NOTE(review): this breaks only the article loop;
                        # the page loop keeps going to even older pages --
                        # confirm whether it should stop paging here too.
                        break
                print('-' * 50 + f'赣州市第{page}页已完成' + '-' * 50)
        # (Removed: a large commented-out selenium/ChromeDriver variant of the
        # Pingxiang scraper that was pasted here; the live selenium version is
        # pingxiang1() below.)
    except Exception as e:
        # NOTE(review): unbounded recursive retry -- may overflow the stack on
        # persistent failures; the log label says Bengbu, not Ganzhou.
        print('蚌埠\t', e)
        return ganzhou()
def pingxiang1():
    # Selenium-based scraper for Pingxiang municipal government news columns.
    # Drives headless Chrome through the ASP.NET paginator, opens each article
    # newer than the cutoff date, extracts its body, and stores articles whose
    # titles are not yet in the database. Logs and restarts itself recursively
    # on any error.
    try:
        url1s=[
            # 'http://www.pingxiang.gov.cn/xw/pxyw/zwyw1/',  # government headlines, 25 pages
            # 'http://www.pingxiang.gov.cn/xw/pxyw/ldyl/',  # leaders' remarks, 16 pages
            # 'http://www.pingxiang.gov.cn/xw/pxyw/zyhy/',  # important meetings, 18 pages
            # 'http://www.pingxiang.gov.cn/xw/pxyw/zyhy_44485/',  # special meetings
            # 'http://www.pingxiang.gov.cn/xw/pxyw/bmdt/',  # departmental news
            # 'http://www.pingxiang.gov.cn/xw/pxyw/xqxw/',  # county/district news
            # 'http://www.pingxiang.gov.cn/xw/pxyw/mrzw/',  # daily government affairs
            'http://www.pingxiang.gov.cn/xw/pxyw/tpxw/',  # photo news
        ]
        for url1 in url1s:
            print("程序已启动,稍等几秒")
            chromeOptions = webdriver.ChromeOptions()
            chromeOptions.add_experimental_option('w3c', False)
            chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
            chromeOptions.add_argument('--headless')  # hide the browser window
            # chromeOptions.add_argument(f'--proxy-server={ipmax()}')
            # NOTE(review): the driver is never quit, leaking one Chrome
            # process per column URL -- consider driver.quit() when done.
            driver = webdriver.Chrome(options=chromeOptions, executable_path='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
            driver.get(url=url1)
            aoo_11 = driver.page_source  # html
            pages=re.findall('总共(\d+)页',aoo_11)
            print(f'共{pages[0]}页')
            for aa in range(1, int(pages[0])):
                # Clear whichever page-number input this layout uses.
                if driver.find_element_by_xpath("//td[@class='font_hei12']/input[@id='CP']"):
                    driver.find_element_by_xpath("//td[@class='font_hei12']/input[@id='CP']").clear()  # clear the page-number box
                else:
                    driver.find_element_by_xpath("//input[@id='ctl00$ContentPlaceHolder1$AspNetPager1_input']").clear()  # clear the page-number box
                driver.find_element_by_xpath("//tr[3]/td[@class='font_hei12']/input[@id='CP']").send_keys(aa)  # type the target page number
                driver.find_element_by_xpath("//tr[3]/td[@class='font_hei12']/input[2]").click()  # click the go button
                aoo_1 = driver.page_source  # html
                html_1 = etree.HTML(aoo_1)
                list_num = html_1.xpath(f"//table/tbody/tr[1]/td[@class='font_hei14']/a")  # detail links
                for i in range(1, len(list_num)+1):  # 20 items per page
                    link = html_1.xpath(f"//table[{i}]/tbody/tr[1]/td[@class='font_hei14']/a/@href")[0].strip()  # detail url
                    title = html_1.xpath(f"//table[{i}]/tbody/tr[1]/td[@class='font_hei14']/a/text()")[0].strip()  # title
                    publicTime = html_1.xpath(f"//tr[2]/td[@class='borderhui']/table[{i}]/tbody/tr[1]/td[@class='font_hui12']/text()")[0].strip().replace('\n','') .replace('[','') .replace(']','') .replace(' ','')  # publish date
                    # tt = int(time.time())
                    s = publicTime.replace('/', '-')
                    t = int(datetime.strptime(s, '%Y-%m-%d').timestamp())
                    # Only process articles dated on/after the cutoff epoch
                    # 1570896000 (2019-10-13).
                    if t >= 1570896000:
                        if re.findall('xinhuan',link):
                            # Xinhua links are absolute already.
                            linkurl=link
                        else:
                            linkurl = url1 + link[1:]  # url
                        driver.find_element_by_xpath(f"//table[{i}]/tbody/tr[1]/td[@class='font_hei14']/a").click()
                        driver.switch_to.window(driver.window_handles[-1])
                        detail_res=driver.page_source
                        Html = etree.HTML(detail_res)
                        # Detail pages come in several layouts; pick the
                        # breadcrumb (div1) and body (div2) for each.
                        if Html.xpath("//table[3]/tbody/tr/td[@class='font_hui12']"):
                            div1 = Html.xpath("//table[3]/tbody/tr/td[@class='font_hui12']")[0]  # current column
                            div2 = Html.xpath("//table[@class='borderhui']/tbody/tr/td")[0]  # text
                        elif Html.xpath("//div/div/div[@class='news-position']"):
                            div1 = Html.xpath("//div/div/div[@class='news-position']")[0]  # current column
                            div2 = Html.xpath("//div/div/div[@id='p-detail']")[0]  # text
                        elif Html.xpath("//div[@class='padd']/div[@class='BreadcrumbNav']"):
                            div1 = Html.xpath("//div[@class='padd']/div[@class='BreadcrumbNav']")[0]  # current column
                            div2 = Html.xpath("//div[@class='article oneColumn pub_border']")[0]  # text
                        else:
                            div1 = Html.xpath("//div[@class='xl-main']/div[@class='container']")[0]  # current column
                            div2 = ''  # text
                        try:
                            infocontent1 = html.unescape(etree.tostring(div1, method='html').decode()).replace("'", " ").replace( '"', ' ')  # html
                            infocontent2 = html.unescape(etree.tostring(div2, method='html').decode()).replace("'", " ").replace( '"', ' ')  # html
                            infocontent=infocontent1+infocontent2
                        except:
                            # div2 may be '' (no body found); fall back to the
                            # breadcrumb container alone.
                            infocontent1 = html.unescape(etree.tostring(div1, method='html').decode()).replace("'", " ").replace( '"', ' ')  # html
                            infocontent=infocontent1
                        # Rewrite relative image sources to absolute URLs.
                        if re.findall('src="(.*?)" oldsrc=',infocontent):
                            infocontent=infocontent.replace('src=.\./',url1+link[1:7]+'/')
                        else:infocontent=infocontent
                        select = Mysql.select_xinwen(title=title)  # check whether the title already exists
                        if len(select)==0:
                            uid = uuid.uuid4()
                            Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                                         publicTime=publicTime, linkurl=linkurl, title=title,
                                                         dataResource='', yewuType='人民政府', infoType='', infoState='',
                                                         isok='', isdeal='')
                            Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
                            print(f'标题【{title}】写入成功')
                        else:
                            print(f'标题【{title}】存在')
                        driver.back()  # return to the list page
                        time.sleep(1)
                print('-' * 50 + f'萍乡第{aa}页已完成' + '-' * 50)
    except Exception as e:
        # NOTE(review): unbounded recursive retry -- may overflow the stack on
        # persistent failures; the log label says Bengbu, not Pingxiang.
        print('蚌埠\t', e)
        return pingxiang1()
def tangshan():
    """Scrape Tangshan municipal government news and insert previously-unseen
    articles into the MySQL store.

    Retries the whole crawl from the start on any error (a loop instead of
    the original unbounded recursion, which could overflow the stack).
    """
    while True:
        try:
            url1s = [
                f'http://www.tangshan.gov.cn/zhuzhan/zhengwuxinwen/index.html',  # government news
                # f'http://www.qhd.gov.cn/front_pcsec.do?tid=BE16A305B662511F9C82516BD16F3C24&p=1',  # departmental news
                # f'http://www.qhd.gov.cn/front_pcsec.do?tid=677638128C3E53D4C629F745917A4CD8&p=1',  # county/district news
            ]
            for url1 in url1s:
                contents1 = requests.get(url1).content.decode('utf-8').replace(
                    '\n', '').replace('\r', '').replace('\t', '')
                # Last-page number from the "尾页" (last page) link.
                pages = int(re.findall('index_(\d+).html">尾页', contents1)[0])
                # Start at page 1 -- the original started at 2 and never
                # scraped the first page even though its page==1 branch
                # existed.
                for page in range(1, pages + 1):
                    if page == 1:
                        url = url1
                    else:
                        url = url1.replace('index.html', f'index_{page}.html')
                    contents2 = requests.get(url).content.decode('utf-8').replace(
                        '\n', '').replace('\r', '').replace('\t', '')
                    # Two patterns for the two list layouts used by the site.
                    contents = [
                        re.findall(
                            '<li><span class="title"><a href="(.*?)" target="_blank" >(.*?)</a></span><span class="date">(.*?)</span><span class="clear"></span></li>',
                            contents2),
                        re.findall(
                            '</span> <a href="(.*?)" target="_blank">(.*?)</a></div><div class="seclisttime fl">(.*?)</div></div>',
                            contents2),
                    ]
                    for matched in contents:
                        # Process every article on the page -- the original
                        # kept only the first match of each pattern.
                        for content in matched:
                            linkurl = f'http://www.tangshan.gov.cn' + content[0].strip()
                            detail_res = requests.get(linkurl).content.decode('utf-8')
                            Html = etree.HTML(detail_res)
                            # Whole detail page stored; quotes squashed to spaces.
                            infocontent = html.unescape(
                                etree.tostring(Html, method='html').decode()).replace(
                                    "'", " ").replace('"', ' ')
                            title = content[1].strip()
                            publicTime = content[2].strip()
                            select = Mysql.select_xinwen(title=title)  # already stored?
                            # `not select` covers both None and an empty list;
                            # the original's `select == None` is always False
                            # for a list return, so nothing was ever inserted.
                            if not select:
                                uid = uuid.uuid4()
                                Mysql.insert_xinwen_baseinfo(uid=uid,
                                                             regionCode='063000',
                                                             regionName='河北省',
                                                             areaRegion='唐山市',
                                                             publicTime=publicTime,
                                                             linkurl=linkurl,
                                                             title=title,
                                                             dataResource='',
                                                             yewuType='',
                                                             infoType='',
                                                             infoState='',
                                                             isok='',
                                                             isdeal='')
                                Mysql.insert_xinwen_detailinfo(
                                    uid=uid, infocontent=infocontent)
                            else:
                                print('标题存在')
            return  # full crawl completed without error
        except Exception as e:
            # Log and restart (label fixed: this is Tangshan, not Bengbu).
            print('唐山\t', e)
def qinhuangdao():
    """Scrape Qinhuangdao municipal news (county/district news column) and
    insert previously-unseen articles into the MySQL store.

    Retries the whole crawl from the start on any error (a loop instead of
    the original unbounded recursion, which could overflow the stack).
    """
    while True:
        try:
            url1s = [
                # f'http://www.qhd.gov.cn/front_pcsec.do?tid=A44A512C86E7FA51FEB2B9B098047A46&p=1',  # local news
                # f'http://www.qhd.gov.cn/front_pcsec.do?tid=BE16A305B662511F9C82516BD16F3C24&p=1',  # departmental news
                f'http://www.qhd.gov.cn/front_pcsec.do?tid=677638128C3E53D4C629F745917A4CD8&p=1',  # county/district news
            ]
            for url1 in url1s:
                contents1 = requests.get(url1).content.decode('utf-8').replace(
                    '\n', '').replace('\r', '').replace('\t', '')
                pages = int(re.findall('共(\d+)页', contents1)[0])
                for page in range(1, pages + 1):
                    url = url1.replace('p=1', f'p={page}')
                    contents2 = requests.get(url).content.decode('utf-8').replace(
                        '\n', '').replace('\r', '').replace('\t', '')
                    # Two patterns for the two list layouts used by the site.
                    contents = [
                        re.findall(
                            'fl"><a href="(.*?)" target="_blank">(.*?)</a></div><div class="seclisttime fl">(.*?)</div></div>',
                            contents2),
                        re.findall(
                            '</span> <a href="(.*?)" target="_blank">(.*?)</a></div><div class="seclisttime fl">(.*?)</div></div>',
                            contents2),
                    ]
                    for matched in contents:
                        # Process every article on the page -- the original
                        # kept only the first match of each pattern.
                        for content in matched:
                            linkurl = f'http://www.qhd.gov.cn/' + content[0].strip()
                            detail_res = requests.get(linkurl).content.decode('utf-8')
                            Html = etree.HTML(detail_res)
                            # Whole detail page stored; quotes squashed to spaces.
                            infocontent = html.unescape(
                                etree.tostring(Html, method='html').decode()).replace(
                                    "'", " ").replace('"', ' ')
                            title = content[1].strip()
                            publicTime = content[2].strip()
                            select = Mysql.select_xinwen(title=title)  # already stored?
                            # `not select` covers both None and an empty list;
                            # the original's `select == None` is always False
                            # for a list return, so nothing was ever inserted.
                            if not select:
                                uid = uuid.uuid4()
                                # Fixed copy-paste data: this is Qinhuangdao
                                # (066000), not Chengde (067000/承德市).
                                Mysql.insert_xinwen_baseinfo(uid=uid,
                                                             regionCode='066000',
                                                             regionName='河北省',
                                                             areaRegion='秦皇岛市',
                                                             publicTime=publicTime,
                                                             linkurl=linkurl,
                                                             title=title,
                                                             dataResource='',
                                                             yewuType='',
                                                             infoType='',
                                                             infoState='',
                                                             isok='',
                                                             isdeal='')
                                Mysql.insert_xinwen_detailinfo(
                                    uid=uid, infocontent=infocontent)
                            else:
                                print('标题存在')
            return  # full crawl completed without error
        except Exception as e:
            # Log and restart (label fixed: this is Qinhuangdao, not Bengbu).
            print('秦皇岛\t', e)