def zhuHai(self):
    # 珠海: render the page with Selenium, then parse the org list from 'gl-con'.
    url = urls[3]
    driver.get(url)
    if WebDriverWait(driver, 10, 1).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "[class='gl-con']"))):
        html = driver.page_source
        soup = bs4(html, 'lxml')
        temp = soup.find('div', class_="gl-con").find_all('li')
        print(len(temp))  # debug: number of entries found
        for i in temp:
            info = []
            name = i.get_text().replace('· ', '')
            if name[-1] == '区':
                # Skip district-level entries.
                continue
            if name[0] == '市':
                # Prefix bare "市..." names with the city name.
                name = '珠海' + name
            info.append(name)
            info.append('广东省')
            info.append('珠海')
            info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
            info.append(url)
            sql = whereNotInsert.main(tablename, t_eventField, info, 'name',
                                      info[0])
            save2db().save2db(sql)
def guangDong(self):
    # 广东省: wait for the 'glbox' container, then parse the link list.
    url = urls[0]
    driver.get(url)
    if WebDriverWait(driver, 10, 1).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "[class='glbox']"))):
        html = driver.page_source
        soup = bs4(html, 'lxml')
        temp = soup.find('div', class_="con LinkList").find_all('li')
        for i in temp:
            info = []
            name = i.get_text()
            info.append(name)
            info.append('广东省')
            info.append('')  # province-level entries carry no city
            info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
            info.append(url)
            sql = whereNotInsert.main(tablename, t_eventField, info, 'name',
                                      info[0])
            save2db().save2db(sql)
def shanTou(self):
    # 汕头: the org list is served as JSONP across three deptType pages.
    for i in range(1, 4):
        url = ('http://www.shantou.gov.cn/u/zzjg?ajax=&name=&deptType='
               + str(i)
               + '&callback=handlerDept&currentPage=1&pageSize=40')
        html = req.get(url, headers=headers).content.decode('utf8')
        # Strip the 'handlerDept(' ... ')' JSONP wrapper, then parse with
        # json.loads (json assumed imported at module top) instead of eval().
        res = json.loads(html[12:-1])
        for i1 in res['data']:
            name = i1['name']
            info = []
            if name[-1] == '区':
                # Skip district-level entries.
                continue
            if name[0] == '市':
                # Prefix bare "市..." names with the city name.
                name = '汕头' + name
            info.append(name)
            info.append('广东省')
            info.append('汕头')
            info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
            info.append(urls[4])
            sql = whereNotInsert.main(tablename, t_eventField, info, 'name',
                                      info[0])
            save2db().save2db(sql)
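# A more robust alternative to the fixed html[12:-1] slice above would be to
# strip any JSONP callback wrapper by pattern before parsing. This is only a
# sketch, not part of the original scraper: strip_jsonp is a hypothetical
# helper, and it assumes the response always has the form callbackName( ... )
# and that re/json are imported at module top.
def strip_jsonp(payload):
    # Capture everything between the first '(' after the callback name
    # and the final ')'; fall back to parsing the payload as plain JSON.
    m = re.match(r'^\s*\w+\((.*)\)\s*;?\s*$', payload, re.S)
    return json.loads(m.group(1)) if m else json.loads(payload)

# Usage sketch: res = strip_jsonp(html) instead of json.loads(html[12:-1]).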
def guangZhou(self):
    # 广州: parse the org table cells from the single-page layout.
    url = urls[1]
    driver.get(url)
    if WebDriverWait(driver, 10, 1).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "[class='main_border SinglePage']"))):
        html = driver.page_source
        soup = bs4(html, 'lxml')
        temp = soup.find('div',
                         class_="main_border SinglePage").find_all('td')
        print(len(temp))  # debug: number of entries found
        for i in temp:
            info = []
            name = i.get_text().replace('· ', '')
            info.append(name)
            info.append('广东省')
            info.append('广州')
            info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
            info.append(url)
            sql = whereNotInsert.main(tablename, t_eventField, info, 'name',
                                      info[0])
            save2db().save2db(sql)
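# The four city scrapers above share the same tail: build an info row, render
# it into a conditional INSERT via whereNotInsert.main, and write it with
# save2db. A sketch of that shared step as one helper; _save_org is
# hypothetical and assumes the module-level tablename / t_eventField and the
# whereNotInsert / save2db signatures already used above.
def _save_org(name, province, city, source_url):
    # Row layout matches the scrapers: name, province, city, date, source.
    info = [name, province, city,
            datetime.datetime.now().strftime('%Y-%m-%d'), source_url]
    sql = whereNotInsert.main(tablename, t_eventField, info, 'name', info[0])
    save2db().save2db(sql)

# Usage sketch: each loop body above reduces to
#     _save_org(name, '广东省', '珠海', url)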
def start_spider(self):
    url = self.get_newsUrl()
    if url is None:
        sys.exit()
    html = req.get(url, headers=headers).content.decode('utf8')
    soup = bs4(html, 'lxml')
    # Candidate section nodes for each page layout.
    section_day = soup.find_all('section', id=re.compile("[0-9]*?"))
    section_monthYear = soup.find_all('section', class_="item_content")
    section_timelineHistory = soup.find_all('section',
                                            class_="tl_content_item")
    pageType = soup.find(class_="listview2").get_text()

    if pageType == '大事记365':
        # Multiple pages: not implemented for this page type.
        if len(self.page_re.findall(html)) > 0:
            pass
        else:
            for section in section_day:
                info = self.get_dayNewsDes(section, soup)
                info.append(url)
                info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
                sql = whereNotInsert.main(tablename, t_eventField, info,
                                          'name', info[0])
                self.save2db(sql)

    if pageType == '月度事件' or pageType == '年度事件':
        # Multiple pages
        if len(self.page_re.findall(html)) > 0:
            page_urls = self.get_newsurlINpage(soup)
            for page_url in page_urls:
                html = req.get(page_url,
                               headers=headers).content.decode('utf8')
                soup = bs4(html, 'lxml')
                section_monthYear = soup.find_all('section',
                                                  class_="item_content")
                # Iterate over each news item
                for section in section_monthYear:
                    info = self.get_monthYearNewsDes(section, soup)
                    info.append(url)
                    info.append(
                        datetime.datetime.now().strftime('%Y-%m-%d'))
                    sql = whereNotInsert.main(tablename, t_eventField, info,
                                              'name', info[0])
                    self.save2db(sql)
                # Mark this page URL as processed.
                self.changeUrlType(page_url)
        else:
            for section in section_monthYear:
                info = self.get_monthYearNewsDes(section, soup)
                info.append(url)
                info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
                sql = whereNotInsert.main(tablename, t_eventField, info,
                                          'name', info[0])
                self.save2db(sql)

    if pageType == '时间轴':
        # Multiple pages
        if len(self.page_re.findall(html)) > 0:
            page_urls = self.get_newsurlINpage(soup)
            for page_url in page_urls:
                html = req.get(page_url,
                               headers=headers).content.decode('utf8')
                soup = bs4(html, 'lxml')
                section_timelineHistory = soup.find_all(
                    'section', class_="tl_content_item")
                # Iterate over each news item
                for section in section_timelineHistory:
                    info = self.get_timelineNewsDes(section, soup)
                    info.append(url)
                    info.append(
                        datetime.datetime.now().strftime('%Y-%m-%d'))
                    sql = whereNotInsert.main(tablename, t_eventField, info,
                                              'name', info[0])
                    self.save2db(sql)
                # Mark this page URL as processed.
                self.changeUrlType(page_url)
        else:
            for section in section_timelineHistory:
                info = self.get_timelineNewsDes(section, soup)
                info.append(url)
                info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
                sql = whereNotInsert.main(tablename, t_eventField, info,
                                          'name', info[0])
                self.save2db(sql)

    if pageType == '历史事件':
        # Multiple pages
        if len(self.page_re.findall(html)) > 0:
            page_urls = self.get_newsurlINpage(soup)
            for page_url in page_urls:
                html = req.get(page_url,
                               headers=headers).content.decode('utf8')
                soup = bs4(html, 'lxml')
                section_timelineHistory = soup.find_all(
                    'section', class_="tl_content_item")
                for section in section_timelineHistory:
                    info = self.get_HistoryNewsDes(section, soup)
                    info.append(url)
                    info.append(
                        datetime.datetime.now().strftime('%Y-%m-%d'))
                    sql = whereNotInsert.main(tablename, t_eventField, info,
                                              'name', info[0])
                    self.save2db(sql)
                # Mark this page URL as processed.
                self.changeUrlType(page_url)
        else:
            for section in section_timelineHistory:
                info = self.get_HistoryNewsDes(section, soup)
                info.append(url)
                info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
                sql = whereNotInsert.main(tablename, t_eventField, info,
                                          'name', info[0])
                self.save2db(sql)

    self.changeUrlType(url)
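# start_spider repeats one shape four times: optionally expand pagination,
# locate the page type's section nodes, extract with the matching get_*Des
# method, then save. A sketch of a dispatch table for that shape; the keys
# are the page-type labels read from 'listview2', and the (selector,
# extractor) pairs mirror the branches above. This is an illustrative
# refactor only, not part of the original spider.
PAGE_HANDLERS = {
    '大事记365': (('section', {'id': re.compile("[0-9]*?")}),
                  'get_dayNewsDes'),
    '月度事件': (('section', {'class_': "item_content"}),
                 'get_monthYearNewsDes'),
    '年度事件': (('section', {'class_': "item_content"}),
                 'get_monthYearNewsDes'),
    '时间轴': (('section', {'class_': "tl_content_item"}),
               'get_timelineNewsDes'),
    '历史事件': (('section', {'class_': "tl_content_item"}),
                 'get_HistoryNewsDes'),
}

# Usage sketch inside start_spider:
#     (tag, attrs), extractor = PAGE_HANDLERS[pageType]
#     for section in soup.find_all(tag, **attrs):
#         info = getattr(self, extractor)(section, soup)
#         ...  # append url/date, build sql, self.save2db(sql)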