# Imports these methods rely on (inferred from usage; Selenium 3.x API):
from time import sleep

from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import crawlerfun
# startBrowser is a project-local helper; its module is not shown in this excerpt.


def crawl(self):
    print('\n', '-' * 10, 'http://www.ahjinzhai.gov.cn', '-' * 10, '\n')
    self.total = self.i = 0
    url = 'http://www.ahjinzhai.gov.cn/luan/site/tpl/2951?organId=6626851'
    try:
        self.browser.get(url)
    except TimeoutException:
        return -1
    newsList = self.browser.find_elements_by_css_selector('div > ul > li')
    for item in newsList:
        dateTime = item.find_element_by_css_selector('li.date').text
        if dateTime in self.date:
            self.extract(item)
        else:
            break  # the list is newest-first; stop at the first stale item
    print('quantity:', self.total)
    if self.total > 0:
        crawlerfun.renameNew()
        crawlerfun.expire(self.date, self.d, self.projectName)
        return 'complete', self.source, 'ok'
    else:
        return 'complete', 'none', 'ok'
def doCrawl(self, url):
    self.i = 0
    try:
        self.browser.get(url)
    except TimeoutException:
        return -1
    while True:
        newsList = self.browser.find_elements_by_css_selector('div > ul > li')
        for item in newsList:
            dateTime = item.find_element_by_css_selector('li.date').text
            if dateTime in self.date:
                self.extract(item)
            else:
                break  # the list is newest-first; stop at the first stale item
        if self.i < len(newsList):
            # Fewer items extracted than listed: we hit a stale or
            # duplicate item, so stop paginating.
            break
        else:
            try:
                # '下一页' is the "next page" link.
                self.browser.find_element_by_partial_link_text('下一页').click()
                self.i = 0  # reset the per-page counter
            except NoSuchElementException:
                break  # no more pages
    if self.total > 0:
        crawlerfun.renameNew()
        crawlerfun.expire(self.date, self.d, self.projectName)
        return self.total
    else:
        return 0
def doCrawl(self, key, account):
    self.i = 0
    try:
        sleep(1)
        url = ('https://weixin.sogou.com/weixin?type=1&query=' + account
               + '&ie=utf8&s_from=input&_sug_=y&_sug_type_=')
        self.browser.get(url)
        if 'antispider' in self.browser.current_url:
            # Redirected to Sogou's anti-spider page: refresh once,
            # and restart the browser if we are still blocked.
            self.browser.refresh()
            sleep(5)
            if 'antispider' in self.browser.current_url:
                self.browser.quit()
                self.browser = startBrowser()
                self.browser.get(url)
        print('\n' + key + ': ' + account)
    except TimeoutException:
        return
    while True:
        newsList = self.browser.find_elements_by_css_selector(
            'div.news-box > ul.news-list2 > li')
        for item in newsList:
            try:
                dateTime = item.find_element_by_css_selector(
                    'dl:last-child > dd > span').text
            except NoSuchElementException:
                continue
            # Keep items stamped with a relative time ('前', "ago") unless
            # it is in whole days ('天前', "days ago"), i.e. only
            # articles published today.
            if '前' in dateTime and '天前' not in dateTime:
                self.extract(item, account)
            else:
                continue
        if self.pageNum > 0:
            try:
                self.browser.find_element_by_partial_link_text('下一页').click()
            except NoSuchElementException:
                break  # no more pages
        elif self.pageNum == 0:
            break
    if self.i > 0:
        crawlerfun.renameNew()
        crawlerfun.expire(self.date, self.d, self.projectName)
    return self.browser
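# startBrowser() above replaces a WebDriver that Sogou has blocked. Its
# definition is not part of this excerpt; the following is a minimal
# sketch only, assuming a local ChromeDriver and the Selenium 3.x API
# used elsewhere in this module.
def startBrowser():
    from selenium import webdriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')      # run without a visible window
    options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(options=options)
    browser.set_page_load_timeout(30)       # so .get() can raise TimeoutException
    return browser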
def extract(self, item, account):
    titleInfo = item.find_element_by_css_selector('dd > a')
    title = titleInfo.text
    tag = title + '|' + account
    try:
        # href = titleInfo.get_attribute('href')
        md5 = crawlerfun.makeMD5(tag)
        link = ''
        # dict filter: skip articles we have already seen
        if md5 in self.d:
            return
        else:
            self.d[md5] = self.date.split(' ')[0]  # record the article in the dict
        self.i += 1
        handle = self.browser.current_window_handle  # handle of the current page
        titleInfo.click()
        # switch tab window
        WebDriverWait(self.browser, 10).until(EC.number_of_windows_to_be(2))
        handles = self.browser.window_handles
        for newHandle in handles:
            if newHandle != handle:
                self.browser.switch_to.window(newHandle)  # switch to the new tab
                sleep(2)  # wait a couple of seconds
                self.source = self.getPageText()  # grab the page source
                link = self.browser.current_url  # URL of the current page
                # self.bottomNews(self.browser, handle)  # the 3 items at the bottom
                self.browser.close()  # close the current tab
                self.browser.switch_to.window(handle)  # switch back to the previous tab
                break
        self.write_new_file(link, title, self.source, self.i, self.date, 1152937)
    except Exception as e:
        print('extract exception:', e)
        try:
            self.browser.refresh()
        except Exception as e:
            print('after refresh error: ', e, '-' * 10)
        self.i -= 1
        crawlerfun.renameNew()
        crawlerfun.expire(self.date, self.d, self.projectName)
        raise  # re-raise the original exception for the caller
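# crawlerfun.makeMD5() above produces the dedup key stored in self.d. It
# is defined outside this excerpt; a minimal sketch, assuming it returns
# the hex MD5 digest of a UTF-8 string:
import hashlib

def makeMD5(text):
    # Hex digest of the MD5 hash of the UTF-8 encoded input.
    return hashlib.md5(text.encode('utf-8')).hexdigest()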
def doCrawl(self, key, account):
    print('\nkey: ', key, '| account: ', account)
    self.i = 0
    try:
        url = ('https://weixin.sogou.com/weixin?type=1&query=' + account
               + '&ie=utf8&s_from=input&_sug_=y&_sug_type_=')
        self.browser.get(url)
        # sleep(1)
        WebDriverWait(self.browser, 20).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.news-box > ul.news-list2 > li')))
    except TimeoutException:
        return -1
    while True:
        newsList = self.browser.find_elements_by_css_selector(
            'div.news-box > ul.news-list2 > li')
        for item in newsList:
            try:
                dateTime = item.find_element_by_css_selector(
                    'dl:last-child > dd > span').text
            except NoSuchElementException:
                continue
            if '前' in dateTime and '天前' not in dateTime:
                self.extract(item, account)
            else:
                continue
        try:
            # Click the '下一页' ("next page") link.
            self.browser.find_element_by_partial_link_text('下一页').click()
        except NoSuchElementException:
            break
    if self.total > 0:
        crawlerfun.renameNew()
        crawlerfun.expire(self.date, self.d, self.projectName)
        return self.total
    else:
        return 0
def extractSingle(self, item, firstHandle):
    titleInfo = item.find_element_by_css_selector(
        'div > div.weui_ellipsis_mod_inner')
    title = titleInfo.text
    try:
        # href = item.get_attribute('data-url')
        md5 = crawlerfun.makeMD5(title)
        link = ''
        # dict filter: skip articles we have already seen
        if md5 in self.d:
            return
        else:
            self.d[md5] = self.date.split(' ')[0]  # record the article in the dict
        self.i += 1
        handle = self.browser.current_window_handle  # handle of the current page
        titleInfo.click()
        # switch tab window
        WebDriverWait(self.browser, 10).until(EC.number_of_windows_to_be(3))
        handles = self.browser.window_handles
        for newHandle in handles:
            if newHandle != handle and newHandle != firstHandle:
                self.browser.switch_to.window(newHandle)  # switch to the new tab
                sleep(2)  # wait a couple of seconds
                self.source = self.getPageText()  # grab the page source
                link = self.browser.current_url  # URL of the current page
                self.browser.close()  # close the current tab
                self.browser.switch_to.window(handle)  # switch back to the previous tab
                break
        self.write_new_file(link, title, self.source, self.i, self.date, 1152937)
    except Exception as e:
        print('single error:', e, self.date)
        self.i -= 1
        crawlerfun.renameNew()
        crawlerfun.expire(self.date, self.d, self.projectName)
        return
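# getPageText() is called after switching to the article tab in both
# extract() and extractSingle(). It is not shown in this excerpt; a
# minimal sketch, assuming it simply returns the rendered page source:
def getPageText(self):
    return self.browser.page_source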