def crawl_main_page(self, keyword):
    """Open Sogou search, fill the advanced-search form with *keyword*
    restricted to self.site, sort by time, then crawl the results.

    :param keyword: search phrase to submit
    :return: whatever crawl_search_results() returns
    """
    try:
        self.driver.get(self.url)
    except TimeoutException:
        # Page load too slow; stop loading and work with what rendered.
        self.driver.execute_script('window.stop();')
    try:
        self.wait.until(ec.presence_of_element_located((By.ID, 'query')))
    except TimeoutException:  # was a bare except; only the wait can time out here
        CustomLogging.log_to_file('搜狗搜索打开失败', LogType.ERROR)
    # Advanced settings: hover over the settings menu to reveal the link.
    elem = self.driver.find_element_by_id('settings')
    ActionChains(self.driver).move_to_element(elem).perform()
    time.sleep(1)
    self.driver.find_element_by_id('advanced-search').click()
    self.driver.find_element_by_xpath('//input[@name="q"]').send_keys(keyword)
    self.driver.find_element_by_xpath(
        '//input[@name="sitequery"]').send_keys(self.site)
    # Sort results by time, then submit the advanced search.
    self.driver.find_element_by_xpath(
        '//a[@uigs-id="adv_time-sort"]').click()
    self.driver.find_element_by_xpath(
        '//input[@id="adv-search-btn"]').click()
    return self.crawl_search_results()
def crawl_main_page(self, keyword):
    """Open Sina Finance, close the promo popup if shown, switch the search
    suggestion scope to "新闻" and submit *keyword*.

    :param keyword: search phrase to submit
    """
    try:
        self.driver.set_page_load_timeout(10)
        self.driver.get(self.url)
    except TimeoutException:
        self.driver.execute_script('window.stop();')
    try:
        # Dismiss the promotional popup when present.
        self.driver.find_element_by_class_name('snp-btn-close').click()
    except NoSuchElementException:
        pass
    try:
        self.wait.until(
            ec.presence_of_element_located((By.ID, 'suggest01_input')))
    except TimeoutException:  # was a bare except
        CustomLogging.log_to_file('新浪财经页面打开失败', LogType.ERROR)
    # Native .click() was unreliable here; trigger it through JS instead.
    self.driver.execute_script(
        'document.getElementsByClassName("ds_button")[1].click();')
    self.driver.find_elements_by_class_name('dsl_cont')[
        -1].find_element_by_xpath('.//p[contains(text(), "新闻")]').click()
    self.driver.find_element_by_id('suggest01_input').click()
    self.driver.find_element_by_id('suggest01_input').send_keys(keyword +
                                                                Keys.ENTER)
    self.crawl_search_results()
def parse_html(self, url, html):
    """Extract the article body (div#ctrlfscont) from *html*.

    :param url: source URL, used only for the error log
    :param html: raw page markup
    :return: article text, or None when the container is missing
    """
    soup = BeautifulSoup(html, 'lxml')
    try:
        return soup.find('div', attrs={'id': 'ctrlfscont'}).text
    except Exception:
        CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                  LogType.ERROR)
        return
def crawl_main_page(self, keyword):
    """Open people.com.cn, type *keyword* into the search box and crawl
    the results.

    :param keyword: search phrase to submit
    """
    self.driver.get(self.url)
    try:
        self.wait.until(ec.presence_of_element_located((By.ID, 'keyword')))
    except TimeoutException:  # was a bare except
        CustomLogging.log_to_file('人民网主页打开失败', LogType.ERROR)
    # Locate the box once instead of twice.
    search_box = self.driver.find_element_by_id('keyword')
    search_box.clear()
    search_box.send_keys(keyword + Keys.ENTER)
    self.crawl_search_results()
def parse_html(self, url, html):
    """Extract the article body from a people.com.cn page.

    Tries div#rwb_zw first, then a set of known content class names.

    :param url: source URL, used only for the error log
    :param html: raw page markup
    :return: article text, or None when no known container is found
    """
    bs = BeautifulSoup(html, 'lxml')
    try:
        return bs.find('div', attrs={'id': 'rwb_zw'}).text
    except Exception:
        try:
            # BUG FIX: the original returned the Tag object itself here;
            # callers expect plain text, so take .text like the branch above.
            return bs.find('div', attrs={
                'class': re.compile('(show_text)|(con)|(gray box_text)')
            }).text
        except Exception:  # was a bare except
            CustomLogging.log_to_file(
                '页面解析错误: {0}|{1}'.format(self.name, url), LogType.ERROR)
            return
def parse_html(self, url, html):
    """Extract the article body from a QQ news page.

    :param url: source URL, used only for the error log
    :param html: raw page markup
    :return: article text, or None when the container is missing
    """
    soup = BeautifulSoup(html, 'lxml')
    try:
        node = soup.find(
            'div',
            attrs={'id': re.compile('(Cnt-Main-Article-QQ)|(ArticleCnt)')})
        return node.text
    except Exception:
        CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                  LogType.ERROR)
def download_as_html(url, encoding='utf-8'):
    """Fetch *url* through the module-level session with retries mounted.

    requests falls back to ISO-8859-1 when a response carries no charset
    header; in that case (or when the caller explicitly wants gbk) the
    response encoding is forced to *encoding* before reading .text.

    :param url: page to download
    :param encoding: charset to force when the server did not declare one
    :return: dict with 'text', final 'url' and 'encoding', or '' on failure
    """
    session.mount(url, retry_adr)
    try:
        r = session.get(url)
        if r.encoding != 'ISO-8859-1' or (r.encoding == 'ISO-8859-1'
                                          and encoding == 'gbk'):
            r.encoding = encoding
        return {'text': r.text, 'url': r.url, 'encoding': r.encoding}
    except Exception:  # was a bare except, which also swallowed KeyboardInterrupt
        CustomLogging.log_to_file('download failed: {0}'.format(url),
                                  LogType.ERROR)
        return ''
def parse_html(self, url, html):
    """Extract the article/blog body from *html*.

    :param url: source URL, used only for the error log
    :param html: raw page markup
    :return: article text, or None when no known container matches
    """
    soup = BeautifulSoup(html, 'lxml')
    pattern = re.compile('(ArticleBlogText)|(art_contextBox)|(article)')
    try:
        return soup.find('div', attrs={'class': pattern}).text
    except Exception:
        CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                  LogType.ERROR)
def write_to_excel(self, filepath):
    """Drain the article queue and append the rows as a sheet (named after
    this source) to the workbook at *filepath*.

    When the target workbook is locked by another program (PermissionError
    on save) the data is written to a timestamped copy instead.
    """
    # self.db_handler.close_conn()
    article_list = []
    # Drain the in-memory queue into plain tuples for the DataFrame.
    while not self.q.empty():
        item = self.q.get()
        article_list.append(
            (self.name, item.url, item.title, item.short_description,
             item.publish_date, item.full_content))
    if article_list:
        df = pd.DataFrame(article_list)
        xl_writer = pd.ExcelWriter(filepath, engine='openpyxl')
        if os.path.exists(filepath):
            # Re-open the existing workbook so sheets written by other
            # sources survive this write.
            wb = load_workbook(filepath)
            xl_writer.book = wb
            xl_writer.sheets = dict([(ws.title, ws) for ws in wb.worksheets])
        df.to_excel(xl_writer,
                    index=False,
                    sheet_name=self.name,
                    header=[
                        'source', 'url', 'title', 'short_desc', 'issue_date',
                        'full_content'
                    ])
        try:
            xl_writer.save()
        except PermissionError:
            # Target file is open elsewhere (e.g. in Excel); fall back to a
            # timestamped copy so no data is lost.
            xl_writer_new = pd.ExcelWriter(filepath.replace(
                '.xlsx', '{0}.xlsx'.format(str(int(time.time())))),
                                           engine='openpyxl')
            if os.path.exists(filepath):
                wb = load_workbook(filepath)
                xl_writer_new.book = wb
                xl_writer_new.sheets = dict([(ws.title, ws)
                                             for ws in wb.worksheets])
            df.to_excel(xl_writer_new,
                        index=False,
                        sheet_name=self.name,
                        header=[
                            'source', 'url', 'title', 'short_desc',
                            'issue_date', 'full_content'
                        ])
            xl_writer_new.save()
    else:
        print('找不到{0}年内任何与关键字 "{1}" 相关的内容'.format(self.year_range,
                                                  self.keyword))
        CustomLogging.log_to_file('搜索不到匹配记录', LogType.INFO)
        pass
def crawl_search_results(self):
    """Scroll through yicai.com search results ("load more" style paging),
    filter by date range and keyword, and spawn a download thread per hit.

    :return: the (always empty) search_results list
    """
    search_results = []
    try:
        self.wait.until(ec.presence_of_element_located((By.ID, 'searchlist')))
    except TimeoutException:
        CustomLogging.log_to_file('第一财经网搜索结果页错误', LogType.ERROR)
    exit_flag = 0
    # The page appends ~20 more articles per "load more" click; start_index
    # skips entries already handled on previous iterations.
    start_index = 0
    while True:
        try:
            self.wait.until(
                ec.presence_of_all_elements_located((By.CLASS_NAME, 'f-db')))
        except TimeoutException:
            CustomLogging.log_to_file('文章列表加载失败', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_class_name('f-db')
            for each_article in result_articles[start_index:]:
                item = Entity()
                item.publish_date = \
                    each_article.find_element_by_class_name('author').find_elements_by_tag_name('span')[
                        -1].text
                # Stop paging once an article falls outside the year range.
                if not in_date_range(conv_pub_date(item.publish_date, 'yicai'),
                                     self.year_range):
                    exit_flag = 1  # break out of the for loop
                    break
                item.title = each_article.find_element_by_tag_name('h2').text
                item.short_description = each_article.find_element_by_tag_name('p').text
                # Keyword filter: skip when it appears in neither title nor summary.
                if self.keyword not in item.title and self.keyword not in item.short_description:
                    continue
                item.url = each_article.get_attribute('href')
                threading.Thread(target=self.download_and_save_item,
                                 args=(item,)).start()
            if exit_flag == 1:
                break
        except NoSuchElementException:
            CustomLogging.log_to_file('没有搜索结果', LogType.ERROR)
            pass
        try:
            # Click "加载更多内容" via JS (direct element click was unreliable).
            self.driver.execute_script(
                'document.getElementsByClassName("u-btn")[0].click()')
            time.sleep(2)
            start_index += 20
        except TimeoutException:
            CustomLogging.log_to_file('全部页面加载完成', LogType.INFO)
            break
    return search_results
def crawl_main_page(self, keyword):
    """Open stcn.com, submit *keyword* in the search box and crawl results.

    :param keyword: search phrase to submit
    """
    try:
        self.driver.get(self.url)
    except TimeoutException:
        pass
    try:
        self.wait.until(
            ec.presence_of_element_located((By.ID, 'contentInput_0')))
    except TimeoutException:  # was a bare except
        CustomLogging.log_to_file('证券时报网页面打开失败,不能定位搜索框元素', LogType.ERROR)
    # Locate the box once instead of twice.
    search_box = self.driver.find_element_by_id('contentInput_0')
    search_box.clear()
    search_box.send_keys(keyword + Keys.ENTER)
    self.crawl_search_results()
def crawl_main_page(self, keyword):
    """Open ifeng finance, type *keyword* into the search box and crawl
    the results.

    :param keyword: search phrase to submit
    """
    self.driver.get(self.url)
    try:
        self.wait.until(
            ec.presence_of_element_located((By.CLASS_NAME, 'ifengSS')))
    except TimeoutException:  # was a bare except
        CustomLogging.log_to_file('凤凰财经网页面打开失败', LogType.ERROR)
    # Locate the box once instead of twice.
    search_box = self.driver.find_element_by_class_name('ifengSS')
    search_box.click()
    search_box.send_keys(keyword + Keys.ENTER)
    self.crawl_search_results()
def crawl_main_page(self, keyword):
    """Open yicai.com, type *keyword* into the search box and crawl results.

    :param keyword: search phrase to submit
    """
    try:
        self.driver.get(self.url)
    except TimeoutException:  # was a bare except; the log message is about slow loads
        CustomLogging.log_to_file('加载页面过慢,停止加载,继续下一步操作', LogType.INFO)
        self.driver.execute_script('window.stop()')
    try:
        self.wait.until(ec.presence_of_element_located((By.ID, 'searchkeys')))
    except TimeoutException:  # was a bare except
        CustomLogging.log_to_file('第一财经网页面打开失败', LogType.ERROR)
    # Locate the box once instead of twice.
    search_box = self.driver.find_element_by_id('searchkeys')
    search_box.clear()
    search_box.send_keys(keyword + Keys.ENTER)
    self.crawl_search_results()
def parse_html(self, url, html):
    """Extract the article body from *html*.

    Tries div.g-articl-text first, then a plain <article> element.

    :param url: source URL, used only for the error log
    :param html: raw page markup
    :return: article text, or '' when neither container is found
    """
    bs = BeautifulSoup(html, 'lxml')
    try:
        return bs.find('div', attrs={'class': 'g-articl-text'}).text
    except Exception:
        try:
            return bs.find('article').text
        except Exception:  # was a bare except
            CustomLogging.log_to_file(
                '页面解析错误: {0}|{1}'.format(self.name, url), LogType.ERROR)
            return ''
def parse_html(self, url, html):
    """Strip <script> tags and extract the article body from *html*.

    :param url: source URL, used only for the error log
    :param html: raw page markup
    :return: article text, or None when no known container matches
    """
    bs = BeautifulSoup(html, 'lxml')
    # Remove scripts so .text does not include embedded JS (plain loop
    # instead of a list comprehension used only for its side effects).
    for s in bs('script'):
        s.extract()
    try:
        full_content = bs.find(
            'div',
            attrs={
                'class': re.compile(
                    '(main-text)|(article-con)|(post-detail-text )|(rich_media_content )|(art_main)'
                )
            }).text
        return full_content
    except Exception:
        CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                  LogType.ERROR)
def parse_html(self, url, html):
    """Extract the article body from *html*.

    Tries the article-content/pcb class pattern first, then div#news_main.

    :param url: source URL, used only for the error log
    :param html: raw page markup
    :return: article text, or None when no known container matches
    """
    soup = BeautifulSoup(html, 'lxml')
    try:
        return soup.find('div', attrs={
            'class': re.compile('(article-content)|(pcb)')
        }).text
    except Exception:
        try:
            return soup.find('div', attrs={'id': 'news_main'}).text
        except Exception:
            CustomLogging.log_to_file(
                '页面解析错误: {0}|{1}'.format(self.name, url), LogType.ERROR)
def crawl_search_results(self):
    """Walk every results page of the eastmoney search and spawn one
    download thread per article.

    :return: the (always empty) search_results list
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    while True:
        try:
            self.wait.until(
                ec.presence_of_element_located(
                    (By.CLASS_NAME, 'result-cont')))
        except TimeoutException:
            CustomLogging.log_to_file('东方财富网搜索页面加载失败', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_class_name(
                'result-article')
            for each_article in result_articles:
                item = Entity()
                item.title = each_article.find_element_by_tag_name('a').text
                item.url = each_article.find_element_by_tag_name(
                    'a').get_attribute('href')
                item.short_description = each_article.find_element_by_class_name(
                    'des').text
                item.publish_date = each_article.find_element_by_class_name(
                    'g').text
                # BUG FIX: pass the populated Entity, not the raw WebElement,
                # to the download worker (matches every sibling crawler).
                threading.Thread(target=self.download_and_save_item,
                                 args=(item, )).start()
        except NoSuchElementException:
            print('没有搜索结果')
            break
        try:
            next_page = self.driver.find_element_by_xpath(
                '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
            )
            self.driver.get(next_page.get_attribute('href'))
        except NoSuchElementException:
            print('已经是最后一页')
            break
    return search_results
def crawl_search_results(self):
    """Page through people.com.cn search results, filtering by date range
    and keyword, and spawn a download thread for every matching article."""
    exit_flag = 0
    index = 0  # NOTE(review): never used after initialization
    while True:
        try:
            self.wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'fr')))
        except TimeoutException:
            CustomLogging.log_to_file('人民网搜索结果页面加载失败', LogType.ERROR)
            CustomLogging.log_to_file(traceback.format_exc(), LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath('//div[@class="fr w800"]//ul')
            for each_article in result_articles:
                item = Entity()
                pub_date = each_article.find_elements_by_tag_name('li')[2].text
                # Pull a "YYYY-MM-DD HH:MM:SS" timestamp out of the third <li>.
                item.publish_date = re.search(re.compile(
                    '[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d'),
                    pub_date).group()
                # Stop paging once an article falls outside the year range.
                if not in_date_range(conv_pub_date(item.publish_date, 'peoplecn'), self.year_range):
                    exit_flag = 1  # break out of the for loop
                    break
                item.title = each_article.find_element_by_tag_name('a').text
                item.short_description = each_article.find_elements_by_tag_name('li')[1].text
                # Keyword filter: skip when it appears in neither title nor summary.
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                item.url = each_article.find_element_by_tag_name('a').get_attribute('href')
                threading.Thread(target=self.download_and_save_item, args=(item,)).start()
        except NoSuchElementException:
            break
        # Break out of the while loop as well.
        if exit_flag == 1:
            break
        try:
            next_page = self.driver.find_element_by_xpath(
                '//div[@class="show_nav_bar"]//a[contains(text(), "下一页")]')
            next_page.click()
            time.sleep(2)
        except NoSuchElementException:
            break
def crawl_main_page(self, keyword):
    """Open hexun.com with a 5-second load timeout, submit *keyword* in the
    search box and crawl the results.

    :param keyword: search phrase to submit
    :return: whatever crawl_search_results() returns
    """
    self.driver.set_page_load_timeout(5)
    try:
        self.driver.get(self.url)
    except TimeoutException:
        self.driver.execute_script('window.stop()')
    try:
        self.wait.until(
            ec.presence_of_element_located((By.ID, 'textMessage')))
    except TimeoutException:  # was a bare except
        CustomLogging.log_to_file('和讯财经网打开失败', LogType.ERROR)
    # Locate the box once instead of twice.
    search_box = self.driver.find_element_by_id('textMessage')
    search_box.clear()
    search_box.send_keys(keyword + Keys.ENTER)
    return self.crawl_search_results()
def crawl_search_results(self):
    """Scrape the ce.cn (中国经济网) search results page and spawn a download
    thread per article.

    The original wrapped the body in ``while True`` with no break and no
    pagination step, which loops forever as written; a single pass keeps the
    useful work without the infinite loop.

    :return: the (always empty) search_results list
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    try:
        self.wait.until(
            ec.presence_of_element_located((By.CLASS_NAME, 'result')))
    except TimeoutException:
        CustomLogging.log_to_file('中国经济网搜索结果页面加载失败', LogType.ERROR)
    try:
        result_articles = self.driver.find_elements_by_class_name('res-list')
        for each_article in result_articles:
            item = Entity()
            item.title = each_article.find_element_by_class_name(
                'res-title').text
            item.url = each_article.find_element_by_tag_name(
                'a').get_attribute('href')
            try:
                item.short_description = each_article. \
                    find_element_by_xpath('./div[@class="res-rich so-rich-news clearfix"]//*').text
                item.publish_date = each_article.find_element_by_class_name(
                    'gray').text
                # NOTE(review): rich results are skipped without starting a
                # download thread, as in the original — confirm intended.
                continue
            except NoSuchElementException:
                pass
            try:
                item.short_description = each_article.find_element_by_class_name(
                    'res-desc').text
                item.publish_date = ''
            except NoSuchElementException:
                item.short_description = ''
                item.publish_date = ''
            threading.Thread(target=self.download_and_save_item,
                             args=(item, )).start()
    except NoSuchElementException:
        pass
    return search_results
def crawl_main_page(self, keyword):
    """Open the Beijing AIC (工商局) site with a 10-second load timeout,
    submit *keyword* and crawl the results.

    :param keyword: search phrase to submit
    :return: whatever crawl_search_results() returns
    """
    self.driver.set_page_load_timeout(10)
    try:
        self.driver.get(self.url)
    except TimeoutException:
        self.driver.execute_script('window.stop();')
    try:
        self.wait.until(
            ec.presence_of_element_located((By.ID, 'searchKey')))
    except TimeoutException:  # was a bare except
        CustomLogging.log_to_file('北京工商局网站打开失败', LogType.ERROR)
    # Locate the box once instead of twice.
    search_box = self.driver.find_element_by_id('searchKey')
    search_box.click()
    search_box.send_keys(keyword)
    self.driver.find_element_by_xpath(
        '//span[contains(text(), "我要搜索")]').click()
    return self.crawl_search_results()
def download_and_save_item(self, item):
    """Download the HTML for item.url with per-source encoding handling,
    parse the full article text and queue the item for export.

    download.download_as_html returns '' on failure; indexing that empty
    string with ['text'] raises TypeError, which is why TypeError is the
    exception handled here.

    :param item: 新闻文章对应的实体 (article entity with .url populated)
    """
    print('downloading url: %s' % item.url)
    try:
        if self.name in (
                '人民网',
                '中国证券网',
                '中国经济网',
                '腾讯财经',
                '同花顺',
        ) or (self.name == '千龙网' and ('qndj' in item.url or 'bbs' in item.url)):
            # GBK-encoded sources. ('同花顺' had its own identical elif branch
            # in the original; merged here — same download + encode path.)
            ret = download.download_as_html(item.url, encoding='gbk')
            html = ret['text'].encode('gbk', 'ignore')
        elif self.name in ('每经网', ):
            ret = download.download_as_html(item.url)
            html = ret['text'].encode('utf-8', 'ignore')
        else:
            ret = download.download_as_html(item.url)
            if ret['encoding'] == 'ISO-8859-1':
                # No charset declared by the server; keep the raw bytes.
                html = ret['text'].encode('ISO-8859-1', 'ignore')
            else:
                html = ret['text']
        if html:
            # Use the final (post-redirect) URL for parsing and storage.
            item.url = ret['url']
            item.full_content = self.parse_html(item.url, html)
            # self.db_handler.save_article_info(item, self.case_id, )
            self.q.put(item)
        else:
            return
    except TypeError:
        CustomLogging.log_to_file(traceback.format_exc(), LogType.INFO)
def crawl_main_page(self, keyword):
    """Open Baidu advanced search, restrict results to self.site and 50 per
    page, submit *keyword* and crawl the results.

    :param keyword: search phrase to submit
    :return: whatever crawl_search_results() returns
    """
    try:
        self.driver.get(self.url)
    except TimeoutException:
        self.driver.execute_script('window.stop();')
    try:
        self.wait.until(
            ec.presence_of_element_located(
                (By.XPATH, '//input[@value="百度一下"]')))
    except TimeoutException:  # was a bare except
        CustomLogging.log_to_file('百度搜索打开失败', LogType.ERROR)
    # Advanced-search form: keyword, site restriction, results-per-page.
    self.driver.find_element_by_name('q2').send_keys(keyword)
    self.driver.find_element_by_name('q6').send_keys(self.site)
    self.driver.find_element_by_name('rn').click()
    self.driver.find_element_by_xpath(
        '//select/option[@value="50"]').click()
    self.driver.find_element_by_xpath('//input[@value="百度一下"]').click()
    return self.crawl_search_results()
def crawl_search_results(self):
    """Page through Beijing AIC (工商局) site-search results, filter by date
    range / keyword / duplicate titles, and download each matching essay.

    :return: the (always empty) search_results list
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    exit_flag = 0
    while True:
        try:
            self.wait.until(
                ec.presence_of_all_elements_located(
                    (By.XPATH, '//div[@class="content"]//div')))
        except TimeoutException:
            CustomLogging.log_to_file('搜索结果出错', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath(
                '//div[@class="content"]//div[@class="news"]')
            for each_article in result_articles:
                item = Entity()
                publish_date = each_article.find_element_by_id(
                    'essaypubtime').text
                # Extract "YYYY?MM?DD HH:MM:SS" (any date separator) from the label.
                item.publish_date = re.search(
                    re.compile(
                        '[1-9]\d{3}.(0[1-9]|1[0-2]).(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d'
                    ), publish_date).group()
                # Stop paging once an article falls outside the year range.
                if not in_date_range(
                        conv_pub_date(item.publish_date, 'bjgsj'),
                        self.year_range):
                    exit_flag = 1  # break out of the for loop
                    break
                item.short_description = each_article.find_element_by_id(
                    'essaycontent').text
                item.title = each_article.find_element_by_id(
                    'essaytitlelinks').text
                # Keyword filter: skip when it appears in neither title nor summary.
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                # De-duplicate by title across pages.
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                item.url = each_article.find_element_by_xpath(
                    './/li[@id="essaytitlelinks"]/a').get_attribute("href")
                threading.Thread(target=super().download_and_save_item,
                                 args=(item, )).start()
            if exit_flag == 1:
                break
        except NoSuchElementException:
            CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
            break
        try:
            next_page = self.driver.find_element_by_class_name('next-page')
            self.driver.get(next_page.get_attribute('href'))
        except NoSuchElementException:
            break
    return search_results
def crawl_search_results(self):
    """Page through Sina search results, filtering by date range, keyword and
    duplicate titles; spawns one download thread per matching article.

    :return: the (always empty) search_results list
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    exit_flag = 0
    while True:
        try:
            self.wait.until(
                ec.presence_of_all_elements_located(
                    (By.CLASS_NAME, 'r-info')))
        except TimeoutException:
            # NOTE(review): no break here — an empty/slow page still falls
            # through to the scraping code below; confirm this is intended.
            CustomLogging.log_to_file('搜索结果为空', LogType.ERROR)
        result_articles = self.driver.find_elements_by_class_name('r-info')
        for each_article in result_articles:
            item = Entity()
            try:
                pub_date = each_article.find_element_by_class_name(
                    'fgray_time').text
            except NoSuchElementException:
                continue
            # Extract "YYYY-MM-DD HH:MM:SS" from the grey timestamp label.
            item.publish_date = re.search(
                re.compile(
                    '[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d'
                ), pub_date).group()
            # Stop paging once an article falls outside the year range.
            if not in_date_range(conv_pub_date(item.publish_date, 'sina'),
                                 self.year_range):
                exit_flag = 1  # break out of the for loop
                break
            item.short_description = each_article.find_element_by_class_name(
                'content').text
            item.title = each_article.find_element_by_tag_name('h2').text
            # Keyword filter: skip when it appears in neither title nor summary.
            if self.keyword not in item.short_description and self.keyword not in item.title:
                continue
            # De-duplicate by title across pages.
            if item.title in self.titles:
                continue
            else:
                self.titles.append(item.title)
            item.url = each_article.find_element_by_xpath(
                './/h2/a').get_attribute('href')
            threading.Thread(target=self.download_and_save_item,
                             args=(item, )).start()
        # Break out of the while loop as well.
        if exit_flag == 1:
            break
        try:
            next_page = self.driver.find_element_by_xpath(
                '//div[@class="pagebox"]/a[@title="下一页"]')
            self.driver.get(next_page.get_attribute('href'))
        except NoSuchElementException:
            break
    return search_results
def crawl_search_results(self):
    """Page through cnstock.com search results, filter by date range /
    keyword / duplicate titles, and download each matching article.

    :return: the (always empty) search_results list
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    exit_flag = 0
    while True:
        try:
            self.wait.until(
                ec.presence_of_element_located(
                    (By.CLASS_NAME, 'result-cont')))
        except TimeoutException:
            CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_class_name(
                'result-article')
            for each_article in result_articles:
                item = Entity()
                publish_date = each_article.find_element_by_class_name(
                    'g').text
                # Extract "YYYY-MM-DD HH:MM" from the date label.
                item.publish_date = re.search(
                    re.compile(
                        '[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                    ), publish_date).group()
                # Stop paging once an article falls outside the year range.
                if not in_date_range(
                        conv_pub_date(item.publish_date, 'cnstock'),
                        self.year_range):
                    exit_flag = 1  # break out of the for loop
                    break
                item.short_description = each_article.find_element_by_class_name(
                    'des').text
                item.title = each_article.find_element_by_tag_name(
                    'a').text
                # Keyword filter: skip when it appears in neither title nor summary.
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                # De-duplicate by title across pages.
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                item.url = each_article.find_element_by_tag_name(
                    'a').get_attribute('href')
                threading.Thread(target=super().download_and_save_item,
                                 args=(item, )).start()
            if exit_flag == 1:
                break
        except NoSuchElementException:
            CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
            break
        try:
            next_page = self.driver.find_element_by_xpath(
                '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
            )
            self.driver.get(next_page.get_attribute('href'))
        except NoSuchElementException:
            break
    return search_results
def crawl_search_results(self):
    """Page through Sogou results (both "rb" and "vrwrap" result layouts),
    extract title / summary / date, filter by date range, keyword and
    duplicate titles, and download each match via the real target URL
    decoded from Sogou's snapshot link.

    :return: the (always empty) search_results list
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    exit_flag = 0
    while True:
        try:
            self.wait.until(
                ec.presence_of_element_located((By.CLASS_NAME, 'results')))
        except TimeoutException:
            CustomLogging.log_to_file('每经网搜索结果页错误', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath(
                '//div[@class="results"]/div[@class="rb" or @class="vrwrap"]'
            )
            for each_article in result_articles[1:]:  # first entry is skipped
                item = Entity()
                item.publish_date = format_sougou_date(
                    each_article.find_element_by_xpath(
                        './/cite[contains(@id,"cacheresult_info_")]').text)
                if each_article.get_attribute('class') == 'rb':
                    # Classic "rb" result layout.
                    try:
                        article_cont = each_article.find_element_by_xpath(
                            './/div[contains(@id, "cacheresult_summary_")]'
                        )
                    except NoSuchElementException:
                        continue
                    short_description = article_cont.text
                    # Strip the leading "YYYY年M月D日 -" prefix from the summary.
                    item.short_description = re.sub(
                        re.compile(
                            '[1-9]\d{3}年(0?[1-9]|1[0-2])月(0?[1-9]|[1-2][0-9]|3[0-1])日\s+-'
                        ), '', short_description)
                    item.title = each_article.find_element_by_xpath(
                        './/a[contains(@id, "uigs_")]').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    if item.publish_date == '':
                        # Fall back to the date inside the summary span.
                        try:
                            publish_date = each_article.find_element_by_xpath(
                                './/div[contains(@id, "cacheresult_summary_")]/span'
                            ).text
                            # NOTE(review): the trailing .replace('-', '')
                            # strips the separators just inserted — confirm
                            # conv_pub_date expects "YYYYMMDD" here.
                            item.publish_date = publish_date.replace(
                                '年', '-').replace('月', '-').replace(
                                    '日', '').replace('-', '')
                        except NoSuchElementException:
                            continue
                else:
                    # "vrwrap" (vertical/rich) result layout.
                    item.title = each_article.find_element_by_class_name(
                        'vrTitle').text
                    try:
                        short_description = each_article.find_element_by_class_name(
                            'str_info').text
                    except NoSuchElementException:
                        continue
                    item.short_description = re.sub(
                        re.compile(
                            '[1-9]\d{3}年(0?[1-9]|1[0-2])月(0?[1-9]|[1-2][0-9]|3[0-1])日\s+-'
                        ), '', short_description)
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    if item.publish_date == '':
                        try:
                            publish_date = each_article.find_element_by_class_name(
                                'gray-color').text
                            item.publish_date = publish_date.replace(
                                '-', '').replace('年', '-').replace(
                                    '月', '-').replace('日', '')
                        except NoSuchElementException:
                            continue
                # Stop paging once an article falls outside the year range.
                if not in_date_range(
                        conv_pub_date(item.publish_date, 'sougou'),
                        self.year_range):
                    exit_flag = 1
                    break
                # De-duplicate by title across pages.
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                # The snapshot href carries the real article URL in its
                # second query parameter ("url=...").
                url = each_article.find_element_by_xpath(
                    './/a[contains(@id, "sogou_snapshot_")]'
                ).get_attribute('href')
                item.url = urllib.parse.unquote(
                    url.split('&')[1].replace('url=', ''))
                threading.Thread(target=super().download_and_save_item,
                                 args=(item, )).start()
            if exit_flag == 1:
                break
        except TimeoutException:
            CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
            break
        try:
            next_page = self.driver.find_element_by_id('sogou_next')
            next_page.click()
            time.sleep(2)
        except TimeoutException:
            self.driver.execute_script('window.stop();')
        except NoSuchElementException:
            break
    return search_results
def crawl_search_results(self):
    """Crawl Hexun search results across three tabs — 文章 (articles),
    新闻 (news) and 博客 (blogs) — filtering each by date range and keyword
    and spawning a download thread per match.

    :return: the (always empty) search_results list
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    # --- Hexun articles (文章) tab ---
    try:
        wz_btn = self.driver.find_element_by_xpath(
            '//div[@class="searchRe-top-b"]/a[contains(text(), "文章")]')
        wz_btn.click()
        while True:
            try:
                result_articles = self.driver.find_elements_by_xpath(
                    '//table[@class="stocktab mt6"]//tr')
                for each_article in result_articles[1:]:  # skip the header row
                    item = Entity()
                    item.publish_date = each_article.find_elements_by_tag_name(
                        'td')[3].text
                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'hexun'),
                            self.year_range):
                        continue
                    item.title = each_article.find_elements_by_tag_name(
                        'td')[1].text
                    item.short_description = each_article.find_elements_by_tag_name(
                        'td')[2].text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    item.url = each_article.find_elements_by_tag_name(
                        'td')[1].find_element_by_tag_name(
                            'a').get_attribute('href')
                    threading.Thread(target=self.download_and_save_item,
                                     args=(item, )).start()
            except NoSuchElementException:
                break
            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="hx_paging"]//a[contains(text(), "下一页")]'
                )
                # The link's parent gets class "no_next" on the last page.
                next_page_class = self.driver.find_element_by_xpath(
                    '//div[@class="hx_paging"]//a[contains(text(), "下一页")]/..'
                ).get_attribute('class')
                if next_page_class == 'no_next':
                    break
                next_page.click()
                time.sleep(2)
            except:
                break
    except NoSuchElementException:
        pass
    # --- Hexun news (新闻) tab ---
    news_btn = self.driver.find_element_by_xpath(
        '//div[@id="headLayer"]/a[contains(text(), "新闻")]')
    news_btn.click()
    time.sleep(1)
    while True:
        try:
            self.wait.until(
                ec.presence_of_element_located(
                    (By.CLASS_NAME, 'searchResult')))
        except TimeoutException:
            CustomLogging.log_to_file('和讯财经新闻搜索结果加载失败', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_class_name(
                'newslist-a')
            for each_article in result_articles:
                item = Entity()
                item.publish_date = \
                    each_article.find_element_by_class_name('news-l-t').find_elements_by_tag_name('span')[-1].text
                if not in_date_range(
                        conv_pub_date(item.publish_date, 'hexun_news'),
                        self.year_range):
                    continue
                item.title = each_article.find_element_by_xpath(
                    './/span[@class="breakdiv"]/a').text
                item.short_description = each_article.find_element_by_class_name(
                    'news-l-c').text
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                item.url = each_article.find_element_by_xpath(
                    './/span[@class="breakdiv"]/a').get_attribute('href')
                threading.Thread(target=self.download_and_save_item,
                                 args=(item, )).start()
        except NoSuchElementException:
            break
        try:
            next_page = self.driver.find_element_by_xpath(
                '//div[@class="hx_paging"]//a[contains(text(), "下一页")]')
            next_page_class = self.driver.find_element_by_xpath(
                '//div[@class="hx_paging"]//a[contains(text(), "下一页")]/..'
            ).get_attribute('class')
            if next_page_class == 'no_next':
                break
            next_page.click()
            time.sleep(2)
        except:
            break
    # --- Hexun blogs (博客) tab ---
    news_btn = self.driver.find_element_by_xpath(
        '//div[@class="search-rs-list-ty"]/a[contains(text(), "博客")]')
    news_btn.click()
    self.driver.find_element_by_id('s1_t').click()
    exit_flag = 0
    while True:
        try:
            self.wait.until(
                ec.presence_of_element_located(
                    (By.CLASS_NAME, 'searchResult')))
        except TimeoutException:
            CustomLogging.log_to_file('和讯财经博客搜索结果加载失败', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_class_name(
                'newslist-a')
            for each_article in result_articles:
                item = Entity()
                item.publish_date = \
                    each_article.find_element_by_class_name('news-l-t').find_elements_by_tag_name('span')[
                        -1].text
                # Blogs are time-sorted: stop both loops when out of range.
                if not in_date_range(
                        conv_pub_date(item.publish_date, 'hexun_blog'),
                        self.year_range):
                    exit_flag = 1
                    break
                item.title = each_article.find_element_by_xpath(
                    './/span[@class="breakdiv"]/a').text
                item.short_description = each_article.find_element_by_class_name(
                    'news-l-c').text
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                item.url = each_article.find_element_by_xpath(
                    './/span[@class="breakdiv"]/a').get_attribute('href')
                threading.Thread(target=self.download_and_save_item,
                                 args=(item, )).start()
        except NoSuchElementException:
            break
        if exit_flag == 1:
            break
        try:
            next_page = self.driver.find_element_by_xpath(
                '//div[@class="hx_paging"]//a[contains(text(), "下一页")]')
            next_page_class = self.driver.find_element_by_xpath(
                '//div[@class="hx_paging"]//a[contains(text(), "下一页")]/..'
            ).get_attribute('class')
            if next_page_class == 'no_next':
                break
            next_page.click()
            time.sleep(2)
        except:
            break
    return search_results
def crawl_search_results(self):
    """Page through Baidu results, filter by publish date / keyword /
    duplicate titles, and download every matching article.

    :return: the (always empty) search_results list
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    while True:
        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'container')))
        except TimeoutException:
            CustomLogging.log_to_file('百度搜索结果页错误', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath(
                '//div[@class="result c-container "]')
            for each_article in result_articles:
                item = Entity()
                try:
                    item.publish_date = each_article.find_element_by_xpath(
                        './/span[contains(@class,"newTimeFactor_before_abs")]'
                    ).text.replace('-', '')
                except NoSuchElementException:
                    # No timestamp -> cannot date-filter; skip the result.
                    continue
                try:
                    article_cont = each_article.find_element_by_class_name(
                        'c-abstract')
                except NoSuchElementException:
                    continue
                short_description = article_cont.text
                # Strip the leading "YYYY年M月D日 -" prefix from the abstract.
                item.short_description = re.sub(
                    re.compile(
                        '[1-9]\d{3}年(0?[1-9]|1[0-2])月(0?[1-9]|[1-2][0-9]|3[0-1])日\s+-'
                    ), '', short_description)
                item.title = each_article.find_element_by_class_name(
                    't').text
                # Keyword filter: skip when it appears in neither title nor summary.
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                # Results are not time-sorted, so out-of-range items are
                # skipped (continue) rather than ending the crawl.
                if not in_date_range(
                        conv_pub_date(item.publish_date, 'baidu'),
                        self.year_range):
                    continue
                # De-duplicate by title across pages.
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                item.url = each_article.find_element_by_xpath(
                    './/h3[@class="t"]//a').get_attribute('href')
                threading.Thread(target=super().download_and_save_item,
                                 args=(item, )).start()
        except TimeoutException:
            CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
            break
        try:
            next_page = self.driver.find_element_by_id(
                'page').find_element_by_class_name('n')
            next_page.click()
            time.sleep(2)
        except TimeoutException:
            self.driver.execute_script('window.stop();')
        except NoSuchElementException:
            break
    return search_results
def crawl_search_results(self):
    """Sort STCN results by update time, then page through them filtering by
    date range / keyword / duplicate titles; one download thread per match.

    :return: the (always empty) search_results list
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    try:
        self.wait.until(ec.presence_of_element_located(
            (By.XPATH, '//div[@id="search_result"]//a[contains(text(), "按更新时间排序")]')))
        self.driver.find_element_by_xpath('//div[@id="search_result"]//a[contains(text(), "按更新时间排序")]').click()
    except:
        CustomLogging.log_to_file('证券时报搜索结果页打开失败', LogType.ERROR)
    exit_flag = 0
    page_num = 1
    while True:
        # The site only serves 100 result pages.
        if page_num == 100:
            break
        try:
            self.wait.until(ec.presence_of_element_located((By.ID, 'search_list')))
        except TimeoutException:
            CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath('//div[@id="search_list"]//dl')
            for each_article in result_articles:
                item = Entity()
                item.publish_date = each_article.find_elements_by_tag_name('dd')[1].find_element_by_tag_name(
                    'span').text
                # Stop paging once an article falls outside the year range.
                if not in_date_range(conv_pub_date(item.publish_date, 'STCN'), self.year_range):
                    exit_flag = 1  # break out of the for loop
                    break
                item.short_description = each_article.find_elements_by_tag_name('dd')[0].text
                item.title = each_article.find_element_by_tag_name('a').text
                # Keyword filter: skip when it appears in neither title nor summary.
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                # De-duplicate by title across pages.
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                item.url = each_article.find_element_by_tag_name('a').get_attribute('href')
                threading.Thread(target=self.download_and_save_item, args=(item,)).start()
            # Break out of the while loop as well.
            if exit_flag == 1:
                break
        except TimeoutException:
            CustomLogging.log_to_file('中国证券网搜索结果页加载错误', LogType.ERROR)
        try:
            next_page = self.driver.find_element_by_class_name('next')
            next_page.click()
            page_num += 1
        except NoSuchElementException:
            break
    return search_results