def crawl_search_results(self):
    """Crawl yicai.com search results and dispatch matching articles for download.

    Scrolls through the infinite-scroll result list ('f-db' cards, 20 per
    batch), filters each card by publish-date range and keyword, and starts a
    background thread per matching article. Returns the (unused) empty
    ``search_results`` list for interface consistency with sibling crawlers.

    Fix: the original wrapped the "load more" ``execute_script`` click in
    ``except TimeoutException`` — an exception ``execute_script`` never raises —
    so when the button disappeared a JavaScript error propagated uncaught and
    the clean "全部页面加载完成" exit was unreachable. We now locate the button
    first and treat its absence (or a wait timeout) as end-of-results.
    """
    search_results = []
    try:
        self.wait.until(ec.presence_of_element_located((By.ID, 'searchlist')))
    except TimeoutException:
        CustomLogging.log_to_file('第一财经网搜索结果页错误', LogType.ERROR)
    exit_flag = 0
    # Results are appended in place; only process cards added since last batch.
    start_index = 0
    while True:
        try:
            self.wait.until(ec.presence_of_all_elements_located((By.CLASS_NAME, 'f-db')))
        except TimeoutException:
            CustomLogging.log_to_file('文章列表加载失败', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_class_name('f-db')
            for each_article in result_articles[start_index:]:
                item = Entity()
                # Last <span> under .author holds the publish date.
                item.publish_date = \
                    each_article.find_element_by_class_name('author').find_elements_by_tag_name('span')[-1].text
                if not in_date_range(conv_pub_date(item.publish_date, 'yicai'), self.year_range):
                    exit_flag = 1
                    # Results are date-ordered: stop scanning this batch.
                    break
                item.title = each_article.find_element_by_tag_name('h2').text
                item.short_description = each_article.find_element_by_tag_name('p').text
                # Skip articles that mention the keyword in neither title nor summary.
                if self.keyword not in item.title and self.keyword not in item.short_description:
                    continue
                item.url = each_article.get_attribute('href')
                threading.Thread(target=self.download_and_save_item, args=(item,)).start()
            if exit_flag == 1:
                break
        except NoSuchElementException:
            CustomLogging.log_to_file('没有搜索结果', LogType.ERROR)
        try:
            # Locate the "load more" button explicitly so its absence is
            # detectable; click via JS because a native click is unreliable here.
            load_more = self.driver.find_element_by_class_name('u-btn')
            self.driver.execute_script('arguments[0].click();', load_more)
            time.sleep(2)
            start_index += 20  # each click appends one 20-item batch
        except (NoSuchElementException, TimeoutException):
            CustomLogging.log_to_file('全部页面加载完成', LogType.INFO)
            break
    return search_results
def crawl_search_results(self):
    """Crawl people.com.cn search results page by page.

    For each result `<ul>` block: extract the publish timestamp with a regex,
    stop once results fall outside ``self.year_range`` (results are
    date-ordered), filter by keyword, and spawn a download thread per match.
    Pagination follows the "下一页" link until it disappears.

    Fixes: regex pattern is now a raw string (the original used ``'\\d'``-style
    escapes in a plain string, which raises DeprecationWarning / SyntaxWarning
    on modern Python) and is compiled once instead of per article; removed the
    unused local ``index``.
    """
    # 'YYYY-MM-DD HH:MM:SS' with validated month/day/time fields.
    datetime_re = re.compile(
        r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d')
    exit_flag = 0
    while True:
        try:
            self.wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'fr')))
        except TimeoutException:
            CustomLogging.log_to_file('人民网搜索结果页面加载失败', LogType.ERROR)
            CustomLogging.log_to_file(traceback.format_exc(), LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath('//div[@class="fr w800"]//ul')
            for each_article in result_articles:
                item = Entity()
                # Third <li> carries source + timestamp; pull just the timestamp.
                pub_date = each_article.find_elements_by_tag_name('li')[2].text
                item.publish_date = datetime_re.search(pub_date).group()
                if not in_date_range(conv_pub_date(item.publish_date, 'peoplecn'), self.year_range):
                    exit_flag = 1
                    # Date-ordered results: everything after is older.
                    break
                item.title = each_article.find_element_by_tag_name('a').text
                item.short_description = each_article.find_elements_by_tag_name('li')[1].text
                # Keyword must appear in title or summary.
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                item.url = each_article.find_element_by_tag_name('a').get_attribute('href')
                threading.Thread(target=self.download_and_save_item, args=(item,)).start()
        except NoSuchElementException:
            break
        if exit_flag == 1:
            break
        try:
            next_page = self.driver.find_element_by_xpath(
                '//div[@class="show_nav_bar"]//a[contains(text(), "下一页")]')
            next_page.click()
            time.sleep(2)
        except NoSuchElementException:
            # No "next page" link: we are on the last page.
            break
def crawl_search_results(self):
    """Crawl sina.com.cn news search results.

    Iterates 'r-info' result cards, parses the publish timestamp from
    '.fgray_time', stops when results leave ``self.year_range`` (date-ordered
    listing), deduplicates by title via ``self.titles``, filters by keyword,
    and downloads matches on background threads. Pagination navigates to the
    "下一页" link's href until the link disappears.

    Fixes: the timestamp regex is now a raw string (plain-string ``\\d``
    escapes warn on modern Python) and compiled once outside the loops instead
    of once per article.
    """
    # 'YYYY-MM-DD HH:MM:SS' with validated month/day/time fields.
    datetime_re = re.compile(
        r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d')
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    exit_flag = 0
    while True:
        try:
            self.wait.until(ec.presence_of_all_elements_located((By.CLASS_NAME, 'r-info')))
        except TimeoutException:
            # Intentionally no break: fall through and let the empty result
            # list / missing next-page link end the loop.
            CustomLogging.log_to_file('搜索结果为空', LogType.ERROR)
        result_articles = self.driver.find_elements_by_class_name('r-info')
        for each_article in result_articles:
            item = Entity()
            try:
                pub_date = each_article.find_element_by_class_name('fgray_time').text
            except NoSuchElementException:
                continue
            item.publish_date = datetime_re.search(pub_date).group()
            # Stop once articles fall outside the requested year range.
            if not in_date_range(conv_pub_date(item.publish_date, 'sina'), self.year_range):
                exit_flag = 1
                break
            item.short_description = each_article.find_element_by_class_name('content').text
            item.title = each_article.find_element_by_tag_name('h2').text
            # Keyword must appear in title or summary.
            if self.keyword not in item.short_description and self.keyword not in item.title:
                continue
            # Skip titles already seen (cross-page dedup).
            if item.title in self.titles:
                continue
            else:
                self.titles.append(item.title)
            item.url = each_article.find_element_by_xpath('.//h2/a').get_attribute('href')
            threading.Thread(target=self.download_and_save_item, args=(item,)).start()
        if exit_flag == 1:
            break
        try:
            next_page = self.driver.find_element_by_xpath('//div[@class="pagebox"]/a[@title="下一页"]')
            # Navigate directly to the href; clicking is unreliable here.
            self.driver.get(next_page.get_attribute('href'))
        except NoSuchElementException:
            # No "next page" link: last page reached.
            break
    return search_results
def crawl_search_results(self):
    """Crawl cnstock.com (中国证券网) search results.

    Walks 'result-article' cards, parses 'YYYY-MM-DD HH:MM' from the '.g'
    footer, stops when articles leave ``self.year_range`` (date-ordered),
    deduplicates by title, filters by keyword, and downloads matches on
    background threads via the superclass handler. Follows the "下一页" href
    until it disappears.

    Fixes: regex pattern is now a raw string (plain-string ``\\d`` escapes
    warn on modern Python) and compiled once instead of per article.
    """
    # 'YYYY-MM-DD HH:MM' (no seconds on this site).
    datetime_re = re.compile(
        r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d')
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    exit_flag = 0
    while True:
        try:
            self.wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'result-cont')))
        except TimeoutException:
            CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_class_name('result-article')
            for each_article in result_articles:
                item = Entity()
                publish_date = each_article.find_element_by_class_name('g').text
                item.publish_date = datetime_re.search(publish_date).group()
                if not in_date_range(conv_pub_date(item.publish_date, 'cnstock'), self.year_range):
                    exit_flag = 1
                    # Date-ordered results: stop here.
                    break
                item.short_description = each_article.find_element_by_class_name('des').text
                item.title = each_article.find_element_by_tag_name('a').text
                # Keyword must appear in title or summary.
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                # Cross-page title dedup.
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                item.url = each_article.find_element_by_tag_name('a').get_attribute('href')
                threading.Thread(target=super().download_and_save_item, args=(item,)).start()
            if exit_flag == 1:
                break
        except NoSuchElementException:
            CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
            break
        try:
            next_page = self.driver.find_element_by_xpath(
                '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]')
            # Navigate via href; a native click is unreliable here.
            self.driver.get(next_page.get_attribute('href'))
        except NoSuchElementException:
            break
    return search_results
def _goto_next_hexun_page(self):
    """Advance the hexun paging widget ('.hx_paging') by one page.

    Returns True after clicking "下一页"; False when the link's parent carries
    the 'no_next' class (last page), the widget is missing, or the click fails
    for any other reason — the caller should then stop paging.
    """
    try:
        next_page = self.driver.find_element_by_xpath(
            '//div[@class="hx_paging"]//a[contains(text(), "下一页")]')
        next_page_class = self.driver.find_element_by_xpath(
            '//div[@class="hx_paging"]//a[contains(text(), "下一页")]/..').get_attribute('class')
        if next_page_class == 'no_next':
            return False
        next_page.click()
        time.sleep(2)
        return True
    except Exception:
        # Originally a bare `except:`; narrowed so Ctrl-C / SystemExit still
        # propagate. Any widget failure simply ends pagination.
        return False

def crawl_search_results(self):
    """Crawl hexun.com search results across its three tabs: 文章 (articles),
    新闻 (news) and 博客 (blogs).

    Each tab is paged via the shared '.hx_paging' widget (extracted into
    ``_goto_next_hexun_page``, which replaces three copy-pasted pagination
    blocks whose bare ``except:`` clauses also swallowed KeyboardInterrupt).
    Articles/news skip out-of-range dates per item; the blog tab, being
    date-ordered, stops entirely at the first out-of-range item. Matching
    items are downloaded on background threads. Returns the (unused) empty
    ``search_results`` list for interface consistency.
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    # --- hexun articles tab (和讯文章) ---
    try:
        wz_btn = self.driver.find_element_by_xpath(
            '//div[@class="searchRe-top-b"]/a[contains(text(), "文章")]')
        wz_btn.click()
        while True:
            try:
                result_articles = self.driver.find_elements_by_xpath(
                    '//table[@class="stocktab mt6"]//tr')
                # First row is the table header.
                for each_article in result_articles[1:]:
                    item = Entity()
                    item.publish_date = each_article.find_elements_by_tag_name('td')[3].text
                    if not in_date_range(conv_pub_date(item.publish_date, 'hexun'), self.year_range):
                        continue
                    item.title = each_article.find_elements_by_tag_name('td')[1].text
                    item.short_description = each_article.find_elements_by_tag_name('td')[2].text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    item.url = each_article.find_elements_by_tag_name('td')[1] \
                        .find_element_by_tag_name('a').get_attribute('href')
                    threading.Thread(target=self.download_and_save_item, args=(item,)).start()
            except NoSuchElementException:
                break
            if not self._goto_next_hexun_page():
                break
    except NoSuchElementException:
        # Articles tab missing entirely — move on to news.
        pass
    # --- hexun news tab (和讯新闻) ---
    news_btn = self.driver.find_element_by_xpath(
        '//div[@id="headLayer"]/a[contains(text(), "新闻")]')
    news_btn.click()
    time.sleep(1)
    while True:
        try:
            self.wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'searchResult')))
        except TimeoutException:
            CustomLogging.log_to_file('和讯财经新闻搜索结果加载失败', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_class_name('newslist-a')
            for each_article in result_articles:
                item = Entity()
                # Last <span> under .news-l-t holds the publish date.
                item.publish_date = \
                    each_article.find_element_by_class_name('news-l-t').find_elements_by_tag_name('span')[-1].text
                if not in_date_range(conv_pub_date(item.publish_date, 'hexun_news'), self.year_range):
                    continue
                item.title = each_article.find_element_by_xpath('.//span[@class="breakdiv"]/a').text
                item.short_description = each_article.find_element_by_class_name('news-l-c').text
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                item.url = each_article.find_element_by_xpath(
                    './/span[@class="breakdiv"]/a').get_attribute('href')
                threading.Thread(target=self.download_and_save_item, args=(item,)).start()
        except NoSuchElementException:
            break
        if not self._goto_next_hexun_page():
            break
    # --- hexun blog tab (和讯博客) ---
    news_btn = self.driver.find_element_by_xpath(
        '//div[@class="search-rs-list-ty"]/a[contains(text(), "博客")]')
    news_btn.click()
    # 's1_t' toggles blog-post (as opposed to blogger) results — TODO confirm.
    self.driver.find_element_by_id('s1_t').click()
    exit_flag = 0
    while True:
        try:
            self.wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'searchResult')))
        except TimeoutException:
            CustomLogging.log_to_file('和讯财经博客搜索结果加载失败', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_class_name('newslist-a')
            for each_article in result_articles:
                item = Entity()
                item.publish_date = \
                    each_article.find_element_by_class_name('news-l-t').find_elements_by_tag_name('span')[-1].text
                if not in_date_range(conv_pub_date(item.publish_date, 'hexun_blog'), self.year_range):
                    # Blog results are date-ordered: stop paging entirely.
                    exit_flag = 1
                    break
                item.title = each_article.find_element_by_xpath('.//span[@class="breakdiv"]/a').text
                item.short_description = each_article.find_element_by_class_name('news-l-c').text
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                item.url = each_article.find_element_by_xpath(
                    './/span[@class="breakdiv"]/a').get_attribute('href')
                threading.Thread(target=self.download_and_save_item, args=(item,)).start()
        except NoSuchElementException:
            break
        if exit_flag == 1:
            break
        if not self._goto_next_hexun_page():
            break
    return search_results
def crawl_search_results(self):
    """Crawl stcn.com (证券时报) search results, newest first.

    First clicks "按更新时间排序" so results are date-ordered, then walks up to
    100 pages (the site shows no more) of '#search_list' entries: stops at the
    first out-of-range date, deduplicates by title, filters by keyword, and
    downloads matches on background threads.

    Fix: the initial sort-click was guarded by a bare ``except:``, which also
    swallowed KeyboardInterrupt/SystemExit; narrowed to ``except Exception``.
    NOTE(review): the later log messages say '中国证券网' although this crawler
    targets 证券时报 — looks like a copy-paste slip; left unchanged here.
    """
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    try:
        self.wait.until(ec.presence_of_element_located(
            (By.XPATH, '//div[@id="search_result"]//a[contains(text(), "按更新时间排序")]')))
        self.driver.find_element_by_xpath(
            '//div[@id="search_result"]//a[contains(text(), "按更新时间排序")]').click()
    except Exception:
        CustomLogging.log_to_file('证券时报搜索结果页打开失败', LogType.ERROR)
    exit_flag = 0
    page_num = 1
    while True:
        # The site only serves 100 result pages.
        if page_num == 100:
            break
        try:
            self.wait.until(ec.presence_of_element_located((By.ID, 'search_list')))
        except TimeoutException:
            CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath('//div[@id="search_list"]//dl')
            for each_article in result_articles:
                item = Entity()
                # Second <dd>'s <span> carries the publish date.
                item.publish_date = each_article.find_elements_by_tag_name('dd')[1] \
                    .find_element_by_tag_name('span').text
                # Date-ordered: first out-of-range article ends the crawl.
                if not in_date_range(conv_pub_date(item.publish_date, 'STCN'), self.year_range):
                    exit_flag = 1
                    break
                item.short_description = each_article.find_elements_by_tag_name('dd')[0].text
                item.title = each_article.find_element_by_tag_name('a').text
                # Keyword must appear in title or summary.
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                # Cross-page title dedup.
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                item.url = each_article.find_element_by_tag_name('a').get_attribute('href')
                threading.Thread(target=self.download_and_save_item, args=(item,)).start()
            if exit_flag == 1:
                break
        except TimeoutException:
            CustomLogging.log_to_file('中国证券网搜索结果页加载错误', LogType.ERROR)
        try:
            next_page = self.driver.find_element_by_class_name('next')
            next_page.click()
            page_num += 1
        except NoSuchElementException:
            # No "next" control: last page reached.
            break
    return search_results
def crawl_search_results(self):
    """Crawl Baidu web-search results.

    Iterates 'result c-container' hits, takes the publish date from the
    time-factor span (digits only, '-' stripped), strips the leading
    'YYYY年MM月DD日 -' prefix from the abstract, filters by keyword and date
    range (per item — Baidu results are relevance-ordered, so no early stop),
    deduplicates by title, and downloads matches on background threads via
    the superclass handler.

    Fixes: the date-prefix regex is now a raw string (plain-string ``\\d``
    escapes warn on modern Python) and compiled once instead of per article.
    """
    # Leading 'YYYY年M月D日  -' prefix that Baidu prepends to abstracts.
    date_prefix_re = re.compile(
        r'[1-9]\d{3}年(0?[1-9]|1[0-2])月(0?[1-9]|[1-2][0-9]|3[0-1])日\s+-')
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    while True:
        try:
            self.wait.until(ec.presence_of_element_located((By.ID, 'container')))
        except TimeoutException:
            CustomLogging.log_to_file('百度搜索结果页错误', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath(
                '//div[@class="result c-container "]')
            for each_article in result_articles:
                item = Entity()
                try:
                    item.publish_date = each_article.find_element_by_xpath(
                        './/span[contains(@class,"newTimeFactor_before_abs")]'
                    ).text.replace('-', '')
                except NoSuchElementException:
                    # No timestamp: cannot date-filter, skip the hit.
                    continue
                try:
                    article_cont = each_article.find_element_by_class_name('c-abstract')
                except NoSuchElementException:
                    continue
                short_description = article_cont.text
                item.short_description = date_prefix_re.sub('', short_description)
                item.title = each_article.find_element_by_class_name('t').text
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                # Relevance-ordered results: filter (don't stop) on date.
                if not in_date_range(conv_pub_date(item.publish_date, 'baidu'), self.year_range):
                    continue
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                item.url = each_article.find_element_by_xpath(
                    './/h3[@class="t"]//a').get_attribute('href')
                threading.Thread(target=super().download_and_save_item, args=(item,)).start()
        except TimeoutException:
            CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
            break
        try:
            next_page = self.driver.find_element_by_id('page').find_element_by_class_name('n')
            next_page.click()
            time.sleep(2)
        except TimeoutException:
            # Page-load timeout: abort the load and retry the loop.
            self.driver.execute_script('window.stop();')
        except NoSuchElementException:
            break
    return search_results
def crawl_search_results(self):
    """Crawl Sogou web-search results (two card layouts: 'rb' and 'vrwrap').

    For each card: get the publish date via ``format_sougou_date`` from the
    cached-result cite, with layout-specific fallbacks when that is empty;
    strip the 'YYYY年MM月DD日 -' prefix from the summary; filter by keyword;
    stop at the first out-of-range date; deduplicate by title; recover the
    real target URL from the snapshot link's ``url=`` query parameter; and
    download matches on background threads via the superclass handler.

    Fixes: the date-prefix regex is now a raw string (plain-string ``\\d``
    escapes warn on modern Python) and compiled once instead of separately,
    per article, in both layout branches.
    """
    # Leading 'YYYY年M月D日  -' prefix Sogou prepends to summaries.
    date_prefix_re = re.compile(
        r'[1-9]\d{3}年(0?[1-9]|1[0-2])月(0?[1-9]|[1-2][0-9]|3[0-1])日\s+-')
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    exit_flag = 0
    while True:
        try:
            self.wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'results')))
        except TimeoutException:
            CustomLogging.log_to_file('每经网搜索结果页错误', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath(
                '//div[@class="results"]/div[@class="rb" or @class="vrwrap"]')
            # First entry is skipped (not a real result card).
            for each_article in result_articles[1:]:
                item = Entity()
                item.publish_date = format_sougou_date(
                    each_article.find_element_by_xpath(
                        './/cite[contains(@id,"cacheresult_info_")]').text)
                if each_article.get_attribute('class') == 'rb':
                    # Classic result layout.
                    try:
                        article_cont = each_article.find_element_by_xpath(
                            './/div[contains(@id, "cacheresult_summary_")]')
                    except NoSuchElementException:
                        continue
                    short_description = article_cont.text
                    item.short_description = date_prefix_re.sub('', short_description)
                    item.title = each_article.find_element_by_xpath(
                        './/a[contains(@id, "uigs_")]').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    if item.publish_date == '':
                        # Fallback: date from the summary's leading <span>.
                        try:
                            publish_date = each_article.find_element_by_xpath(
                                './/div[contains(@id, "cacheresult_summary_")]/span').text
                            item.publish_date = publish_date.replace('年', '-').replace(
                                '月', '-').replace('日', '').replace('-', '')
                        except NoSuchElementException:
                            continue
                else:
                    # 'vrwrap' vertical-result layout.
                    item.title = each_article.find_element_by_class_name('vrTitle').text
                    try:
                        short_description = each_article.find_element_by_class_name('str_info').text
                    except NoSuchElementException:
                        continue
                    item.short_description = date_prefix_re.sub('', short_description)
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    if item.publish_date == '':
                        # Fallback: date from the gray timestamp element.
                        try:
                            publish_date = each_article.find_element_by_class_name('gray-color').text
                            item.publish_date = publish_date.replace('-', '').replace(
                                '年', '-').replace('月', '-').replace('日', '')
                        except NoSuchElementException:
                            continue
                if not in_date_range(conv_pub_date(item.publish_date, 'sougou'), self.year_range):
                    exit_flag = 1
                    break
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                # Real target URL is the 'url=' parameter of the snapshot link.
                url = each_article.find_element_by_xpath(
                    './/a[contains(@id, "sogou_snapshot_")]').get_attribute('href')
                item.url = urllib.parse.unquote(url.split('&')[1].replace('url=', ''))
                threading.Thread(target=super().download_and_save_item, args=(item,)).start()
            if exit_flag == 1:
                break
        except TimeoutException:
            CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
            break
        try:
            next_page = self.driver.find_element_by_id('sogou_next')
            next_page.click()
            time.sleep(2)
        except TimeoutException:
            # Page-load timeout: abort the load and retry the loop.
            self.driver.execute_script('window.stop();')
        except NoSuchElementException:
            break
    return search_results
def crawl_search_results(self):
    """Crawl the Beijing AIC (bjgsj) site search results.

    Iterates '.news' blocks under the content div, parses the publish
    timestamp from '#essaypubtime', stops at the first out-of-range date
    (date-ordered listing), filters by keyword, deduplicates by title, and
    downloads matches on background threads via the superclass handler.
    Pagination follows the '.next-page' href until it disappears.

    Fixes: the timestamp regex is now a raw string (plain-string ``\\d``
    escapes warn on modern Python) and compiled once instead of per article.
    NOTE(review): the date separators are unescaped ``.`` and therefore match
    any character, not just '.'/'-' — kept as-is to preserve behavior.
    """
    # 'YYYY?MM?DD HH:MM:SS' where '?' is any separator character (see NOTE).
    datetime_re = re.compile(
        r'[1-9]\d{3}.(0[1-9]|1[0-2]).(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d')
    search_results = []
    self.driver.switch_to.window(self.driver.window_handles[-1])
    self.driver.maximize_window()
    exit_flag = 0
    while True:
        try:
            self.wait.until(ec.presence_of_all_elements_located(
                (By.XPATH, '//div[@class="content"]//div')))
        except TimeoutException:
            CustomLogging.log_to_file('搜索结果出错', LogType.ERROR)
            break
        try:
            result_articles = self.driver.find_elements_by_xpath(
                '//div[@class="content"]//div[@class="news"]')
            for each_article in result_articles:
                item = Entity()
                publish_date = each_article.find_element_by_id('essaypubtime').text
                item.publish_date = datetime_re.search(publish_date).group()
                if not in_date_range(conv_pub_date(item.publish_date, 'bjgsj'), self.year_range):
                    exit_flag = 1
                    # Date-ordered results: stop here.
                    break
                item.short_description = each_article.find_element_by_id('essaycontent').text
                item.title = each_article.find_element_by_id('essaytitlelinks').text
                # Keyword must appear in title or summary.
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue
                # Cross-page title dedup.
                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)
                item.url = each_article.find_element_by_xpath(
                    './/li[@id="essaytitlelinks"]/a').get_attribute("href")
                threading.Thread(target=super().download_and_save_item, args=(item,)).start()
            if exit_flag == 1:
                break
        except NoSuchElementException:
            CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
            break
        try:
            next_page = self.driver.find_element_by_class_name('next-page')
            # Navigate via href; a native click is unreliable here.
            self.driver.get(next_page.get_attribute('href'))
        except NoSuchElementException:
            break
    return search_results
def crawl_search_results(self):
    """Crawl nbd.com.cn (每经网) search results.

    Pages through the 'search-text mt15' list: for every entry, read the
    publish date from '.articleMaterial_meta' and stop the whole crawl at the
    first entry outside ``self.year_range`` (the listing is date-ordered);
    deduplicate by title via ``self.titles`` and hand each new item to the
    superclass download handler on a background thread. Pagination navigates
    to the '.next' link's href until that link disappears. Returns the
    (unused) empty ``search_results`` list for interface consistency.
    """
    search_results = []
    self.driver.maximize_window()
    reached_date_limit = 0
    while True:
        try:
            self.wait.until(
                ec.presence_of_element_located((By.CLASS_NAME, 'search-text')))
        except TimeoutException:
            CustomLogging.log_to_file('每经网搜索结果页错误', LogType.ERROR)
            break
        try:
            articles = self.driver.find_elements_by_xpath(
                '//ul[@class="search-text mt15"]/li')
            for article in articles:
                entry = Entity()
                entry.publish_date = article.find_element_by_class_name(
                    'articleMaterial_meta').text
                # Listing is date-ordered; an out-of-range entry ends the crawl.
                if not in_date_range(
                        conv_pub_date(entry.publish_date, 'mrjj'), self.year_range):
                    reached_date_limit = 1
                    break
                try:
                    entry.short_description = article.find_element_by_class_name(
                        'articleMaterial_depict').text
                except NoSuchElementException:
                    # Some entries carry no summary; store an empty one.
                    entry.short_description = ''
                title_el = article.find_element_by_class_name('articleMaterial_title')
                entry.title = title_el.text
                # Skip titles already collected on earlier pages.
                if entry.title in self.titles:
                    continue
                self.titles.append(entry.title)
                entry.url = title_el.find_element_by_tag_name('a').get_attribute('href')
                threading.Thread(
                    target=super().download_and_save_item, args=(entry,)).start()
            if reached_date_limit == 1:
                break
        except NoSuchElementException:
            CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
            break
        try:
            link = self.driver.find_element_by_class_name(
                'next').find_element_by_tag_name('a')
            # Navigate straight to the href rather than clicking the link.
            self.driver.get(link.get_attribute('href'))
        except TimeoutException:
            # Page-load timeout: abort the load and retry the loop.
            self.driver.execute_script('window.stop();')
        except NoSuchElementException:
            break
    return search_results