import json
import time

# Fragment of a proxy-list scraper: `br` is a Selenium WebDriver, `pattern`
# a compiled regex extracting the advertised proxy count, `sizes` the
# page-size options the site offers, and `proxies` the accumulator list.
# The first half runs inside a loop over listing pages, which the bare
# `continue` below belongs to.
count = int(pattern.findall(elm.text)[0])

# Pick the first page size large enough to show every proxy at once.
key = 0
for key, size in enumerate(sizes):
    if int(size) > count:
        break

# Re-select the desired <option> of the "xpp" page-size <select> until the
# page reports it as selected; give up after five attempts. (The `:=`
# walrus operator requires Python 3.8+.)
try_counter = 0
while (elm := br.find_element_by_id("xpp")).find_element_by_xpath(
        "./option[@selected]").get_attribute("value") != str(key):
    elm.find_element_by_xpath(f'./option[@value="{key}"]').click()
    try_counter += 1
    if try_counter >= 5:
        break
if try_counter >= 5:
    continue  # selection never took effect; skip to the next listing page

# Skip each table's header row, then collect host, type and delay.
rows = (br.find_elements_by_class_name("spy1xx")[1:]
        + br.find_elements_by_class_name("spy1x")[1:])
start = time.time()
for i, row in enumerate(rows, start=1):
    tds = row.find_elements_by_tag_name("td")
    proxies.append(dict(proxy_url=tds[0].text,
                        type=tds[1].text,
                        delay=float(tds[5].text)))
    if i % 50 == 0:  # report scraping speed every 50 rows
        stop = time.time()
        print(stop - start)
        start = stop

# Fastest proxies first, then persist them. The original was cut off after
# the `open(...)`; json.dump is the assumed completion.
proxies.sort(key=lambda x: x['delay'])
with open("proxies.json", "w") as f:
    json.dump(proxies, f)
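# Consumption sketch (added; not part of the original): load the saved list
# and route a request through the fastest proxy. The `requests` call and the
# "http://" scheme prefix are illustrative assumptions.
import json
import requests

with open("proxies.json") as f:
    fastest = json.load(f)[0]  # entries are sorted by delay, fastest first

resp = requests.get("https://example.com",
                    proxies={"http": "http://" + fastest["proxy_url"]})
print(resp.status_code)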
import re
import threading

from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

# SentimentCrawler, Entity, CustomLogging, LogType, in_date_range and
# conv_pub_date are project-local helpers assumed to be on the import path.


class CNStock(SentimentCrawler):
    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        self.keyword = keyword  # crawl_search_results() filters on this
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')
        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except TimeoutException:
            CustomLogging.log_to_file('failed to open 中国证券网', LogType.ERROR)
            return []
        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(
            keyword + Keys.ENTER)
        return self.crawl_search_results()

    def crawl_search_results(self):
        search_results = []
        # The search results open in a new window; switch to it.
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()
        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file(
                    'error on 中国证券网 search results page', LogType.ERROR)
                break
            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')
                for each_article in result_articles:
                    item = Entity()
                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    item.publish_date = re.search(
                        r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])'
                        r'\s+(20|21|22|23|[0-1]\d):[0-5]\d',
                        publish_date).group()
                    # Stop paging once an article falls outside the date range.
                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        break  # break out of the for loop
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name('a').text
                    if (self.keyword not in item.short_description
                            and self.keyword not in item.title):
                        continue
                    if item.title in self.titles:
                        continue  # already collected
                    self.titles.append(item.title)
                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item,)).start()
                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('no search results', LogType.INFO)
                break
            try:
                # Follow the "下一页" ("next page") pagination link.
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]'
                    '//a[contains(text(), "下一页")]')
                self.driver.get(next_page.get_attribute('href'))
            except NoSuchElementException:
                break
        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            return bs.find('div', attrs={'id': 'qmt_content_div'}).text
        except Exception:
            CustomLogging.log_to_file(
                'page parse error: {0}|{1}'.format(self.name, url),
                LogType.ERROR)
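# Minimal usage sketch (added; not part of the original). Assumes the
# SentimentCrawler base class provides `titles`, `year_range` and
# `download_and_save_item`; '新能源' is just a placeholder search keyword.
crawler = CNStock()
results = crawler.crawl_main_page('新能源')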
import time

from selenium.webdriver import PhantomJS
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Method of a scraper class: `useragent`, `proxy` and `google_prop` are
# project-local helpers; self.url, self.proxy_test, self.attempt,
# self.retries and self.acknowledgements are set elsewhere in the class.
def get_applications_in_page(self, scroll_script):
    applications = []
    driver = None
    try:
        # Randomise the user agent and route traffic through a random proxy.
        desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
        desired_capabilities["phantomjs.page.settings.userAgent"] = \
            useragent.get_random_agent(google_prop.user_agent_list_url)
        service_args = [
            '--load-images=no',
            '--proxy=%s' % proxy.get_random_proxy(google_prop.proxy_list_url),
        ]
        driver = PhantomJS(desired_capabilities=desired_capabilities,
                           service_args=service_args)

        if self.proxy_test:
            # Sanity-check the proxy by asking an echo service for our IP.
            driver.get('http://curlmyip.com/')
            ip = driver.find_element_by_xpath('//body//pre').text
            print('ip : [ ' + ip + ' ]')
        else:
            driver.get(self.url)
            driver.execute_script(scroll_script)

            # Treat the infinite scroll as finished only after the page has
            # reported completion `self.acknowledgements` times in a row.
            acknowledge = 0
            done = False
            while not done:
                if driver.execute_script("return scraperLoadCompleted"):
                    if acknowledge == self.acknowledgements:
                        done = True
                    else:
                        acknowledge += 1
                else:
                    acknowledge = 0  # a pending load resets the streak
                time.sleep(5)  # wait before polling again

            for application in driver.find_elements_by_class_name("card"):
                applications.append(self.extract_application_data(application))

        driver.quit()
    except Exception as e:
        if driver is not None:
            driver.quit()
        if self.attempt < self.retries:
            # Back off, then retry the whole page with a fresh driver.
            self.attempt += 1
            time.sleep(10)
            print('retry : url [ %s ] | attempt [ %d ] | error [ %s ]'
                  % (self.url, self.attempt, e))
            applications = self.get_applications_in_page(scroll_script)
        else:
            print('fail : url [ %s ] | error [ %s ]' % (self.url, e))
    return applications
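# The acknowledgement loop above, distilled into a standalone helper (an
# added sketch, not part of the original): a scroll counts as finished only
# after `required_acks` consecutive positive polls, which guards against the
# page momentarily reporting completion while more content is still loading.
import time

def wait_for_scroll_completion(driver, required_acks, poll_seconds=5):
    acks = 0
    while True:
        if driver.execute_script("return scraperLoadCompleted"):
            if acks == required_acks:
                return
            acks += 1
        else:
            acks = 0  # reset the streak on any pending load
        time.sleep(poll_seconds)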
from re import sub
from time import time

from pandas import DataFrame
from selenium.webdriver import PhantomJS

# `urls` (the school pages to scrape) is assumed to be defined earlier.
wd = PhantomJS()
output_cols = [
    'school', 'url', 'students_per_grade', 'teachers_to_student',
    'counselors_to_student', 'reading', 'math', 'science'
]
output_df = DataFrame(columns=output_cols)
output_ind = 0

for url in urls:
    t1 = time()
    wd.get(url)
    school_name = wd.title.split(' -')[0]
    print(school_name, end=' ')

    # Each "school info" badge holds one fact (grade span, enrolment, ...).
    school_info = wd.find_elements_by_class_name('school-info__item')
    for s in school_info:
        # Strip tags and collapse whitespace in the badge's HTML.
        inner_html = sub(r'<.*?>|\n', ' ', s.get_attribute('innerHTML'))
        inner_html = sub(r'\s+', ' ', inner_html).strip()
        if 'grades' in inner_html.lower():
            # e.g. "Grades PK-5": PK (pre-K) maps to -1, K to 0,
            # so n_grades = 5 - (-1) + 1 = 7.
            min_grade, max_grade = inner_html.split(' ')[-1].split('-')
            if min_grade.lower() == 'pk':
                min_grade = -1
            elif min_grade.lower() == 'k':
                min_grade = 0
            n_grades = int(max_grade) - int(min_grade) + 1
        elif 'students' in inner_html.lower():
            n_students = int(sub(r'[^0-9]', '', inner_html.split(' ')[-1]))

    students_per_grade = float(n_students) / float(n_grades)
    staff_info = wd.find_element_by_id(
        'TeachersStaff').find_elements_by_class_name(
        'rating-container__score-item')
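# The fragment breaks off above. A hypothetical continuation (added for
# illustration; the ratio and score variables would still have to be parsed
# from `staff_info` and the rest of the page) would fill one row per school:
#
#   output_df.loc[output_ind] = [school_name, url, students_per_grade,
#                                teachers_to_student, counselors_to_student,
#                                reading, math, science]
#   output_ind += 1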
from selenium.webdriver import PhantomJS

driver = PhantomJS()
# Google Finance stock screener: every company on the given exchange with
# 0.1 < last_price < 1500.
url = ('https://www.google.com/finance?start=0&num=5000&q=%5B(exchange%20%3D'
       '%3D%20"{}")%20%26%20(last_price%20>%200.1)%20%26%20(last_price%20<'
       '%201500)%5D&restype=company&noIL=1')
driver.get(url.format('NYSE'))
nyse = [elem.text for elem in driver.find_elements_by_class_name('symbol')]

# Open the first symbol's quote page and print its last price.
driver.get('https://www.google.com/finance?q=NYSE%3A{}'.format(nyse[0]))
print(driver.find_element_by_class_name('pr').text)

# driver.get(url.format('NASDAQ'))
# nasdaq = [elem.text for elem in driver.find_elements_by_class_name('symbol')]
# print('\n'.join(nasdaq))
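# Added for readability (not in the original): the percent-encoded screener
# query in `url` decodes to a plain filter expression.
from urllib.parse import unquote
print(unquote(url.format('NYSE')))
# ...?start=0&num=5000&q=[(exchange == "NYSE") & (last_price > 0.1)
#                         & (last_price < 1500)]&restype=company&noIL=1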