class SeleniumTestCase(LiveServerTestCase):
    """Live-server test case driving a headless PhantomJS browser.

    Creates the browser in _pre_setup and guarantees it is released in
    _post_teardown, so each test gets a fresh driver.
    """

    def _pre_setup(self):
        # Start the browser after Django's own per-test setup.
        super(SeleniumTestCase, self)._pre_setup()
        self.driver = PhantomJS()

    def _post_teardown(self):
        # Quit the browser before Django tears the live server down.
        self.driver.quit()
        super(SeleniumTestCase, self)._post_teardown()

    def login(self, username='******', password='******', url='login'):
        """ Login to the server and be authenticated """
        self.open(reverse(url))
        self.driver.find_element_by_id("id_username").clear()
        self.driver.find_element_by_id("id_username").send_keys(username)
        self.driver.find_element_by_id("id_password").clear()
        self.driver.find_element_by_id("id_password").send_keys(password)
        self.driver.find_element_by_id("submit-id-login").click()

    def open(self, url):
        # Resolve a relative URL against the live test server's address.
        self.driver.get("%s%s" % (self.live_server_url, url))

    def is_element_present(self, how, what):
        """Return True if the element located by (how, what) exists.

        Fix: the original used the Python-2-only `except E, e` clause,
        which is a SyntaxError under Python 3.
        """
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True
def onegoogolePR(self, url):
    """Return the Google PageRank of a single `url`.

    Queries pr.chinaz.com with a headless PhantomJS browser and extracts
    the first digit found in the PR badge image URL.  Returns the literal
    string '暂无数据' ("no data") when the badge is missing or unparsable.
    """
    pr_url = 'http://pr.chinaz.com'  # Google PR lookup service
    driver = PhantomJS()
    try:
        driver.get(pr_url)
        driver.find_element_by_id('PRAddress').send_keys(url)
        driver.find_element_by_class_name('search-write-btn').click()
        try:
            imgsrc = driver.find_element_by_css_selector(
                'span#pr>img').get_attribute('src')
            pr = search(r'\d', imgsrc).group()
        except (NoSuchElementException, AttributeError):
            # Fix: narrowed from a bare `except:`.  NoSuchElementException
            # covers the missing badge; AttributeError covers search()
            # returning None (no digit in the image URL).
            pr = '暂无数据'
    finally:
        # Fix: always release the browser process, even if driver.get or
        # the form interaction raises (the original leaked it then).
        driver.quit()
    return pr
def check_agree(link, soup):
    """Click through a terms-of-use gate if `soup` shows one.

    When the disclaimer button is present, a PhantomJS browser accepts it,
    its cookies are copied into the shared requests session `s`, and the
    page is re-fetched.  Returns the (possibly refreshed) soup.
    """
    # Agree if asked to (click on accept)
    if soup.find('input',
                 {'id': 'ctl00_mainContentArea_disclaimerContent_yesButton'}):
        print("Agreeing the terms of use - please wait...")
        # Fix: raw string — '.\phantomjs.exe' contains the invalid escape
        # sequence '\p' (DeprecationWarning today, error in future Pythons).
        exe_path = (r'.\phantomjs.exe'
                    if platform.startswith('win32') else './phantomjs')
        driver = PhantomJS(exe_path)
        driver.get(link)
        driver.find_element_by_id(
            'ctl00_mainContentArea_disclaimerContent_yesButton').click()
        # Transfer the browser's cookies into the requests session so the
        # follow-up GET is recognised as having accepted the terms.
        for cookie in driver.get_cookies():
            s.cookies.set(cookie['name'], cookie['value'])
        driver.quit()
        resp_inner = s.get(link)
        soup = Soup(resp_inner.text, features="lxml")
        print("Done, now let's get back to the scraping process.")
    return soup
# Parse the "school info" items (grade range and student count) scraped
# from the page, then pull teacher/counselor ratios from the staff panel.
# NOTE(review): fragment — `school_info`, `wd`, `sub`, `DataFrame` and
# `indicators` are defined outside this chunk; confirm against the caller.
for s in school_info:
    # Strip HTML tags and newlines, then collapse runs of whitespace.
    inner_html = sub(r'<.*?>|\n', ' ', s.get_attribute('innerHTML'))
    inner_html = sub(r'\s+', ' ', inner_html).strip()
    if 'grades' in inner_html.lower():
        # e.g. "Grades PK-5" -> ('PK', '5'); PK maps to -1 and K to 0 so
        # the grade-span arithmetic below works on plain ints.
        min_grade, max_grade = inner_html.split(' ')[-1].split('-')
        if min_grade.lower() == 'pk':
            min_grade = -1
        elif min_grade.lower() == 'k':
            min_grade = 0
        n_grades = int(max_grade) - int(min_grade) + 1
    elif 'students' in inner_html.lower():
        # Keep digits only (drops thousands separators like commas).
        n_students = int(sub(r'[^0-9]', '', inner_html.split(' ')[-1]))
# NOTE(review): assumes both a 'grades' and a 'students' item were present
# above; otherwise n_grades/n_students are unbound here — TODO confirm.
students_per_grade = float(n_students) / float(n_grades)
staff_info = wd.find_element_by_id(
    'TeachersStaff').find_elements_by_class_name(
        'rating-container__score-item')
teacher_info = sub(r'<.*?>|\n', ' ', staff_info[0].get_attribute('innerHTML'))
teacher_info = sub(r'\s+', ' ', teacher_info).strip()
counsel_info = sub(r'<.*?>|\n', ' ', staff_info[1].get_attribute('innerHTML'))
counsel_info = sub(r'\s+', ' ', counsel_info).strip()
# Extract the N from "N :1" ratio strings (staff-to-student ratios).
t_to_s_school = int(sub(r'.*?(\d+) :1.*', r'\1', teacher_info))
c_to_s_school = int(sub(r'.*?(\d+) :1.*', r'\1', counsel_info))
# Accumulator frame for the per-indicator scores collected below.
columns = [
    'indicator', 'subject', 'category', 'school_score', 'state_average'
]
df = DataFrame(columns=columns)
ind = 0
class CNStock(SentimentCrawler):
    """Sentiment crawler for cnstock.com (中国证券网) keyword search.

    Drives the site search with PhantomJS, paginates through results and
    spawns one download thread per previously-unseen article.
    NOTE(review): relies on `self.keyword`, `self.titles` and
    `self.year_range` being provided by the SentimentCrawler base class.
    """

    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        """Open the portal, submit `keyword` in the site search box and
        delegate to crawl_search_results()."""
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            # Page too slow: stop loading and work with the partial DOM.
            self.driver.execute_script('window.stop();')
        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except TimeoutException:
            # Fix: narrowed from a bare `except:` — the wait only raises
            # TimeoutException when the element never appears.
            CustomLogging.log_to_file('中国证券网打开失败', LogType.ERROR)
        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword +
                                                                 Keys.ENTER)
        return self.crawl_search_results()

    def crawl_search_results(self):
        """Walk the paginated search results.

        Articles inside the configured year range that mention the keyword
        and were not seen before are handed to a download thread.  Returns
        the accumulator list (always empty, preserving the original
        contract).
        """
        search_results = []
        # The search opens in a new window; switch to the newest handle.
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()
        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
                break
            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')
                for each_article in result_articles:
                    item = Entity()
                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    # "YYYY-MM-DD HH:MM" timestamp embedded in the row.
                    # Fix: raw string for the regex (was a plain string
                    # full of '\d'/'\s' escapes).
                    item.publish_date = re.search(
                        re.compile(
                            r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()
                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # 跳出for循环 -> stop paginating: results are sorted
                        # by date and we are now out of range.
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    # Skip hits that do not actually mention the keyword.
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    # De-duplicate by title.
                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)
                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()
                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break
            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                # Navigate via href rather than click (more reliable in
                # PhantomJS).
                self.driver.get(next_page.get_attribute('href'))
            except NoSuchElementException:
                # No "next page" link: last results page reached.
                break
        return search_results

    def parse_html(self, url, html):
        """Extract the article body text from a detail page.

        Returns the text, or None when the expected container is absent
        (the error is logged, mirroring the original behaviour).
        """
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file(
                '页面解析错误: {0}|{1}'.format(self.name, url), LogType.ERROR)
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount):
    """Return (xml_url, pdf_url) download links for a retail invoice.

    Drives the retailer's invoice-lookup form with PhantomJS: selects the
    document type, solves the reCAPTCHA image via the Vision API, fills
    and submits the form, then scrapes the XML/PDF anchor hrefs.  On any
    step failure a 'screen.png' screenshot is saved and ('', '') returned.
    """
    retail_invoice_url = RETAIL_INVOICE_URL[retail]
    driver = PhantomJS()
    try:
        driver.get(retail_invoice_url)

        # 1 Set doc_type 'select'
        try:
            select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
            value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
            select_doc_type.select_by_value(value)
            # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
            # select_doc_type.select_by_visible_text(name)
        except Exception:
            # Fix: print() calls replace Python-2-only print statements.
            print('ERROR: set doc_type select as Boleta')
            driver.save_screenshot('screen.png')
            return '', ''

        time.sleep(5)

        # 2 Get recaptcha img url
        try:
            recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
            recaptcha_img_url = recaptcha_img.get_attribute('src')
        except Exception:
            print('ERROR: get recaptcha image url')
            driver.save_screenshot('screen.png')
            return '', ''

        # 3 Solve recaptcha
        v = VisionApi()
        recaptcha_value = v.detect_text_from_url(recaptcha_img_url)
        if recaptcha_value is None:
            print('ERROR: solving recaptcha image')
            driver.save_screenshot('screen.png')
            return '', ''

        # 4 Fill form (JS injection avoids per-field send_keys round trips)
        script = u"""
            document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
            document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
            document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
            document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
        """.format(
            invoice_id=invoice_id,
            invoice_date=invoice_date,
            invoice_amount=invoice_amount,
            recaptcha_value=recaptcha_value,
        )
        driver.execute_script(script)

        # 5 Submit form
        try:
            driver.find_element_by_name('frmDatos').submit()
        except Exception:
            print('ERROR: submitting form')
            driver.save_screenshot('screen.png')
            return '', ''

        # 6 Get url files
        try:
            xml_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
            pdf_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')
            xml_url = xml_a_tag.get_attribute('href')
            pdf_url = pdf_a_tag.get_attribute('href')
        except Exception:
            print('ERROR: getting url files')
            driver.save_screenshot('screen.png')
            return '', ''
        return xml_url, pdf_url
    finally:
        # Fix: the original leaked the PhantomJS process on every early
        # error return; quit() in finally always releases it (and makes
        # the redundant close()+quit() pair unnecessary).
        driver.quit()
"""Search DuckDuckGo for "realpython" in headless PhantomJS and print the
URL of the results page."""
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.keys import Keys
from time import sleep

browser = PhantomJS()
browser.set_window_size(1120, 550)
browser.get("https://duckduckgo.com/")

# Type the query into the home-page search box and submit it.
search_box = browser.find_element_by_id('search_form_input_homepage')
search_box.send_keys("realpython")
browser.find_element_by_id("search_button_homepage").click()

print(browser.current_url)
browser.quit()
# Headless Baidu search session: load the home page, search for a phrase,
# and capture a screenshot before and after the search.
driver = PhantomJS(
    'D:/vscodeproject/chapter16爬虫/install/phantomjs-2.1.1-windows/bin/phantomjs.exe'
)
driver.set_window_size(1280, 1080)


def savepic():
    """Save a screenshot named by timestamp plus a random 3-digit suffix."""
    out_dir = 'D:/vscodeproject/python/search-generates-wordcloud/out/'
    stamp = "{:%Y%m%d%H%M%S}-{:03}.png".format(datetime.datetime.now(),
                                               random.randint(1, 100))
    driver.save_screenshot(out_dir + stamp)


url = 'http://www.baidu.com'
driver.get(url)
logger.flow(driver.current_url, '1')
savepic()

# Fill the search box and submit with the Enter key.
search_box = driver.find_element_by_id('kw')
search_box.send_keys('哥斯拉2 百度云下载')
logger.flow(driver.current_url, '2')
search_box.send_keys(Keys.ENTER)

# Give the results page time to render before the second screenshot.
time.sleep(4)
logger.flow(driver.current_url, '3')
savepic()
driver.quit()
class CamaraCGCrawler(object):
    """ Camara CG Ementa Crawler """
    # Scrapes legislative summaries ("ementas") from the Campina Grande
    # city council's ScanLexWeb site with PhantomJS, resuming from the
    # newest record already stored in the `ementas` DB table.
    # NOTE(review): the .encode()/.decode() calls imply Python 2 byte
    # strings; under Python 3 several of them would misbehave — confirm
    # the target interpreter.

    def __init__(self, starting_year):
        # starting_year: earliest year to collect when the DB is empty.
        self.base_url = "http://187.115.174.90:8080/ScanLexWeb"
        self.starting_year = starting_year
        self.browser = None  # PhantomJS instance, created in next_ementa()

    @staticmethod
    def get_ementa_id(published_date, ementa_type, ementa_doc_number,
                      ementa_situation):
        """ Return the Ementa Unique Id """
        # Natural composite key: "YYYY-MM-DD#type#number#situation".
        return "%s#%s#%s#%s" % (datetime.strftime(
            published_date,
            "%Y-%m-%d"), ementa_type, ementa_doc_number, ementa_situation)

    def get_all_ementas_summary(self):
        """ Yield the next ementa information row """
        browser_table = self.browser.find_element_by_id(
            "frmMenu:tabEmentas_data")
        # NOTE(review): BeautifulSoup called without an explicit parser —
        # relies on bs4's default parser choice.
        bs_ementa_table = BeautifulSoup(
            browser_table.get_attribute("innerHTML"))
        for row in bs_ementa_table.find_all("tr"):
            cols = row.find_all("td")
            # Only fully-populated rows (6 columns) are data rows.
            if len(cols) == 6:
                published_date = datetime.strptime(
                    cols[0].span.text.encode("utf-8"), "%d/%m/%Y")
                doc_number = int(cols[1].span.text.encode("utf-8"))
                title = cols[2].span.text.encode("utf-8")
                ementa_type = cols[3].span.text.encode("utf-8")
                ementa_situation = cols[4].span.text.encode("utf-8")
                # Inline JS handler that opens the details page.
                details_js = cols[5].a['onclick'].encode("utf-8")
                # Skip rows dated in the future.
                if published_date > datetime.now():
                    continue
                yield published_date, doc_number, title, ementa_type, ementa_situation, details_js

    def get_ementa_details(self, ementa_details_js):
        """ Crawl the second ementa page """
        # Waiting...
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:j_idt13_content")))
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:tabProponentes")))
        # Get Ementail Details
        bs_ementa_details = BeautifulSoup(self.browser \
            .find_element_by_id("frmfuncao:j_idt13_content").get_attribute("innerHTML"))
        # Fields live at fixed row positions in the details table.
        rows = bs_ementa_details.find_all("tr")
        source = rows[3].td.text
        main_theme = rows[7].td.text
        sys_enter_date = datetime.strptime(rows[9].td.text, "%d/%m/%Y")
        approval_date = datetime.strptime(rows[11].td.text, "%d/%m/%Y")
        # "-1" acts as the sentinel for blank numeric cells.
        process_number = int(rows[15].td.text or "-1")
        autograph_number = int(rows[19].td.text or "-1")
        process_year = int(rows[21].td.text or "-1")
        has_image = rows[23].td.text == "Sim"
        # Get Proponent names
        bs_proponent = BeautifulSoup(
            self.browser.find_element_by_id(
                "frmfuncao:tabProponentes").get_attribute("innerHTML"))
        proponents = ",".join(
            [col.text for col in bs_proponent.find_all("td")])
        return source, proponents, main_theme, sys_enter_date, approval_date, process_number, \
            autograph_number, process_year, has_image

    def next_ementa(self, select_curs):
        """ Iterate in the years onwards and collect all the ementas """
        # Generator: yields one fully-detailed ementa tuple at a time.
        # select_curs: an open DB cursor used both to find the resume
        # point and to skip ementas already stored.
        try:
            LOGGER.info("Opening Browser")
            self.browser = PhantomJS()
            LOGGER.info("GET [%s]", self.base_url)
            self.browser.maximize_window()
            cur_year = int(datetime.now().year)
            # Define the initial collection year
            select_curs.execute(
                "SELECT EXTRACT (YEAR FROM MAX(published_date)) FROM ementas;")
            last_exec_year = select_curs.fetchone()
            if last_exec_year:
                collection_year = max(self.starting_year, last_exec_year[0])
            else:
                collection_year = self.starting_year
            # The search form requires a proponent, so every known
            # proponent (plus "Todos") is tried for every year.
            all_proponents = [
                "ANDERSON MAIA", "Afonso Alexandre Régis",
                "Alcides Cavalcante", "Alcindor Villarim", "Aldo Cabral",
                "Alexandre do Sindicato", "Antonio Pereira",
                "Antônio Alves Pimentel Filho", "Aragão Júnior",
                "Bruno Cunha Lima Branco", "Bruno Gaudêncio", "Buchada",
                "Cassiano Pascoal", "Cozete Babosa",
                "Cássio Murilo Galdino de Araujo", "Daniella Ribeiro",
                "Dr. Nunes", "Executivo", "Fabrinni Brito",
                "Fernando carvalho", "Francisco Dantas Lira",
                "Galego do Leite", "Inacio Falcao", "Ivan Batista",
                "Ivonete Ludgerio", "Joao Dantas",
                "Josimar Henrique da Silva", "José Marcos Raia ",
                "José Ribamar", "João Dantas", "Jóia Germano",
                "Laelson Patricio", "Lafite", "Lindaci Medeiros Nápolis",
                "Lourdes Costa", "Lula Cabral", "Marcos Marinho",
                "Maria Lopes Barbosa", "Marinaldo Cardoso",
                "Metuselá Agra", "Miguel Rodrigues da Silva",
                "Miguel da Construção", "Napoleão Maracajá",
                "Nelson Gomes Filho", "Olimpio Oliveira",
                "Orlandino Farias", "Paulo Muniz", "Paulo de Tarso",
                "Peron Ribeiro Japiassú", "Renato Feliciano",
                "Rodolfo Rodrigues", "Rodrigo Ramos Victor",
                "Romero Rodrigues", "Rostand Paraíba", "Rômulo Gouveia",
                "Saulo Germano", "Saulo Noronha", "Tia Mila",
                "Tovar Correia Lima", "Vaninho Aragão",
                "Veneziano Vital do rego", "Walter Brito Neto", "Todos"
            ]
            while collection_year <= cur_year:
                for i_prop in range(len(all_proponents)):
                    # NOTE(review): str.decode implies Python 2; under
                    # Python 3 this raises AttributeError on str.
                    ementa_prop = all_proponents[i_prop].decode("utf-8")
                    self.browser.get(self.base_url)
                    # Waiting...
                    WebDriverWait(self.browser, 30).until(
                        EC.element_to_be_clickable((By.ID, "frmMenu:button1")))
                    LOGGER.info("Collecting Ementas from [%d][%s - %d/%d]",
                                collection_year, ementa_prop, i_prop + 1,
                                len(all_proponents))
                    # Set Year
                    year_field = self.browser.find_element_by_id("frmMenu:ano")
                    year_field.send_keys(collection_year)
                    # Set Proponent
                    proponent_field = self.browser.find_element_by_id(
                        "frmMenu:autoridade")
                    proponent_field.send_keys(ementa_prop)
                    # Submit the form
                    self.browser.find_element_by_id("frmMenu:button1").click()
                    # Waiting...
                    # _ = WebDriverWait(self.browser, 60).until(EC.visibility_of_element_located((By.ID, "frmMenu:tabEmentas_data")))
                    time.sleep(3)
                    for published_date, document_number, title, ementa_type, ementa_situation, ementa_details_js in self.get_all_ementas_summary(
                    ):
                        ementa_id = self.get_ementa_id(published_date,
                                                       ementa_type,
                                                       document_number,
                                                       ementa_situation)
                        # Only crawl details for ementas not yet stored.
                        select_curs.execute("""
                            SELECT ementa_id
                            FROM ementas
                            WHERE ementa_id = '%s';
                        """ % ementa_id)
                        if not select_curs.fetchone():
                            # Run the details script
                            self.browser.execute_script(ementa_details_js)
                            ementa_source, proponents, main_theme, sys_enter_date, approval_date, \
                                process_number, autograph_number, process_year, has_image = self.get_ementa_details(ementa_details_js)
                            # Come back to the table page
                            self.browser.back()
                            # Waiting...
                            _ = WebDriverWait(self.browser, 60).until(
                                EC.visibility_of_element_located(
                                    (By.ID, "frmMenu:tabEmentas_data")))
                            yield ementa_id, published_date, ementa_type, document_number, title, \
                                ementa_source, proponents, ementa_situation, main_theme, sys_enter_date, \
                                approval_date, process_number, autograph_number, process_year, has_image
                LOGGER.info("DONE [%d]", collection_year)
                self.browser.back()
                collection_year += 1
        finally:
            # Always release the browser process, even on crawl errors.
            if self.browser:
                self.browser.quit()
# Scrape a school's summary panel: grade range, enrollment, and the
# teacher/counselor staffing ratios, then prepare a DataFrame for the
# per-indicator scores collected in the loop at the bottom.
school_info = wd.find_elements_by_class_name('school-info__item')
for s in school_info:
    # Strip HTML tags and newlines, then collapse runs of whitespace.
    inner_html = sub(r'<.*?>|\n', ' ', s.get_attribute('innerHTML'))
    inner_html = sub(r'\s+', ' ', inner_html).strip()
    if 'grades' in inner_html.lower():
        # e.g. "Grades PK-5" -> ('PK', '5'); PK maps to -1 and K to 0 so
        # the grade-span arithmetic below works on plain ints.
        min_grade, max_grade = inner_html.split(' ')[-1].split('-')
        if min_grade.lower() == 'pk':
            min_grade = -1
        elif min_grade.lower() == 'k':
            min_grade = 0
        n_grades = int(max_grade) - int(min_grade) + 1
    elif 'students' in inner_html.lower():
        # Keep digits only (drops thousands separators like commas).
        n_students = int(sub(r'[^0-9]', '', inner_html.split(' ')[-1]))
# NOTE(review): assumes both a 'grades' and a 'students' item appeared
# above; otherwise n_grades/n_students are unbound here — TODO confirm.
students_per_grade = float(n_students) / float(n_grades)
staff_info = wd.find_element_by_id('TeachersStaff').find_elements_by_class_name('rating-container__score-item')
teacher_info = sub(r'<.*?>|\n', ' ', staff_info[0].get_attribute('innerHTML'))
teacher_info = sub(r'\s+', ' ', teacher_info).strip()
counsel_info = sub(r'<.*?>|\n', ' ', staff_info[1].get_attribute('innerHTML'))
counsel_info = sub(r'\s+', ' ', counsel_info).strip()
# Extract the N from "N :1" ratio strings (staff-to-student ratios).
t_to_s_school = int(sub(r'.*?(\d+) :1.*', r'\1', teacher_info))
c_to_s_school = int(sub(r'.*?(\d+) :1.*', r'\1', counsel_info))
columns = ['indicator', 'subject', 'category', 'school_score', 'state_average']
df = DataFrame(columns=columns)
ind = 0
# Walk each rating indicator's sub-navigation buttons.
# NOTE(review): this chunk is truncated — the body of the inner loop
# continues beyond this view.
for indicator in indicators:
    elem = wd.find_element_by_id(indicator)
    buttons = elem.find_elements_by_class_name('sub-nav-item')
    for b in buttons:
class WeixinPhantomjs(Base):
    """Crawl Sogou's Weixin article search with PhantomJS, de-duplicating
    against uids already stored in MongoDB."""

    # Class-level de-dup set, preloaded with every uid already in Mongo;
    # shared (and mutated) by all instances across the thread pool.
    all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1})
                if 'uid' in docs}

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST
        # self.driver = Firefox()
        # Use an explicit PhantomJS binary path when configured.
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

    def open_weixin_browser(self, word):
        """Open the search page, submit `word` and extract page-1 results.

        Returns True on failure (caller treats it as a break flag).
        """
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            # Record the failed word with page 0 so it can be retried.
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        """Return the number of result pages shown by the pager (1 when
        the pager is absent or unreadable); capped at DEFAULT_PAGES."""
        pages = []
        page_id_css = 'pagebar_container'
        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    # First non-digit token ends the page-number run.
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def extract_urls_uids(self, word):
        """Collect {'url', 'uid'} dicts for result links not seen before.

        uid = md5(result timestamp + title + query word).
        """
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text)) for t in self.driver.find_elements_by_css_selector('h4 a')]
        # Bail out when timestamps and links don't pair up one-to-one.
        if len(urls_tits) != len(timestamp):
            return urls_uids
        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.__class__.all_uids:
                    self.__class__.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @property
    def is_forbidden(self):
        """True when the captcha form is present, i.e. we've been blocked."""
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        """Wait up to 20s for element id `by`, click it; False on failure."""
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl_single(self, word=None, go=0):
        """Crawl all result pages for one word, resuming from page `go`."""
        is_go = True
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        is_break = self.open_weixin_browser(word)
        pages = self.get_total_pages_to_word()
        for page in range(self.start_page + 1, (pages or self.end_page) + 1):
            if is_go and page < go_page:
                # Skip pages already handled in a previous run.
                continue
            else:
                is_go = False
            if not self.appear_element(by=next_page_css % page):
                is_break = True
                msg = '\tNot appear next page element, will break'
            elif self.is_forbidden:
                is_break = True
                msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
            if is_break:
                # Persist the resume point before stopping.
                storage_word.append([word, page])
                self.logger.info(msg)
                break
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
            # self.driver.find_element_by_id(next_page_css % page).click()
            # wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
            wt = randint(1, 5)  # polite random delay between pages
            self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt))
            # self.driver.implicitly_wait(wt)
            time.sleep(wt)
        self.close_browser()

    @classmethod
    def crawl_with_threads(cls):
        """Fan query words out over a 4-thread pool, one crawler per word."""
        pool = ThreadPool(4)
        total_words = QueryWords().get_query_words()
        for bulk_words in total_words:
            try:
                pool.map(lambda w: cls().crawl_single(w), bulk_words)
            except Exception as e:
                cls.logger.info('Threads crawl error: type <{}>, msg <{}>'.format(e.__class__, e))
        pool.close()
        pool.join()
        in_client.close()

    def close_browser(self):
        # The window may already be gone; that case is fine to ignore.
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
class WeixinPhantomjs(Base):
    """Variant of the Weixin crawler that loads its query words from
    MongoDB and walks them sequentially from a given starting word."""

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST
        # self.driver = Firefox()
        # Use an explicit PhantomJS binary path when configured.
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()
        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        # Per-instance de-dup set seeded from Mongo (see `uids` property).
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        """Open the search page, submit `word` and extract page-1 results.

        Returns True on failure (caller treats it as a break flag).
        """
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            # Record the failed word with page 0 so it can be retried.
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        """Return the number of result pages shown by the pager (1 when
        the pager is absent or unreadable); capped at DEFAULT_PAGES."""
        pages = []
        page_id_css = 'pagebar_container'
        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    # First non-digit token ends the page-number run.
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def get_query_words(self, word):
        """Build the de-duplicated word list from Mongo ('conp' plus its
        'rel' items, in _id order) and slice it starting at `word`."""
        query_words = []
        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']
            if w not in query_words:
                query_words.append(w)
            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)
        self.client.close()
        return self.query_index(query_words, word)

    @property
    def uids(self):
        # All uids already stored, for de-duplication across runs.
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        """Collect {'url', 'uid'} dicts for result links not seen before.

        uid = md5(result timestamp + title + query word).
        """
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text)) for t in self.driver.find_elements_by_css_selector('h4 a')]
        # Bail out when timestamps and links don't pair up one-to-one.
        if len(urls_tits) != len(timestamp):
            return urls_uids
        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        """Return (remaining words, absolute start index), resuming the
        START_INDEX:END_INDEX window at `cut_word` when it is present."""
        temp_words = words[START_INDEX:END_INDEX]
        try:
            index = temp_words.index(cut_word)
            return temp_words[index:], index + START_INDEX
        except ValueError:
            pass
        return temp_words, START_INDEX

    @property
    def is_forbidden(self):
        """True when the captcha form is present, i.e. we've been blocked."""
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        """Wait up to 20s for element id `by`, click it; False on failure."""
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        """Crawl every query word from `word` onwards, resuming each word's
        pagination from page `go`; stops entirely on a break condition."""
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words, ind = self.get_query_words(word)
        for index, word in enumerate(query_words, 1):
            next_ind = ind + index  # absolute index, for progress logging
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()
            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                if is_go and page < go_page:
                    # Skip pages already handled in a previous run.
                    continue
                else:
                    is_go = False
                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNot appear next page element, will break'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
                if is_break:
                    # Persist the resume point before stopping.
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break
                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()
                # self.driver.find_element_by_id(next_page_css % page).click()
                # Longer pause every third page to look less bot-like.
                wt = randint(10, 40) if page % 3 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt))
                # self.driver.implicitly_wait(wt)
                time.sleep(wt)
            if is_break:
                break
        in_client.close()
        self.close_browser()

    def close_browser(self):
        # The window may already be gone; that case is fine to ignore.
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
class ProviderAdvancedViewTests(LiveServerTestCase):
    """Browser-level tests (PhantomJS) for the provider statistics views."""

    def setUp(self):
        # Build a provider owned by a fresh user plus 20 of its adverts,
        # then authenticate through the real login form.
        self.driver = PhantomJS()
        self.user = User.objects.create_user('admin', '*****@*****.**', 'password')
        self.user.save()
        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()
        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)
        self.login()

    def tearDown(self):
        # Release the browser process after each test.
        self.driver.quit()

    def open(self, url):
        # Resolve a relative URL against the live test server's address.
        self.driver.get("%s%s" % (self.live_server_url, url))

    def login(self):
        """Log in via the form and assert we land on the provider's
        statistics page."""
        self.open(settings.LOGIN_URL)
        self.driver.find_element_by_id("id_username").send_keys("admin")
        self.driver.find_element_by_id("id_password").send_keys("password")
        self.driver.find_element_by_css_selector("button.btn.btn-default").click()
        self.assertEqual(
            self.driver.current_url,
            self.live_server_url + reverse('advertisements.views.view_provider_statistics',
                                           args=[self.provider.pk]),
        )

    def test_can_login(self):
        """ Test that the user can login """
        # setUp already performed and asserted the login; nothing more here.
        pass

    def test_provider_page_has_all_data(self):
        """ Test that the provider statistics page has all the correct data """
        self.open(reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]))
        self.assertEqual("Open Ads", self.driver.title)
        self.assertIn(
            "{0} advertisements".format(self.provider.name),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )
        self.assertIn(
            "{0} advertisements in rotation".format(20),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )

    def test_advertisement_page_has_all_data(self):
        """ Test that the advertisement page has all the correct data """
        for advert in self.provider_adverts:
            self.open(reverse('advertisements.views.view_advert_statistics', args=[advert.pk]))
            self.assertIn(
                "ID number: {0}".format(advert.pk),
                self.driver.find_element_by_css_selector("h1.page-header").text,
            )
            # Presence check: raises NoSuchElementException if missing.
            self.driver.find_element_by_css_selector("img")
            self.assertEqual("Active", self.driver.find_element_by_xpath("//td[2]/span").text)
            self.assertEqual(advert.url,
                             self.driver.find_element_by_link_text(advert.url).text)
            self.driver.find_element_by_link_text("Edit URL").click()
            self.assertEqual(advert.url, self.driver.find_element_by_id("id_url").get_attribute("value"))
# 将他拷贝到本地文件 w 写 b 二进制 wb代表写入二进制文本 path = Path('./pic') path.mkdir(exist_ok=True) path = path / url_info[1] with open(path, 'wb') as f: f.write(img) driver = PhantomJS() # 创建Chrome对象. # 操作这个对象. driver.get('http://zxgk.court.gov.cn/shixin/') # get方式访问百度. i = 1 j = 10000 while j >= 0: a = driver.find_element_by_id("captchaImg") url = (a.get_attribute('src')) pic_name = f"{i}.png" try: download_img([url, pic_name]) except Exception as e: print(e) continue print(f"{pic_name}已经下载成功,共成功下载{i}张验证码") i += 1 j -= 1 ActionChains(driver).move_to_element(a).click().perform() time.sleep(2) # 防止过快被封ip driver.quit() # 使用完, 记得关闭浏览器, 不然chromedriver.exe进程为一直在内存中.
def __init__(self, browser: webdriver.PhantomJS, click_to_display_id: str):
    """Cache the browser handle and locate the click-to-display element.

    :param browser: an already-initialized PhantomJS webdriver.
    :param click_to_display_id: DOM id of the element that is clicked to
        reveal the content of interest.
    :raises NoSuchElementException: if the id is not present in the page.
    """
    self._browser = browser
    self.click_to_display_id = click_to_display_id
    # Resolve the element once up front so a bad id fails fast.
    self.click_to_display_element = browser.find_element_by_id(click_to_display_id)
from selenium.webdriver import PhantomJS as Browser
import json
import time
import re

# Scrape SOCKS proxy listings from spys.one, country by country.
proxy_list_url = "http://spys.one/socks/"
proxies = []
br = Browser()
br.get(proxy_list_url)
# Page-size options offered by the site's results-per-page dropdown.
sizes = [25, 50, 100, 200, 300, 500]
# Matches the "(N)" proxy count that follows a country name.
pattern = re.compile(r"[.\s]+\((\d+)\)")
for country_id in range(1, 171):
    try_counter = 0
    count = 0
    # Keep clicking the country <option> until the form's selected value
    # reflects the requested country (the page reloads on selection).
    while (elm := br.find_element_by_id('tldc')).find_element_by_xpath(
            f"./option[@selected]").get_attribute("value") != str(country_id):
        elm = elm.find_element_by_xpath(f'./option[@value="{country_id}"]')
        elm.click()
        try_counter += 1
        if try_counter >= 2:
            break
    if try_counter >= 2:
        # Country could not be selected after two attempts; skip it.
        continue
    count = int(pattern.findall(elm.text)[0])
    key = 0
    # Pick the smallest page size able to show every proxy at once.
    for key, size in enumerate(sizes):
        if int(size) > count:
            break
    try_counter = 0
    # Same select-until-applied dance for the results-per-page dropdown.
    # NOTE(review): chunk is truncated here — the loop body continues
    # beyond this view.
    while (elm := br.find_element_by_id("xpp")).find_element_by_xpath(
            "./option[@selected]").get_attribute("value") != str(key):
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount):
    """Return (xml_url, pdf_url) download links for a retail invoice.

    Drives the retailer's invoice-lookup form with PhantomJS: selects the
    document type, solves the reCAPTCHA image via the Vision API, fills
    and submits the form, then scrapes the XML/PDF anchor hrefs.  On any
    step failure a 'screen.png' screenshot is saved and ('', '') returned.
    """
    retail_invoice_url = RETAIL_INVOICE_URL[retail]
    driver = PhantomJS()
    try:
        driver.get(retail_invoice_url)

        # 1 Set doc_type 'select'
        try:
            select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
            value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
            select_doc_type.select_by_value(value)
            # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
            # select_doc_type.select_by_visible_text(name)
        except Exception:
            # Fix: print() calls replace Python-2-only print statements.
            print('ERROR: set doc_type select as Boleta')
            driver.save_screenshot('screen.png')
            return '', ''

        time.sleep(5)

        # 2 Get recaptcha img url
        try:
            recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
            recaptcha_img_url = recaptcha_img.get_attribute('src')
        except Exception:
            print('ERROR: get recaptcha image url')
            driver.save_screenshot('screen.png')
            return '', ''

        # 3 Solve recaptcha
        v = VisionApi()
        recaptcha_value = v.detect_text_from_url(recaptcha_img_url)
        if recaptcha_value is None:
            print('ERROR: solving recaptcha image')
            driver.save_screenshot('screen.png')
            return '', ''

        # 4 Fill form (JS injection avoids per-field send_keys round trips)
        script = u"""
            document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
            document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
            document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
            document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
        """.format(
            invoice_id=invoice_id,
            invoice_date=invoice_date,
            invoice_amount=invoice_amount,
            recaptcha_value=recaptcha_value,
        )
        driver.execute_script(script)

        # 5 Submit form
        try:
            driver.find_element_by_name('frmDatos').submit()
        except Exception:
            print('ERROR: submitting form')
            driver.save_screenshot('screen.png')
            return '', ''

        # 6 Get url files
        try:
            xml_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
            pdf_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')
            xml_url = xml_a_tag.get_attribute('href')
            pdf_url = pdf_a_tag.get_attribute('href')
        except Exception:
            print('ERROR: getting url files')
            driver.save_screenshot('screen.png')
            return '', ''
        return xml_url, pdf_url
    finally:
        # Fix: the original leaked the PhantomJS process on every early
        # error return; quit() in finally always releases it (and makes
        # the redundant close()+quit() pair unnecessary).
        driver.quit()