class KeywordTool(object):
    """Scrape keyword suggestions from keywordtool.io with a PhantomJS browser.

    The ``source`` property doubles as a navigation trigger: once the driver
    exists, assigning to it reloads the browser at the matching
    keywordtool.io page.
    """

    # Search engines supported by keywordtool.io; anything else falls back
    # to 'google' in the `source` setter below.
    sources = {'google', 'youtube', 'bing', 'amazon', 'ebay', 'app-store'}

    def __init__(self, source='google', timeout=5):
        self.source = source
        # NOTE(review): deliberate no-op -- the `base_url` setter ignores its
        # value; the getter always derives the URL from `self.source`.
        self.base_url = None
        self.timeout = timeout  # seconds WebDriverWait will wait in search()
        self.driver = PhantomJS()
        self.driver.get(self.base_url)

    def search(self, search_term):
        """Type *search_term* into the page's keyword box and submit it,
        then wait up to `self.timeout` seconds for results to appear."""
        if self.current_url != self.base_url:
            self.source = self.source  # forces page load (setter re-navigates)
        self.driver.find_element_by_xpath(
            '//input[@id="edit-keyword"]').send_keys(search_term)
        self.driver.find_element_by_xpath(
            '//button[@id="edit-submit"]').click()
        """Wait for at least one element to load. In practice, most of them load. You can't get them all without scrolling."""
        # NOTE(review): `invisibility_of_element_located` is satisfied once
        # the element is either absent or hidden -- presumably used here as
        # a "page settled" signal; confirm against the site's behavior.
        element_not_present = EC.invisibility_of_element_located(
            (By.XPATH, '//td[@class="col-keywords"]//div'))
        WebDriverWait(self.driver, self.timeout).until(element_not_present)

    def parse(self):
        """Extract the non-empty keyword strings from the results table."""
        tree = html.fromstring(self.driver.page_source)
        L = tree.xpath('//td[@class="col-keywords"]//text()')
        # Join all text nodes, split per line, and strip whitespace.
        L = map(lambda s: s.strip(), ''.join(L).split('\n'))
        return [s for s in L if s]  # drop empty strings

    def get_keywords(self, search_term, source='google'):
        """Run a full search for *search_term* on *source* and return the
        parsed keyword list."""
        if self.source != source:
            self.source = source  # setter re-navigates the browser
        self.search(search_term)
        return self.parse()

    @property
    def source(self):
        return self._source

    @source.setter
    def source(self, val):
        # Unsupported sources silently fall back to 'google'.  Once the
        # driver attribute exists, every assignment also reloads the page.
        self._source = val if val in self.sources else 'google'
        if 'driver' in self.__dict__:
            self.driver.get(self.base_url)

    @property
    def base_url(self):
        # Always computed from the current source,
        # e.g. https://keywordtool.io/google
        return ''.join(['https://keywordtool.io/', self.source])

    @base_url.setter
    def base_url(self, val):
        # Intentionally ignored: base_url is derived from `source` only.
        pass

    @property
    def current_url(self):
        return self.driver.current_url

    @current_url.setter
    def current_url(self, val):
        # Read-only in practice; assignments are ignored.
        pass
def start(n, comic_url):
    # Crawl a comic starting at *comic_url*, following the "next" control
    # until clicking it fails.  Relies on module-level `urllists` (visited
    # URLs) and `get_images_url` (per-page scraper) defined elsewhere in
    # this file.  Python 2 syntax (print statement).
    urllists.append(comic_url)
    driver = PhantomJS()
    driver.get(comic_url)
    get_images_url(n, driver, comic_url)
    while True:
        try:
            driver.find_element_by_xpath(
                "//li[@id='next_item']/a[@id='mainControlNext']").click()
            comic_url = driver.current_url
            if comic_url not in urllists:
                urllists.append(comic_url)
                get_images_url(n, driver, comic_url)
                # NOTE(review): this second click advances an extra page in
                # the same iteration, and the loop clicks again at the top --
                # pages may be skipped; confirm this is intended.
                driver.find_element_by_xpath(
                    "//li[@id='next_item']/a[@id='mainControlNext']").click()
            # print n + '\t' + comic_url
        except:
            # NOTE(review): bare except treats ANY error (not just a missing
            # "next" button) as end-of-comic.
            print 'All done!'
            break
class AdvertisementAdvancedViewTests(LiveServerTestCase):
    """Browser-level checks for the side-ad and top-ad display views."""

    def setUp(self):
        self.driver = PhantomJS()
        self.user = User.objects.create_user('admin', '*****@*****.**', 'pass')
        self.user.save()
        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()
        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)

    def tearDown(self):
        self.driver.quit()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def test_side_ad_display(self):
        """
        Test that the side ads display properly
        """
        self.open(reverse('advertisements.views.side_ads'))
        anchors = self.driver.find_elements_by_xpath("//a")
        self.assertEqual(len(anchors), 4)
        # Each of the four slots must contain an image ...
        for slot in (1, 2, 3, 4):
            self.driver.find_element_by_xpath("//a[%d]/img" % slot)
        # ... and link somewhere non-empty.
        for slot in (1, 2, 3, 4):
            href = self.driver.find_element_by_xpath("//a[%d]" % slot).get_attribute("href")
            self.assertNotEqual(href, '')

    def test_top_ad_display(self):
        """
        Test that the top ad displays properly
        """
        self.open(reverse('advertisements.views.top_ad'))
        self.assertEqual(len(self.driver.find_elements_by_xpath("//a")), 1)
        self.driver.find_element_by_xpath("//a/img")
        top_href = self.driver.find_element_by_xpath("//a").get_attribute("href")
        self.assertNotEqual(top_href, '')
class Premiumgeneratorlink(object):
    """Drives premiumgeneratorlink.com to convert *url* into a premium link."""

    def __init__(self, url):
        self.url = url
        self.browser = PhantomJS()

    def get_link(self):
        """Return the generated premium link, or False on any browser error.

        The PhantomJS session is always closed, whatever the outcome.
        """
        waiter = WebDriverWait(self.browser, 10)
        try:
            self.browser.get('http://premiumgeneratorlink.com/')
            self.browser.find_element_by_name('link').send_keys(self.url)
            self.browser.find_element_by_xpath('//a[@class="input"]').click()
            # Walk the two-step check/generate flow.
            for button_id in ('check', 'generate'):
                waiter.until(EC.element_to_be_clickable((By.ID, button_id))).click()
            result_form = waiter.until(EC.visibility_of_element_located(
                (By.XPATH, '//form[@class="center"]')))
            link = result_form.get_attribute('action')
        except (WebDriverException, NoSuchElementException, TimeoutException):
            return False
        finally:
            self.browser.quit()
        return link
def run_get_logic(driver: PhantomJS, command_id, token):
    """Authenticate with *token* and scrape the flag from the cabinet page.

    Returns the flag string on success, an error dict when the token is
    missing, or a sentinel string when the flag element is absent.
    """
    # Guard clause: no session token, nothing to do.
    if not token:
        return {"code": 103, "public": "Session troubles!"}
    host = command_id.split(":")[0]
    session_cookie = {
        'name': 'token',
        'value': token,
        'domain': "." + host,
        'path': '/',
    }
    driver.add_cookie(session_cookie)
    driver.get("http://{}/cabinet".format(command_id))
    try:
        holder = driver.find_element_by_xpath('//html//body//div//h5//i')
    except NoSuchElementException:
        return "error_no_flag_in_cabinet"
    return holder.get_attribute('innerHTML')
def run_get_logic(driver: PhantomJS, comand_id, post, flag, cookies):
    """Open the given post under the stored session and verify *flag* shows.

    Returns a checker-style status dict: OK when the flag is visible,
    CORRUPT when it cannot be found, MUMBLE when the session is missing.
    """
    if 'sessions' not in cookies:
        return {"code": MUMBLE, "public": "Session troubles!"}
    host = comand_id.split(":")[0]
    driver.add_cookie({
        'name': 'sessions',
        'value': cookies['sessions'],
        'domain': "." + host,
        'path': '/',
    })
    driver.get("http://{}/{}".format(comand_id, post))
    try:
        container = driver.find_element_by_xpath(
            '//li/a[@href="#"]').get_attribute('innerHTML')
    except NoSuchElementException:
        return {"code": CORRUPT, "public": "Can't find my private data!"}
    if flag in container:
        return {"code": OK}
    return {"code": CORRUPT, "public": "Can't find my private data!"}
def render(gist_id, commit):
    """Screenshot the bl.ocks.org page for *gist_id* and persist it.

    Stores a base64 full-page PNG plus, when an iframe is found, a crop of
    just the block, into the `d3_block` table.  On failure the error string
    is stored instead and the process exits with a distinct status code
    (10: page screenshot failed, 11: iframe crop failed).
    """
    block_url = 'http://bl.ocks.org/' + gist_id
    d3_block_rec = {'gist_id': gist_id}
    try:
        driver = PhantomJS()
        driver.get(block_url)
        time.sleep(RENDER_DELAY)  # let it render
        fullpage_im = Image.open(BytesIO(driver.get_screenshot_as_png()))
        fimb = BytesIO()
        fullpage_im.save(fimb, 'png')
        d3_block_rec['fullpage_base64'] = base64.b64encode(fimb.getvalue())
        # current_url may differ from block_url after redirects.
        d3_block_rec['block_url'] = driver.current_url
    except Exception as e:  # we got nothing
        with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg:
            d3_block_rec['error'] = str(e)
            pg.insert('d3_block', values=d3_block_rec)
        exit(10)
    try:
        # Crop the iframe's bounding box out of the full-page screenshot.
        f = driver.find_element_by_xpath('//iframe')
        x, y = int(f.location['x']), int(f.location['y'])
        w, h = x + int(f.size['width']), y + int(f.size['height'])
        block_im = fullpage_im.crop((x, y, w, h))
        bimb = BytesIO()
        block_im.save(bimb, 'png')
        d3_block_rec['block_base64'] = base64.b64encode(bimb.getvalue())
        d3_block_rec['block_size'] = list(block_im.size)
    except Exception as e:  # at least we got the fullpage im, save it
        with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg:
            d3_block_rec['error'] = str(e)
            pg.insert('d3_block', values=d3_block_rec)
        exit(11)
    # all good, save everything
    with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg:
        pg.insert('d3_block', values=d3_block_rec)
class CNStock(SentimentCrawler):
    """Crawler for search results on cnstock.com (中国证券网)."""

    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        """Open the home page, search for *keyword*, and crawl the results."""
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            # Page load timed out: stop loading and work with what we have.
            self.driver.execute_script('window.stop();')
        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except:
            # Logs "cnstock failed to open"; note the search below is still
            # attempted even after this failure.
            CustomLogging.log_to_file('中国证券网打开失败', LogType.ERROR)
        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword + Keys.ENTER)
        return self.crawl_search_results()

    def crawl_search_results(self):
        """Walk the result pages, spawning a download thread per new item.

        NOTE(review): always returns the empty list it starts with; items
        are persisted by the download threads, not collected here.
        """
        search_results = []
        # The search opens a new window/tab; switch to the newest handle.
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()
        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                # Logs "search results page error".
                CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
                break
            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')
                for each_article in result_articles:
                    item = Entity()
                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    # Extract a "YYYY-MM-DD HH:MM" timestamp from the text.
                    item.publish_date = re.search(
                        re.compile(
                            '[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()
                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # Out of the wanted date range: break out of the for loop.
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    # Skip results that don't mention the keyword, and dupes.
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)
                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()
                if exit_flag == 1:
                    break
            except NoSuchElementException:
                # Logs "no search results".
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break
            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                # No "next page" link: last page reached.
                break
        return search_results

    def parse_html(self, url, html):
        """Return the article body text from *html*, or None on failure."""
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            # Logs "page parse error: <name>|<url>".
            CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                      LogType.ERROR)
            pass
from selenium.webdriver import PhantomJS

# Repeat-voting script (Python 2): reloads the poll page, removes the
# localStorage marker and cookies that record "already voted", then clicks
# the vote button again.  Runs forever until interrupted.
driver = PhantomJS(
    executable_path=
    r'E:\Documents\Apps\phantomjs-2.1.1-windows\bin\phantomjs.exe')
url = 'http://cxwh.kexing100.com:82/?app_act=detail&id=328&from=groupmessage'
driver.get(url)
while True:
    driver.refresh()
    # Print the current vote-info text for progress feedback.
    print driver.find_element_by_xpath("//div[@class='xinfo']").text
    # driver.execute_script("return localStorage.setItem('toupiao','0')")
    # Clear the 'toupiao' (vote) marker so the site allows voting again.
    driver.execute_script("return localStorage.removeItem('toupiao')")
    driver.delete_all_cookies()
    driver.refresh()
    # click() returns None; `vote` is kept only as a no-op binding.
    vote = driver.find_element_by_xpath("//span/input[@class='btn1']").click()
    # break
def get_applications_in_page(self, scroll_script):
    """Scrape one store page with a fresh proxied PhantomJS session.

    Runs *scroll_script* to trigger lazy loading, polls the page-side
    `scraperLoadCompleted` flag until it stays true for
    `self.acknowledgements` consecutive checks, then extracts data from
    every "card" element.  On error, retries recursively up to
    `self.retries` times.  Python 2 syntax (mixed print forms).
    """
    applications = []
    driver = None
    try:
        desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
        # Randomize UA and proxy per attempt to avoid blocking.
        desired_capabilities["phantomjs.page.settings.userAgent"] = useragent.get_random_agent(google_prop.user_agent_list_url)
        service_args = ['--load-images=no', '--proxy=%s' % (proxy.get_random_proxy(google_prop.proxy_list_url))]
        driver = PhantomJS(desired_capabilities=desired_capabilities, service_args=service_args)
        # driver = Firefox(firefox_profile=self.fp, proxy=self.proxy)
        if self.proxy_test:
            # Diagnostic mode: just report the egress IP.
            driver.get('http://curlmyip.com/')
            ip = driver.find_element_by_xpath('//body//pre').text
            print('ip : [ ' + ip + ' ]')
            pass
        else:
            driver.get(self.url)
            driver.execute_script(scroll_script)
            acknowledge = 0
            done = False
            while not done:
                scroll_finished = driver.execute_script("return scraperLoadCompleted")
                if scroll_finished:
                    if acknowledge == self.acknowledgements:
                        # Flag held true long enough; re-read it one last time.
                        done = driver.execute_script("return scraperLoadCompleted")
                        pass
                    else:
                        acknowledge += 1
                        pass
                    pass
                else:
                    # Flag dropped back to false: restart the count.
                    acknowledge = 0
                    pass
                time.sleep(5)  # Wait before retry
                pass
            product_matrix = driver.find_elements_by_class_name("card")
            for application in product_matrix:
                extracted_application = self.extract_application_data(application)
                # if extracted_application['app_price'] != -1:
                applications.append(extracted_application)
                #pass
                pass
            pass
        driver.quit()
        pass
    except Exception as e:
        if driver is not None:
            driver.quit()
            pass
        if self.attempt < self.retries:
            # Back off, then retry the whole page scrape recursively.
            self.attempt += 1
            time.sleep(10)
            print 'retry : url [ ' + self.url + ' ] + | attempt [ ' + str(self.attempt) + ' ] | error [ ' + str(e) + ' ]'
            applications = self.get_applications_in_page(scroll_script)
            pass
        else:
            print('fail : url [ ' + self.url + ' ] | error [ ' + str(e) + ' ]')
            pass
        pass
    return applications
    pass
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount): retail_invoice_url = RETAIL_INVOICE_URL[retail] driver = PhantomJS() driver.get(retail_invoice_url) # 1 Set doc_type 'select' try: select_doc_type = Select(driver.find_element_by_name('txtTipoDte')) value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value'] select_doc_type.select_by_value(value) # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name'] # select_doc_type.select_by_visible_text(name) except Exception: print 'ERROR: set doc_type select as Boleta' driver.save_screenshot('screen.png') return '', '' time.sleep(5) # 2 Get recaptcha img url try: recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image') recaptcha_img_url = recaptcha_img.get_attribute('src') except Exception: print 'ERROR: get recaptcha image url' driver.save_screenshot('screen.png') return '', '' # 3 Solve recaptcha v = VisionApi() recaptcha_value = v.detect_text_from_url(recaptcha_img_url) if recaptcha_value is None: print 'ERROR: solving recaptcha image' driver.save_screenshot('screen.png') return '', '' # 4 Fill form script = u""" document.getElementsByName('txtFolio')[0].value = '{invoice_id}'; document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}'; document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}'; document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}'; """.format( invoice_id=invoice_id, invoice_date=invoice_date, invoice_amount=invoice_amount, recaptcha_value=recaptcha_value, ) driver.execute_script(script) # 5 Submit form try: driver.find_element_by_name('frmDatos').submit() except Exception: print 'ERROR: submitting form' driver.save_screenshot('screen.png') return '', '' # 6 Get url files try: xml_a_tag = driver.find_element_by_xpath( '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]') pdf_a_tag = driver.find_element_by_xpath( '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]') xml_url = 
xml_a_tag.get_attribute('href') pdf_url = pdf_a_tag.get_attribute('href') except Exception: print 'ERROR: getting url files' driver.save_screenshot('screen.png') return '', '' # 8 Delete driver session driver.close() driver.quit() return xml_url, pdf_url
class plugin:
    """PhantomJS/requests helper that performs daily attendance check-ins on
    several Korean services (Ticketmonster, Ondisk, OK Cashbag).  Credentials
    come from the module-level `config` mapping."""

    def __init__(self):
        APP_ROOT = os.path.dirname(os.path.abspath(__file__))
        print(APP_ROOT)
        self.req = 0
        # PhantomJS binary is expected next to this file; its log is discarded.
        self.driver = PhantomJS(APP_ROOT + "/phantomjs",
                                service_log_path=os.path.devnull)
        self.driver.implicitly_wait(3)

    def restart(self):
        # Re-run __init__ to get a fresh driver and session.
        self.__init__()

    def frame_search(self, path):
        """Recursively map the <frame> hierarchy below the current context.

        Returns {frame_name: {'framepath': [ancestor names], 'children': {...}}}.
        After each recursive descent it switches back to default content and
        re-walks `framepath` so the loop continues in the right frame context.
        """
        framedict = {}
        for child_frame in self.driver.find_elements_by_tag_name('frame'):
            child_frame_name = child_frame.get_attribute('name')
            framedict[child_frame_name] = {'framepath': path, 'children': {}}
            xpath = '//frame[@name="{}"]'.format(child_frame_name)
            self.driver.switch_to.frame(
                self.driver.find_element_by_xpath(xpath))
            framedict[child_frame_name]['children'] = self.frame_search(
                framedict[child_frame_name]['framepath'] + [child_frame_name])
            self.driver.switch_to.default_content()
            if len(framedict[child_frame_name]['framepath']) > 0:
                # Re-enter every ancestor frame before the next iteration.
                for parent in framedict[child_frame_name]['framepath']:
                    parent_xpath = '//frame[@name="{}"]'.format(parent)
                    self.driver.switch_to.frame(
                        self.driver.find_element_by_xpath(parent_xpath))
        return framedict

    def tmon(self):
        """Log into Ticketmonster and click the attendance-check button.
        The resulting status text is stored in `self.tmon_ret`."""
        self.driver.get(
            "https://login.ticketmonster.co.kr/user/loginform?return_url=")
        self.driver.find_element_by_name('userid').send_keys(
            config['ACCOUNT']['tmon_id'])
        self.driver.find_element_by_name('password').send_keys(
            config['ACCOUNT']['tmon_pw'])
        self.driver.find_element_by_xpath('//*[@id="loginFrm"]/a[2]').click()
        self.driver.get(
            'http://m.benefit.ticketmonster.co.kr/promotions/page/attendance?view_mode=app'
        )
        self.driver.find_element_by_xpath(
            '//*[@id="attn_wrap"]/div/div/div[3]/div[2]/div[1]/button').click(
            )
        print(self.driver.find_element_by_class_name('content').text)
        self.tmon_ret = self.driver.find_element_by_class_name('content').text

    def ondisk(self):
        """Log into ondisk.co.kr and trigger the attendance event button,
        capturing the site's alert text into `self.ondisk_ret`.

        NOTE(review): if the try block fails before `alert_text` is bound,
        the assignment after the except clause raises NameError.
        """
        try:
            self.driver.get("http://ondisk.co.kr/index.php")
            self.driver.implicitly_wait(3)
            self.driver.find_element_by_xpath('//*[@id="mb_id"]').send_keys(
                config['ACCOUNT']['ondisk_id'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[2]/input').send_keys(
                    config['ACCOUNT']['ondisk_pw'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[3]/input').click()
            self.driver.get(
                "http://ondisk.co.kr/index.php?mode=eventMarge&sm=event&action=view&idx=746&event_page=1"
            )
            self.driver.switch_to_frame(1)
            # Override window.alert so the message is captured, not shown.
            self.driver.execute_script(
                "window.alert = function(msg){ window.msg = msg; };")
            self.driver.find_element_by_class_name('button').click()
            alert_text = self.driver.execute_script("return window.msg;")
            print(alert_text)
        except:
            print("ERR")
            print(self.driver.page_source)
        self.ondisk_ret = alert_text

    def ok_cash_bag(self):
        """Check in on OK Cashbag through Facebook OAuth using plain
        requests (no browser); prints nickname, point balance and result,
        storing the outcome in `self.ok_ret`."""
        today = datetime.datetime.now().strftime("%Y%m%d")
        sess = requests.session()
        getdata = sess.get(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101"
        )
        # Facebook login form payload (field values captured from a real
        # browser session; only the credential fields vary).
        param = {
            "lsd": "AVpmy4vJ",
            "api_key": "645711852239977",
            "cancel_url": "https://member.okcashbag.com/ocb/socialId/facebookProcessor?error=access_denied&error_code=200&error_description=Permissions+error&error_reason=user_denied#_=_",
            "display": "page",
            "enable_profile_selector": "",
            "isprivate": "",
            "legacy_return": "0",
            "profile_selector_ids": "",
            "return_session": "",
            "skip_api_login": "******",
            "signed_next": "1",
            "trynum": "1",
            "timezone": "-540",
            "lgndim": "eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNDAsImMiOjI0fQ==",
            "lgnrnd": "173648_UqkK",
            "lgnjs": "1528418208",
            "email": config['ACCOUNT']['fb_id'],
            "pass": config['ACCOUNT']['fb_pw'],
            "prefill_contact_point": config['ACCOUNT']['fb_id'],
            "prefill_source": "last_login",
            "prefill_type": "contact_point",
            "first_prefill_source": "last_login",
            "first_prefill_type": "contact_point",
            "had_cp_prefilled": "true",
            "had_password_prefilled": "false"
        }
        postdata = sess.post(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101",
            data=param)
        # print(postdata.text)
        postdata = sess.post(
            "https://member.okcashbag.com//ocb/socialId/socialIdLoginProcess/42100/687474703A2F2F7777772e6f6b636173686261672e636f6d2F696e6465782e646f3F6c6f67696e3D59"
        )
        # Pull the SAML response out of the returned HTML/JS.
        samlResponse = postdata.text.split("samlResponse.value = \"")[1].split(
            "\"")[0]
        # print(samlResponse)
        param = {"samlResponse": samlResponse, "sst_cd": "", "return_url": ""}
        postdata = sess.post("http://www.okcashbag.com/index.do?login=Y",
                             data=param)
        print(
            postdata.text.split('<span id="profileNickname" class="name">')
            [1].split("</span>")[0] + "님 로그인")
        print(
            postdata.text.split('<span id="spanUsablePoint">')[1].split(
                '</span>')[0] + "포인트")
        getdata = sess.get(
            "http://www.okcashbag.com/life/event/attend/attendMain.do")
        param = {"method": "", "myUrl": "", "recommUser": "", "today": today}
        postdata = sess.post(
            "http://www.okcashbag.com/life/event/attend/attend.do", data=param)
        print(postdata.text)
        if len(postdata.text.split('<i class="win-point">')) > 1:
            print(postdata.text.split('<i class="win-point">')[1] + "포인트 적립")
        elif len(postdata.text.split("success")) > 1:
            print("출석체크 완료 ")
            self.ok_ret = "출석체크 완료"
        else:
            print('이미 출석체크 완료')
            self.ok_ret = "이미 출석체크 완료"
class InspectAddress(object):
    """Smoke-checks the bidinghuo.cn developer and brand back-office sites
    by logging in over HTTP (requests) and browsing with PhantomJS.
    Python 2 syntax (print statements); all status output is in Chinese."""

    def __init__(self):
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Spoof a desktop Firefox user agent.
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0 "
        )
        self.driver = PhantomJS(
            executable_path=r'phantomjs-2.1.1-windows\bin\phantomjs.exe',
            desired_capabilities=dcap)

    def get_dev_cookie(self):
        """Log into the developer backend and return its laravel_session
        cookie as a dict ready for webdriver.add_cookie()."""
        logurl = 'https://www.bidinghuo.cn/api/backend/login.json'
        # jsondata_url = 'https://www.bidinghuo.cn/api/backend/platform/query.json'
        # NOTE(review): `headers` is built but never passed to requests.post.
        headers = {
            'Content-Type': 'application/json;charset=UTF-8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        data = {
            u'username': config.developers_account[0],
            u'password': config.developers_account[1]
        }
        value = ''  # falls through as empty cookie value on login failure
        try:
            res = requests.post(logurl, data=data)
            if res.status_code == 200:
                print u'开发平台账户登录-成功'
                value = res.cookies['laravel_session']
            else:
                print u'开发平台账户登录-失败'
        except:
            print u'开发平台账户登录-失败'
        cookies = {
            u'domain': u'.bidinghuo.cn',
            u'secure': False,
            u'value': value,
            u'expiry': None,
            u'path': u'/',
            u'httpOnly': True,
            u'name': u'laravel_session'
        }
        return cookies

    def get_brand_cookie(self):
        """Same as get_dev_cookie() but against the brand-admin backend."""
        logurl = 'https://pyf123.bidinghuo.cn/api/admin/login.json'
        # NOTE(review): `headers` is built but never passed to requests.post.
        headers = {
            'Content-Type': 'application/json;charset=UTF-8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        data = {
            u'username': config.brand_user[0],
            u'password': config.brand_user[1]
        }
        value = ''
        try:
            res = requests.post(logurl, data=data)
            if res.status_code == 200:
                print u'品牌商账户登录-成功'
                value = res.cookies['laravel_session']
            else:
                print u'品牌商账户登录-失败'
        except:
            print u'品牌商账户登录-失败'
        cookies = {
            u'domain': u'.bidinghuo.cn',
            u'secure': False,
            u'value': value,
            u'expiry': None,
            u'path': u'/',
            u'httpOnly': True,
            u'name': u'laravel_session'
        }
        return cookies

    def developer_platform(self):
        """Visit the brand developer platform and verify the logged-in user
        name appears on the page."""
        url = config.developers_platform
        try:
            self.driver.add_cookie(self.get_dev_cookie())
            self.driver.get(url)
            self.driver.set_page_load_timeout(30)
        except:
            print u'访问品牌商管理开发平台-异常'
        try:
            page = self.driver.page_source
            page_soup = BeautifulSoup(page)
            username = page_soup.find_all(class_='user-name')[0]
            assert username.string == config.developers_account[0]
            print u'品牌商管理开发平台-访问正常'
        except:
            print u'品牌商管理开发平台-访问异常'

    def brand_platform(self):
        """Visit the brand back office, verify both product cards, then
        follow the links into each sub-site and check their titles."""
        url = config.brand_platform
        try:
            self.driver.add_cookie(self.get_brand_cookie())
            self.driver.get(url)
            self.driver.set_page_load_timeout(30)
            bdh_title = BeautifulSoup(
                self.driver.page_source).find_all(class_='ovh')[0].h2.string
            nsgj_title = BeautifulSoup(
                self.driver.page_source).find_all(class_='ovh')[1].h2.string
            assert bdh_title == u'必订火'
            assert nsgj_title == u'内审管家'
            print u'访问品牌商后台-正常'
        except:
            print u'访问品牌商后台-异常'
        try:
            page = self.driver.page_source
            # Grab the hrefs of the two product cards.
            nsgj_href = self.driver.find_element_by_xpath(
                '//*[@id="app"]/div[2]/div/div[2]/div/div[2]/a').get_attribute(
                    'href')
            bdh_href = self.driver.find_element_by_xpath(
                '//*[@id="app"]/div[2]/div/div[1]/div/div[2]/a').get_attribute(
                    'href')
            assert requests.get(bdh_href).status_code == 200
            self.driver.get(bdh_href)
            self.driver.set_page_load_timeout(30)
            dhh_title = BeautifulSoup(self.driver.page_source).find_all(
                class_='meeting-name text-overflow')[0].string
            assert dhh_title == u'测试订货会'
            print u'访问品牌商订货会-正常'
        except:
            print u'访问品牌商订货会-异常'
        try:
            # NOTE(review): re-checks bdh_href here although it navigates to
            # nsgj_href -- presumably a copy-paste slip; confirm intent.
            assert requests.get(bdh_href).status_code == 200
            self.driver.get(nsgj_href)
            self.driver.set_page_load_timeout(30)
            nsh_title = BeautifulSoup(self.driver.page_source).find_all(
                class_='meeting-name text-overflow')[0].string
            assert nsh_title == u'认同与人体'
            print u'访问品牌商内审管家-正常'
        except:
            print u'访问品牌商内审管家-异常'
class ProviderAdvancedViewTests(LiveServerTestCase):
    """Browser-level tests for the provider statistics pages; every test
    runs as a logged-in provider user (see login())."""

    def setUp(self):
        self.driver = PhantomJS()
        self.user = User.objects.create_user('admin', '*****@*****.**', 'password')
        self.user.save()
        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()
        # 20 model-mommy advertisements attached to the provider.
        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)
        self.login()

    def tearDown(self):
        self.driver.quit()

    def open(self, url):
        # Navigate relative to the live test server.
        self.driver.get("%s%s" % (self.live_server_url, url))

    def login(self):
        """Log in through the login form and assert we land on the
        provider statistics page."""
        self.open(settings.LOGIN_URL)
        self.driver.find_element_by_id("id_username").send_keys("admin")
        self.driver.find_element_by_id("id_password").send_keys("password")
        self.driver.find_element_by_css_selector("button.btn.btn-default").click()
        self.assertEqual(
            self.driver.current_url,
            self.live_server_url + reverse('advertisements.views.view_provider_statistics',
                                           args=[self.provider.pk]),
        )

    def test_can_login(self):
        """
        Test that the user can login
        """
        # Intentionally empty: setUp()/login() already assert the redirect.
        pass

    def test_provider_page_has_all_data(self):
        """
        Test that the provider statistics page has all the correct data
        """
        self.open(reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]))
        self.assertEqual("Open Ads", self.driver.title)
        self.assertIn(
            "{0} advertisements".format(self.provider.name),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )
        self.assertIn(
            "{0} advertisements in rotation".format(20),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )

    def test_advertisement_page_has_all_data(self):
        """
        Test that the advertisement page has all the correct data
        """
        # Check the statistics page of every one of the 20 adverts.
        for advert in self.provider_adverts:
            self.open(reverse('advertisements.views.view_advert_statistics', args=[advert.pk]))
            self.assertIn(
                "ID number: {0}".format(advert.pk),
                self.driver.find_element_by_css_selector("h1.page-header").text,
            )
            self.driver.find_element_by_css_selector("img")
            self.assertEqual("Active", self.driver.find_element_by_xpath("//td[2]/span").text)
            self.assertEqual(advert.url,
                             self.driver.find_element_by_link_text(advert.url).text)
            # The edit form must be pre-filled with the advert's URL.
            self.driver.find_element_by_link_text("Edit URL").click()
            self.assertEqual(advert.url,
                             self.driver.find_element_by_id("id_url").get_attribute("value"))
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount): retail_invoice_url = RETAIL_INVOICE_URL[retail] driver = PhantomJS() driver.get(retail_invoice_url) # 1 Set doc_type 'select' try: select_doc_type = Select(driver.find_element_by_name('txtTipoDte')) value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value'] select_doc_type.select_by_value(value) # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name'] # select_doc_type.select_by_visible_text(name) except Exception: print 'ERROR: set doc_type select as Boleta' driver.save_screenshot('screen.png') return '', '' time.sleep(5) # 2 Get recaptcha img url try: recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image') recaptcha_img_url = recaptcha_img.get_attribute('src') except Exception: print 'ERROR: get recaptcha image url' driver.save_screenshot('screen.png') return '', '' # 3 Solve recaptcha v = VisionApi() recaptcha_value = v.detect_text_from_url(recaptcha_img_url) if recaptcha_value is None: print 'ERROR: solving recaptcha image' driver.save_screenshot('screen.png') return '', '' # 4 Fill form script = u""" document.getElementsByName('txtFolio')[0].value = '{invoice_id}'; document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}'; document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}'; document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}'; """.format( invoice_id=invoice_id, invoice_date=invoice_date, invoice_amount=invoice_amount, recaptcha_value=recaptcha_value, ) driver.execute_script(script) # 5 Submit form try: driver.find_element_by_name('frmDatos').submit() except Exception: print 'ERROR: submitting form' driver.save_screenshot('screen.png') return '', '' # 6 Get url files try: xml_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]') pdf_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]') xml_url = 
xml_a_tag.get_attribute('href') pdf_url = pdf_a_tag.get_attribute('href') except Exception: print 'ERROR: getting url files' driver.save_screenshot('screen.png') return '', '' # 8 Delete driver session driver.close() driver.quit() return xml_url, pdf_url
# -*- coding: utf-8 -*- """ 另类爬虫selenium @author: 肖 """ # 首先进行安装 执行: pip install -U selenium / pip install selenium # test1: from selenium.webdriver import PhantomJS import re web = PhantomJS(executable_path='F:/phantomjs-2.1.1-windows/bin/phantomjs' ) # 需要设置PhantomJS的路径,否则无法运行 web.get('http://www.baidu.com') # 获取网页 print(web.page_source) # 打印网页源代码 print(web.page_source[0:300]) # 打印网页部分源代码,前面300个元素 web.get_screenshot_as_file( './baidu.png') # 网页截图,可以看到一个网页图片,以png的格式保存到指定位置,名称为baidu.png # test2: element = web.find_element_by_xpath('//*[@id="kw"]') # 通过xpath方式进行获取,定位百度的输入栏 element.send_keys('python') # 输入关键字 element = web.find_element_by_xpath( '//*[@id="su"]') # 定位百度的百度一下按钮,定位方式都是通过右键copy xpath即可,不是以往的xpath定位 element.click() # 促发按钮点击事件 web.get_screenshot_as_file( './baidusearch.png') # 网页截图,可以看到一个网页图片,以png的格式保存到指定位置,名称为baidusearch.png print(element) # 打印此时的element print(web.page_source) # 打印源代码 re.compile('<title>(.*?)</title>').findall(web.page_source) # 正则匹配而已