class Translator(threading.Thread): def __init__(self, queue, executable_path=None, desired_capabilities=None, service_args=None, google_translate_url=config['google_translate_url'], window_size=config['window_size']): super(self.__class__, self).__init__() self._queue = queue kwargs = {} if executable_path is not None: kwargs['executable_path'] = executable_path if desired_capabilities is not None: kwargs['desired_capabilities'] = desired_capabilities if service_args is not None: kwargs['service_args'] = service_args self._driver = PhantomJS(**kwargs) self._driver.set_window_size(*window_size) self._driver.get(google_translate_url) def run(self): while True: task = self._queue.get() if task is None: self._queue.task_done() self._driver.quit() break task.do(self._driver) self._queue.task_done()
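A minimal usage sketch for the Translator worker above, assuming the module-level config dict from the original snippet; TranslateTask and the 'source' element id are illustrative only (any object with a do(driver) method works, and None is the stop sentinel the run() loop already expects):

import queue

class TranslateTask:
    def __init__(self, text):
        self.text = text

    def do(self, driver):
        # Hypothetical task body: type text into the page loaded by the worker.
        box = driver.find_element_by_id('source')
        box.clear()
        box.send_keys(self.text)

task_queue = queue.Queue()
worker = Translator(task_queue)   # uses config defaults for URL and window size
worker.start()

task_queue.put(TranslateTask('hello world'))
task_queue.put(None)              # sentinel: worker quits the driver and exits
task_queue.join()                 # block until both items are marked done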
class SeleniumTestCase(LiveServerTestCase):
    def _pre_setup(self):
        super(SeleniumTestCase, self)._pre_setup()
        self.driver = PhantomJS()

    def _post_teardown(self):
        self.driver.quit()
        super(SeleniumTestCase, self)._post_teardown()

    def login(self, username='******', password='******', url='login'):
        """Log in to the server and authenticate the session."""
        self.open(reverse(url))
        self.driver.find_element_by_id("id_username").clear()
        self.driver.find_element_by_id("id_username").send_keys(username)
        self.driver.find_element_by_id("id_password").clear()
        self.driver.find_element_by_id("id_password").send_keys(password)
        self.driver.find_element_by_id("submit-id-login").click()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def is_element_present(self, how, what):
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True
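A short sketch of how this base class would be used in a test, assuming the project defines a URL pattern named 'login' with the same form ids as the login() helper:

from django.urls import reverse
from selenium.webdriver.common.by import By

class LoginPageTests(SeleniumTestCase):
    def test_login_form_is_present(self):
        self.open(reverse('login'))   # 'login' URL name is an assumption
        self.assertTrue(self.is_element_present(By.ID, 'id_username'))
        self.assertTrue(self.is_element_present(By.ID, 'id_password'))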
def _init_robot(self, id): robot = WDriver() logging.debug("initialize") self.robots.update({str(id): robot}) logging.debug("get facebook.com") robot.get('http://fb.com') logging.debug("login") robot.find_element_by_name('email').send_keys('*****@*****.**') robot.find_element_by_name('pass').send_keys('2855930022040') robot.find_element_by_name('pass').send_keys(Keys.RETURN) for index in range(len(self.remain_ids)): self.lock.acquire() user_id = self.remain_ids.pop() self.lock.release() try: self.get_name_for_id(robot, user_id) except: logging.debug("error while updating record with id=%s" % str(user_id)) self.error_ids.add(user_id) else: self.done_ids.add(user_id) robot.close() return
def export(plot, filename, width=800, height=600):
    """Export plot to file.

    Args:
        plot (quorra.Plot): Quorra plot object to export.
        filename (str): Filename to export to.
        width (int): Width for plot (pixels).
        height (int): Height for plot (pixels).
    """
    global _phantom, __templates__, __cwd__
    if _phantom is None:
        from selenium.webdriver import PhantomJS
        # os.devnull (not os.path.devnull) discards the PhantomJS service log
        _phantom = PhantomJS(service_log_path=os.devnull)
    tmpl = os.path.join(__templates__, 'export.html')
    exp = os.path.join(__cwd__, '.' + str(uuid.uuid1()) + '.html')
    try:
        with open(tmpl, 'r') as fi, open(exp, 'w') as fo:
            dat = fi.read()
            dat = dat.replace('var plot = undefined;', 'var plot = {};'.format(str(plot)))
            dat = dat.replace('width: 800px;', 'width: {}px;'.format(width))
            dat = dat.replace('height: 500px;', 'height: {}px;'.format(height))
            fo.write(dat)
        _phantom.get('file://' + exp)
        _phantom.save_screenshot(filename.replace('.png', '') + '.png')
    finally:
        if os.path.exists(exp):
            os.remove(exp)
    return
def selenium(self, webdriverOption=0):
    """Download the page with a real browser; works for any kind of page.

    :return: the page source, or None on failure
    """
    if not self.url[:4] == "http":
        return None
    driver = None
    if webdriverOption == 0:
        from selenium.webdriver import PhantomJS
        driver = PhantomJS()
    elif webdriverOption == 1:
        from selenium.webdriver import Chrome
        driver = Chrome()
    elif webdriverOption == 2:
        from selenium.webdriver import Firefox
        driver = Firefox()
    if not driver:
        print(u"-->DownLoader->Selenium driver failed to initialize; check the runtime environment or the webdriverOption value")
        return None
    driver.get(self.url)
    src = driver.page_source
    driver.quit()
    self.pageSource = src
    return src
def test_plotly(remove_build): """Tests plotly.""" viz = Plotly() ctrl = Nouislider() ctrl2 = Button() path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'build') layout = Layout(directory=path) layout.add(viz) layout.add_sidebar(ctrl) layout.add_sidebar(ctrl2) layout.subscribe(callback, ctrl.on_change) layout.subscribe(callback, ctrl2.on_click) layout.build() env = os.environ env['PYTHONPATH'] = '{}:{}'.format(os.getcwd(), os.environ.get('PYTHONPATH', '')) server = subprocess.Popen(os.path.join(path, 'src/server.py'), env=env) time.sleep(5) driver = PhantomJS() driver.get('http://localhost:9991') assert driver.title == 'Bowtie App' server.kill()
def scrape_statuses(self): headless_browser = PhantomJS() headless_browser.get(MTA_URL) soup = BeautifulSoup(headless_browser.page_source, "html.parser") for line_name in LINES: line = self.get_line(soup, line_name) self.lines.append(line)
def generate_image(structure): image_path = os.path.join(mkdtemp(), 'okc.png') html_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), 'okc.html', ) url = 'file://{}'.format(html_path) driver = PhantomJS(service_log_path=mkstemp()[1]) driver.set_window_size(2000, 500) driver.get(url) driver.execute_script('setText({});'.format(json.dumps(structure))) if random() > 0.4: driver.execute_script('hideForm();') elif random() > 0.5: driver.execute_script('uncheckForm();') driver.set_window_size(*driver.execute_script('return getSize();')) driver.save_screenshot(image_path) # twitter's gonna make our beautiful screenshot a jpeg unless we make it # think that we're using transparency for a reason, so,, img = Image.open(image_path) origin = img.getpixel((0, 0)) new_origin = origin[:3] + (254,) img.putpixel((0, 0), new_origin) img.save(image_path) subprocess.check_call(['optipng', '-quiet', image_path]) return image_path
class KeywordTool(object): sources = {'google', 'youtube', 'bing', 'amazon', 'ebay', 'app-store'} def __init__(self, source='google', timeout=5): self.source = source self.base_url = None self.timeout = timeout self.driver = PhantomJS() self.driver.get(self.base_url) def search(self, search_term): if self.current_url != self.base_url: self.source = self.source # forces page load self.driver.find_element_by_xpath( '//input[@id="edit-keyword"]').send_keys(search_term) self.driver.find_element_by_xpath( '//button[@id="edit-submit"]').click() """Wait for at least one element to load. In practice, most of them load. You can't get them all without scrolling.""" element_not_present = EC.invisibility_of_element_located( (By.XPATH, '//td[@class="col-keywords"]//div')) WebDriverWait(self.driver, self.timeout).until(element_not_present) def parse(self): tree = html.fromstring(self.driver.page_source) L = tree.xpath('//td[@class="col-keywords"]//text()') L = map(lambda s: s.strip(), ''.join(L).split('\n')) return [s for s in L if s] def get_keywords(self, search_term, source='google'): if self.source != source: self.source = source self.search(search_term) return self.parse() @property def source(self): return self._source @source.setter def source(self, val): self._source = val if val in self.sources else 'google' if 'driver' in self.__dict__: self.driver.get(self.base_url) @property def base_url(self): return ''.join(['https://keywordtool.io/', self.source]) @base_url.setter def base_url(self, val): pass @property def current_url(self): return self.driver.current_url @current_url.setter def current_url(self, val): pass
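A short usage sketch for the class above; keywordtool.io markup changes over time, so the selectors in the class (and these calls) are best-effort rather than guaranteed:

tool = KeywordTool(source='google', timeout=10)
suggestions = tool.get_keywords('selenium phantomjs', source='youtube')
for kw in suggestions:
    print(kw)
tool.driver.quit()   # KeywordTool never closes its PhantomJS instance itself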
def is_logged_in(browser: PhantomJS):
    browser.get('https://tiki.vn/sales/order/history/')
    full_name = browser.find_element_by_css_selector('.profiles > h6:nth-child(3)')
    if full_name.text:
        logger.info("You are logged in as: {}".format(full_name.text))
        return True
    else:
        return False
def main(): global HEAD if len(sys.argv) > 1: try: HEAD = int(sys.argv[1]) except: HEAD = 10 # test mirror list mirror_list = read_mirrors() for i in mirror_list: try: cururl = i print("Testing:",i) res = request.urlopen(i) except: print("Testing on",i,"failed") continue try: update_mirrors(cururl) break; except: continue; try: res except: raise Warning('All mirrors unavailable!') print('Available mirror:',cururl) # get vpn table countries = dict() dr = PhantomJS() dr.get(cururl) page = Selector(text=dr.page_source)\ .xpath('.//td[@id="vpngate_inner_contents_td"]/' 'table[@id="vg_hosts_table_id"]//tr') if HEAD < len(page): page = page[:HEAD] print('Pagelen:',len(page)) for vpn in page: if len(vpn.xpath('./td[@class="vg_table_header"]')) > 0: continue row = vpn.xpath('./td') country = row[0].xpath('./text()').extract_first() country = '_'.join(country.split(' ')) ovpn = row[6].xpath('./a/@href').extract_first() if ovpn: if country in countries: countries[country] += 1 get_ovpn(url=cururl+ovpn, save_to=country+'/'+str(countries[country])) else: countries[country] = 0 if not os.path.exists(country): os.mkdir(country) get_ovpn(url=cururl+ovpn, save_to=country+'/'+str(countries[country])) dr.quit()
def getHtmlSource(url, time=10):
    driver = PhantomJS(service_args=[
        '--ignore-ssl-errors=true',
        '--ssl-protocol=any',
        '--web-security=false'
    ])
    driver.get(url)
    # NOTE: constructing WebDriverWait alone does nothing; it only waits once
    # .until(<expected condition>) is called (see the sketch below).
    WebDriverWait(driver, timeout=time)
    source = driver.page_source
    # driver.save_screenshot('a.png')
    driver.quit()
    return source
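A minimal sketch of how that wait is normally used; the "content" id and the timeout are illustrative assumptions, not part of the original:

from selenium.webdriver import PhantomJS
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def get_html_when_ready(url, timeout=10, ready_selector=(By.ID, 'content')):
    driver = PhantomJS()
    try:
        driver.get(url)
        # Block until the chosen element is present, or raise TimeoutException.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(ready_selector))
        return driver.page_source
    finally:
        driver.quit()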
class Crawler: def __init__(self, timeout=20, phantomjs_cfg_file='python-utils/config/phantomjs_cfg.json', use_cfg_file=False, proxy_pool_server='http://127.0.0.1:15110'): self.timeout = timeout if use_cfg_file: phantomjs_service_args = ['--config={}'.format(phantomjs_cfg_file)] else: _, proxy_type, proxy, proxy_auth = get_proxy(proxy_pool_server) phantomjs_service_args = [ '--proxy-type={}'.format(proxy_type), '--proxy={}'.format(proxy), '--proxy-auth={}'.format(proxy_auth), ] self.driver = PhantomJS( desired_capabilities=self.new_desired_capabilities(), service_args=phantomjs_service_args) self.check_client_info() def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() def close(self): self.driver.quit() @contextmanager def wait_for_page_load(self, old_element): yield WebDriverWait(self.driver, self.timeout).until(EC.staleness_of(old_element)) def new_desired_capabilities(self, user_agent=default_ua): desired_capabilities = DesiredCapabilities.PHANTOMJS.copy() if not user_agent: user_agent = ua.random desired_capabilities["phantomjs.page.settings.userAgent"] = user_agent return desired_capabilities def check_client_info(self): url='http://www.whoishostingthis.com/tools/user-agent/' self.driver.get(url) ip_addr = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[2]/span').text.strip() user_agent = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[1]').text.strip() logger.info('IP: {}, User-Agent: {}'.format(ip_addr, user_agent)) if self.wrong_ip(ip_addr): logger.error('Proxy not set correctly!') sys.exit(-1) def wrong_ip(self, ip_addr): if ip_addr.startswith('166.111.') or ip_addr.startswith('59.66.') or ip_addr.startswith('101.5.') or ip_addr.startswith('101.6.'): return True else: return False
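Because the class defines __enter__/__exit__, it is meant to be used as a context manager; a small usage sketch, assuming the proxy-pool / config prerequisites in __init__ are satisfied (the URL and XPath here are placeholders):

with Crawler(timeout=30) as crawler:
    driver = crawler.driver
    driver.get('http://example.com/')
    link = driver.find_element_by_xpath('//a')
    # wait_for_page_load() waits for the old element to go stale after the body runs
    with crawler.wait_for_page_load(link):
        link.click()
    print(driver.current_url)
# on exit, Crawler.close() quits the PhantomJS instance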
def main(argv=sys.argv[1:]): parser = argparse.ArgumentParser() parser.add_argument('--url', default='http://127.0.0.1:8000/static/index.html') args = parser.parse_args(argv) url = args.url browser = WebDriver() browser.get(url) tags = browser.find_elements_by_css_selector('li') for tag in tags: print(tag.text) browser.close()
def catalog_url(url='http://www.meitun.com/'):
    # The catalog is loaded via AJAX, so use PhantomJS to render it.
    driver = PhantomJS()
    driver.get(url)
    driver.maximize_window()
    mov_ele = driver.find_element_by_css_selector('.nav>ul>li:nth-child(1)')
    # Hover over the lazily loaded menu element to trigger it.
    ActionChains(driver).move_to_element(mov_ele).perform()
    time.sleep(3)
    response = driver.page_source
    driver.quit()
    # Parse the page source with pyquery, which is faster here.
    d = pq(response)
    return map(lambda x: 'http:' + pq(x).attr('href'), d.find('.cg-pdts a'))
def onegoogolePR(self, url):
    '''Return the Google PageRank for a single URL.'''
    prUrl = 'http://pr.chinaz.com'  # Google PR lookup service
    driver = PhantomJS()
    driver.get(prUrl)
    driver.find_element_by_id('PRAddress').send_keys(url)
    driver.find_element_by_class_name('search-write-btn').click()
    try:
        imgsrc = driver.find_element_by_css_selector('span#pr>img').get_attribute('src')
        pr = search(r'\d', imgsrc).group()
    except:
        pr = '暂无数据'  # "no data available"
    driver.quit()
    return pr
class AdvertisementAdvancedViewTests(LiveServerTestCase): def setUp(self): self.driver = PhantomJS() self.user = User.objects.create_user('admin', '*****@*****.**', 'pass') self.user.save() self.provider = Provider( name='provider', user=self.user, ) self.provider.save() self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider) def tearDown(self): self.driver.quit() def open(self, url): self.driver.get("%s%s" % (self.live_server_url, url)) def test_side_ad_display(self): """ Test that the side ads display properly """ self.open(reverse('advertisements.views.side_ads')) self.assertEqual(len(self.driver.find_elements_by_xpath("//a")), 4) self.driver.find_element_by_xpath("//a[1]/img") self.driver.find_element_by_xpath("//a[2]/img") self.driver.find_element_by_xpath("//a[3]/img") self.driver.find_element_by_xpath("//a[4]/img") self.assertNotEqual(self.driver.find_element_by_xpath("//a[1]").get_attribute("href"), '') self.assertNotEqual(self.driver.find_element_by_xpath("//a[2]").get_attribute("href"), '') self.assertNotEqual(self.driver.find_element_by_xpath("//a[3]").get_attribute("href"), '') self.assertNotEqual(self.driver.find_element_by_xpath("//a[4]").get_attribute("href"), '') def test_top_ad_display(self): """ Test that the top ad displays properly """ self.open(reverse('advertisements.views.top_ad')) self.assertEqual(len(self.driver.find_elements_by_xpath("//a")), 1) self.driver.find_element_by_xpath("//a/img") self.assertNotEqual(self.driver.find_element_by_xpath("//a").get_attribute("href"), '')
class Client: def __init__(self, ig_id): self.b = PhantomJS() self.ig_id = ig_id self.b.get('https://instagram.com/%s' % ig_id) def close(self): self.b.close() def get_media(self) -> list: js = self.b.execute_script('return window._sharedData;') ed = js['entry_data'] pp = ed['PostPage'][0] g = pp['graphql'] sc = g['shortcode_media'] if sc['__typename'] == 'GraphSidecar': edges = sc['edge_sidecar_to_children']['edges'] medias = list( map( lambda x: { 'id': x['node']['id'], 'url': x['node']['display_url'], 'caption': x['node']['accessibility_caption'] }, edges)) elif sc['__typename'] == 'GraphImage': medias = [{ 'id': sc['id'], 'url': sc['display_url'], 'caption': sc['accessibility_caption'] }] return list( filter( lambda x: 'person' in x['caption'] or 'people' in x['caption'], medias)) def get_user(self) -> dict: js = self.b.execute_script('return window._sharedData;') ed = js['entry_data'] pp = ed['ProfilePage'][0] g = pp['graphql'] return g['user'] def get_posts(self) -> set: ps = self.b.find_elements_by_css_selector('a[href^="/p/"]') return set(map(lambda x: x.get_attribute('href'), ps)) def scroll(self): self.b.execute_script('window.scroll(0, document.body.scrollHeight);')
def run_get_logic(driver: PhantomJS, command_id, token): if not token: return {"code": 103, "public": "Session troubles!"} driver.add_cookie({ 'name': 'token', 'value': token, 'domain': "." + command_id.split(":")[0], 'path': '/' }) driver.get("http://{}/cabinet".format(command_id)) try: flag_there = driver.find_element_by_xpath('//html//body//div//h5//i') flag_container = flag_there.get_attribute('innerHTML') return flag_container except NoSuchElementException as e: return "error_no_flag_in_cabinet"
def start(n, comic_url): urllists.append(comic_url) driver = PhantomJS() driver.get(comic_url) get_images_url(n, driver, comic_url) while True: try: driver.find_element_by_xpath( "//li[@id='next_item']/a[@id='mainControlNext']").click() comic_url = driver.current_url if comic_url not in urllists: urllists.append(comic_url) get_images_url(n, driver, comic_url) driver.find_element_by_xpath( "//li[@id='next_item']/a[@id='mainControlNext']").click() # print n + '\t' + comic_url except: print 'All done!' break
def check_agree(link, soup): # Agree if asked to (click on accept) if soup.find('input', {'id': 'ctl00_mainContentArea_disclaimerContent_yesButton'}): print("Agreeing the terms of use - please wait...") driver = PhantomJS('.\phantomjs.exe' if platform. startswith('win32') else './phantomjs') driver.get(link) driver.find_element_by_id( 'ctl00_mainContentArea_disclaimerContent_yesButton').click() for cookie in driver.get_cookies(): s.cookies.set(cookie['name'], cookie['value']) driver.quit() resp_inner = s.get(link) soup = Soup(resp_inner.text, features="lxml") print("Done, now let's get back to the scraping process.") return soup
def on_start_again(self, url):
    driver = PhantomJS()
    driver.get(url)
    time.sleep(2)
    driver.maximize_window()
    t = driver.find_element_by_css_selector('.page-txt').text
    res_t = []
    if t:
        # get the page count; the number of page turns needed is count - 1
        t = int(t.split('/')[1][:-1]) - 1
        while t:
            t -= 1
            move_ele = driver.find_element_by_css_selector('#next')
            # .perform() is required, otherwise the queued actions never run
            ActionChains(driver).move_to_element(move_ele).click().perform()
            time.sleep(1)
            res_t.append(driver.page_source)
    driver.quit()
    for item in res_t:
        self.step_first(item)
class Premiumgeneratorlink(object): def __init__(self, url): self.url = url self.browser = PhantomJS() def get_link(self): try: self.browser.get('http://premiumgeneratorlink.com/') self.browser.find_element_by_name('link').send_keys(self.url) self.browser.find_element_by_xpath('//a[@class="input"]').click() wdw = WebDriverWait(self.browser, 10) wdw.until(EC.element_to_be_clickable((By.ID, 'check'))).click() wdw.until(EC.element_to_be_clickable((By.ID, 'generate'))).click() link = wdw.until(EC.visibility_of_element_located((By.XPATH, '//form[@class="center"]'))).get_attribute('action') except (WebDriverException, NoSuchElementException, TimeoutException): return False finally: self.browser.quit() return link
def run_get_logic(driver: PhantomJS, comand_id, post, flag, cookies): if 'sessions' not in cookies: return {"code": MUMBLE, "public": "Session troubles!"} driver.add_cookie({ 'name': 'sessions', 'value': cookies['sessions'], 'domain': "." + comand_id.split(":")[0], 'path': '/' }) driver.get("http://{}/{}".format(comand_id, post)) try: flag_there = driver.find_element_by_xpath('//li/a[@href="#"]') flag_container = flag_there.get_attribute('innerHTML') if flag in flag_container: return {"code": OK} else: return {"code": CORRUPT, "public": "Can't find my private data!"} except NoSuchElementException: return {"code": CORRUPT, "public": "Can't find my private data!"}
def get_item(browser: PhantomJS, item: dict, check_inverter): price_expect = get_price(item['price']) max_retry = 5 retry = 0 while retry < max_retry: retry += 1 browser.get(item.get('url')) item_title = browser.find_element_by_css_selector('#product-name') item_name = item_title.text item_price = browser.find_element_by_css_selector('#span-price') logger.info("{} -> {}".format(item_price.text, item_name)) price_seller = get_price(item_price.text) screen_shot(browser=browser, file_name='buy.png', item_name=item_name) if price_seller <= price_expect: browser.find_element_by_css_selector('#\#mainAddToCart').click() return item_name else: logger.info("Retry : {}. {}".format(retry, item_title.text)) time.sleep(check_inverter)
class Leecherus(object): def __init__(self, url): self.url = url self.browser = PhantomJS() def get_link(self): try: self.browser.get('http://leecher.us') wdw = WebDriverWait(self.browser, 10) wdw.until(EC.visibility_of_element_located((By.NAME, 'link'))).send_keys(self.url) wdw.until(EC.element_to_be_clickable((By.XPATH, '//button[@class="subscribe"]'))).click() wdw.until(EC.element_to_be_clickable((By.XPATH, '//input[@class="subscribe"]'))).click() self.browser.switch_to_window(self.browser.window_handles[1]) onclick = wdw.until(EC.element_to_be_clickable((By.ID, 'get_link'))).get_attribute('onclick') except (WebDriverException, NoSuchElementException, TimeoutException, IndexError): return False finally: self.browser.quit() m = re.search("'(http://[^']+)'", onclick) return m.group(1) if m else False
def post(self):
    id = request.values['page']
    page = Page.objects.get_or_404(id=id)
    # html = requests.get(page.baseurl).text
    screenshot = None
    try:
        phantom = PhantomJS(desired_capabilities={'acceptSslCerts': True},
                            service_args=['--web-security=false',
                                          '--ssl-protocol=any',
                                          '--ignore-ssl-errors=true'],
                            port=8888)
        phantom.set_window_size(1024, 768)
        phantom.get(page.baseurl)
        html = phantom.page_source
        screenshot = phantom.get_screenshot_as_png()
        phantom.close()
    except Exception as ex:
        html = "error when snapping your page ... %s" % ex
    # Save the snapshot once and attach it to the page.
    snap = Snap(html, datetime.datetime.now(), screenshot).save()
    page.update(push__snaps=snap)
    return jsonify({'id': "%s" % snap.id})
def render(gist_id, commit): block_url = 'http://bl.ocks.org/' + gist_id d3_block_rec = {'gist_id': gist_id} try: driver = PhantomJS() driver.get(block_url) time.sleep(RENDER_DELAY) # let it render fullpage_im = Image.open(BytesIO(driver.get_screenshot_as_png())) fimb = BytesIO() fullpage_im.save(fimb, 'png') d3_block_rec['fullpage_base64'] = base64.b64encode(fimb.getvalue()) d3_block_rec['block_url'] = driver.current_url except Exception as e: # we got nothing with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg: d3_block_rec['error'] = str(e) pg.insert('d3_block', values=d3_block_rec) exit(10) try: f = driver.find_element_by_xpath('//iframe') x, y = int(f.location['x']), int(f.location['y']) w, h = x + int(f.size['width']), y + int(f.size['height']) block_im = fullpage_im.crop((x, y, w, h)) bimb = BytesIO() block_im.save(bimb, 'png') d3_block_rec['block_base64'] = base64.b64encode(bimb.getvalue()) d3_block_rec['block_size'] = list(block_im.size) except Exception as e: # at least we got the fullpage im, save it with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg: d3_block_rec['error'] = str(e) pg.insert('d3_block', values=d3_block_rec) exit(11) # all good, save everything with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg: pg.insert('d3_block', values=d3_block_rec)
def run_get_logic(driver: PhantomJS, team_ip, user, password): session_cookie = beacons_api.sign_in(team_ip, user, password) print('cookie: ' + session_cookie) beacons = beacons_api.get_all_user_beacons(team_ip, session_cookie) if not beacons: print('no beacons') return {"code": 103} print('beacons: ' + str(beacons)) driver.get(f"http://{team_ip}:{SERVICE_PORT}/") driver.add_cookie({ 'name': 'session', 'value': session_cookie, 'domain': "." + team_ip, 'path': '/' }) for beacon_id in beacons: driver.get(f'http://{team_ip}:{SERVICE_PORT}/Beacon/{beacon_id}') print(driver.current_url) # print(driver.page_source) if beacon_id in driver.current_url: print('fine') return {"code": 101} return {'code': 110}
def get_video_src(self):
    # room URL check
    o = urlparse(self.room_url)
    rule_key = o.netloc
    if rule_key in self.rules:
        cap = webdriver.DesiredCapabilities.PHANTOMJS
        cap["phantomjs.page.settings.resourceTimeout"] = 1000
        cap["phantomjs.page.settings.loadImages"] = True
        cap["phantomjs.page.settings.disk-cache"] = True
        cap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0(iPhone;CPU iPhone OS 9_1 like Mac OSX) AppleWebKit / 601.1"
            ".46(KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1")
        # the browser to use
        driver = PhantomJS(self.driverPath["PhantomJS"], desired_capabilities=cap)
        # driver = webdriver.Firefox()
        driver.implicitly_wait(10)
        my_rule = self.rules[rule_key]
        url_prefix = my_rule["url_prefix"]
        driver.get("%s%s" % (url_prefix, o.path))
        try:
            result_video = driver.find_element_by_tag_name('video').get_attribute('src')
            driver.close()
            return result_video
        except:
            # return "未能获得该直播间地址直播流地址"  ("could not get the stream URL for this room")
            driver.close()
            return "主播不在了"  # "the streamer is gone"
            # return " "
    else:
        return "不支持的网站(not support url)"  # unsupported site
class CNStock(SentimentCrawler): def __init__(self): super().__init__(init=False) self.driver = PhantomJS() self.driver.maximize_window() self.wait = WebDriverWait(self.driver, 15) self.url = 'http://www.cnstock.com/' self.name = '中国证券网' def crawl_main_page(self, keyword): self.driver.set_page_load_timeout(10) try: self.driver.get(self.url) except TimeoutException: self.driver.execute_script('window.stop();') try: self.wait.until( ec.presence_of_element_located((By.ID, 'nav_keywords'))) except: CustomLogging.log_to_file('中国证券网打开失败', LogType.ERROR) self.driver.find_element_by_id('nav_keywords').clear() self.driver.find_element_by_id('nav_keywords').send_keys(keyword + Keys.ENTER) return self.crawl_search_results() def crawl_search_results(self): search_results = [] self.driver.switch_to.window(self.driver.window_handles[-1]) self.driver.maximize_window() exit_flag = 0 while True: try: self.wait.until( ec.presence_of_element_located( (By.CLASS_NAME, 'result-cont'))) except TimeoutException: CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR) break try: result_articles = self.driver.find_elements_by_class_name( 'result-article') for each_article in result_articles: item = Entity() publish_date = each_article.find_element_by_class_name( 'g').text item.publish_date = re.search( re.compile( '[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d' ), publish_date).group() if not in_date_range( conv_pub_date(item.publish_date, 'cnstock'), self.year_range): exit_flag = 1 # 跳出for循环 break item.short_description = each_article.find_element_by_class_name( 'des').text item.title = each_article.find_element_by_tag_name( 'a').text if self.keyword not in item.short_description and self.keyword not in item.title: continue if item.title in self.titles: continue else: self.titles.append(item.title) item.url = each_article.find_element_by_tag_name( 'a').get_attribute('href') threading.Thread(target=super().download_and_save_item, args=(item, )).start() if exit_flag == 1: break except NoSuchElementException: CustomLogging.log_to_file('没有搜索结果', LogType.INFO) break try: next_page = self.driver.find_element_by_xpath( '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]' ) self.driver.get(next_page.get_attribute('href')) # next_page.click() except NoSuchElementException: break return search_results def parse_html(self, url, html): bs = BeautifulSoup(html, 'lxml') try: full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text return full_content except Exception: CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url), LogType.ERROR) pass
def get_tickers(stock_list): list = pd.read_csv(stock_list) try: tickers = [c for c in list[list.columns[1]]] except Exception as e: print(e) sys.exit(1) #List for the stocks that had some error being collected error_stocks = [] # Creates directory for the stock data CSV files if not os.path.exists('stock_dfs'): os.makedirs('stock_dfs') global source # When data collecting will start and end for the Dates global start global end print(f'>>>Getting Stock Data from {source} from {end}') #Iterating through each ticker for ticker in tqdm.tqdm(tickers): # Reading data on the stock. If grabbing todays data failes, tries to grab data from yesterday try: df = web.DataReader(ticker, source, start, end) except: #Changing end date to yesterday end = (dt.datetime.now() - dt.timedelta(1)).strftime('%Y-%m-%d') df = web.DataReader(ticker, source, start, end) #High/Low Open/Close percentage df['HL_pct'] = ((df['High'] - df['Low']) / df['Low']) * 100 df['OC_pct'] = ((df['Close'] - df['Open']) / df['Open']) * 100 #Boolinger Band df['Middle Boolinger'] = df['Adj Close'].rolling(20).mean() df['Sup_Boolinger'] = df['Middle Boolinger'] + ( 2 * df['Adj Close'].rolling(20).std()) df['Inf_Boolinger'] = df['Middle Boolinger'] - ( 2 * df['Adj Close'].rolling(20).std()) #Exponential Moving Mean df['Exp20_Close'] = df['Adj Close'].ewm(span=20, adjust=False).mean() #Expantion/Contraction of stock price df['Deviation_band'] = df['Adj Close'].rolling(20).std() #RSI change = df['Adj Close'].diff(1) gain = change.mask(change < 0, 0) loss = change.mask(change > 0, 0) avg_gain = gain.ewm(min_periods=rsi_period, com=rsi_period - 1).mean() avg_loss = loss.ewm(min_periods=rsi_period, com=rsi_period - 1).mean() rs = abs(avg_gain / avg_loss) df['RSI'] = 100 - (100 / (1 + rs)) ''' Now the code will do a webscrape on some pages on yahoo finance to get more details and info. It will do this by table reading or span-string reading since some pages don't have tables. With table reading it's straight up but with span reading we need to get the reactID of each line we want. And for that it's kind of hardcoded, I read through all the span lines and wrote down the useful ones. 
''' #Reading into page resp = requests.get( f'https://finance.yahoo.com/quote/{ticker}/financials') #BeautifulSoup scrapes the page in TXT form soup = bs.BeautifulSoup(resp.text, 'lxml') #Number of span lines we got length = int(np.array(soup.find_all('span')).shape[0]) #All lines with the span class, which has the info we want lines = np.array(soup.find_all('span')) #List to store the span lines that have the reactID codes we want spans = [] #Dates we want to find find_dates = ['12/30/2019', '12/30/2018', '12/30/2017', '12/30/2016'] #List for the dates we actually find dates = [] #Iterating through the lines and grabbing all lines from the span class for line in range(0, length): spans.append(BeautifulSoup(str(lines[line]), features='lxml').span) #Iterating through each date we want to find in the website for date in find_dates: #Iterating through each span-class line for line in range(0, length): #If the text line and date match then put the date in the found dates list if spans[line].string == date: dates.append(spans[line].string) break #Changes date format for indexing with the webreader dataframe for index, date in enumerate(dates): #If any string dpesn't match the format than it's not a date and will be removed try: dates[index] = dt.datetime.strptime( date, "%m/%d/%Y").strftime("%Y-%m-%d") except: #dates.remove will raise exception when there is no more of such content in the list, stopping the loop removed = False while (removed == False): try: dates.remove(dates[index]) except: removed = True #Adding 3 days to the dates, because most stocks don't opperate on the last day of the year. Which is #the date time for the data to appear on the website. for index, date in enumerate(dates): dates[index] = (dt.datetime.strptime(date, '%Y-%m-%d') + dt.timedelta(3)).strftime('%Y-%m-%d') #Info we want to get from the webiste interesting_lines = [ 'Total Revenue', 'Cost of Revenue', 'Gross Profit', 'Selling General and Administrative', 'Total Operating Expenses', 'Operating Income or Loss', 'Interest Expense', 'Total Other Income/Expenses Net', 'Income Before Tax', 'Income Tax Expense', 'Income from Continuing Operations', 'Net Income', 'Net Income available to common shareholders', 'EBITDA' ] #List for the info we actually find on the website infos = [] #List for the ReactIDs of the lines that have the data about the infos above number_ids = [] #Column renaming column_names = [ 'Total Revenue (TTM)', 'Cost of Revenue (TTM)', 'Gross Profit (TTM)', 'Selling General and Administrative Expenses (TTM)', 'Total Operating Expenses (TTM)', 'Operating Income or Loss (TTM)', 'Interest Expense (TTM)', 'Total Other Income/Expenses Net', 'Income Before Tax (TTM)', 'Income Tax Expense (TTM)', 'Income from Coninuing Operations (TTM)', 'Net Income (TTM)', 'Net Income available to Shareholders (TTM)', 'EBITDA (TTM)' ] #Iterating through the informations we want for index, info in enumerate(interesting_lines): #Boolean for if the information was found check = False #Iterating through the span lines for line in range(0, length): #If line contains the information we want, appends it to the found infos list. 
if spans[line].string == info: infos.append(spans[line].string) #Appends the info's reactID +5, one line below, where the numbers and data are number_ids.append( str(int(spans[line]['data-reactid']) + 5)) check = True pass #In case the information isn't found, the respective column name is changed to a NAN, to be removed later if check == False: column_names[index] = np.nan #Removing NANs from column name list column_names = [c for c in column_names if str(c) != 'nan'] #Creating the columns for the information for column in column_names: df[f'{column}'] = np.nan #Iterating through dates, with indexing for index, date in enumerate(dates): #Iterating through new columns, with indexing for column, string in enumerate(column_names): #Iterating through span lines for line in range(0, length): #Fetching data for the respective information column in order if spans[line]['data-reactid'] == number_ids[column]: #Locates the date in dataframe index, formats the string of the data, turns it into a Integer and #puts the data in it's correct place in time. try: df[f'{string}'].loc[dates[index]] = int( (spans[line].string).replace(',', '')) except Exception as e: print(e) print( f'Error formating/alocating string to int for stock {ticker}' ) #Appending to stocks with errors list error_stocks.append(ticker) continue #Adding 2 to the IDs for each iteration so we get the lines of previous dates for the information number_ids = [int(c) for c in number_ids] number_ids = [c + 2 for c in number_ids] number_ids = [str(c) for c in number_ids] #Page URL that we will pass to PhantomJS url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics' #Initiating PhantomJS driver = PhantomJS(executable_path=r'phantomjs.exe') #Opening URL with PhantomJS to fully load the page driver.get(url) #Returning page source after all the JavaScript codes have been loaded resp = driver.page_source #Closing PhantomJS driver.quit() #List of tables that Pandas found in the web page dfs = pd.read_html(resp) #Dataframe to put all the tables in just one key_stats = pd.DataFrame() #Iterating through the tables for dframe in dfs: #If dataframe is empty, passes the first table if key_stats.empty: key_stats = dframe #If it already has a table, appends the new ones else: key_stats = key_stats.append(dframe) #Fixing dataframe index, with numbers from 0 to length of dataframe key_stats.index = [c for c in range(0, key_stats.shape[0])] #There´s some info that we don´t have interest so we drop what we don´t need stats = key_stats.loc[:8] #Removing columns 0 and 1 stats = stats.drop([0, 1], axis=1) #Passing the information names as the dataframe index stats.index = [c for c in stats['Unnamed: 0'].values] #Removing the column with information names, since it´s all in the index stats = stats.drop(['Unnamed: 0'], axis=1) #Transposing the dataframe, so that the Dates become the index and the information names become the column stats = stats.transpose() #Criating the new columns in the main dataframe for column in stats.columns: df[f'{column}'] = np.nan #Putting all the dates in a list dates = [c for c in stats.index] #Iterating through the dates for index, date in enumerate(dates): #Changing date format try: dates[index] = dt.datetime.strptime( date, "%m/%d/%Y").strftime("%Y-%m-%d") except: #One of the dates actually has more things than the date so we remove all that date = date.replace('As of Date: ', '') date = date.replace('Current', '') dates[index] = dt.datetime.strptime( date, "%m/%d/%Y").strftime("%Y-%m-%d") #Adding 3 days because stocks 
don´t opperate in the last day of the year for index, date in enumerate(dates): dates[index] = (dt.datetime.strptime(date, '%Y-%m-%d') + dt.timedelta(3)).strftime('%Y-%m-%d') #Passing changed dates back into the dataframe´s index stats.index = dates #Iterating through dates again for date in stats.index: #Iterating through the new columns for column in stats.columns: #Locating the dates and columns in the main dataframe and putting the respetive data in it´s place try: df[f'{column}'].loc[date] = stats[f'{column}'].loc[date] #If any errr occurs in this process, shows the error for the respective stock and adds it to the #stocks-with-error list except Exception as e: print(e) print( f'Error formating/alocating string to int for stock {ticker}' ) #Appending to stocks with errors list error_stocks.append(ticker) ''' Since we only have info year by year and the .loc funtion only puts the data in the specific index, we need to fill the NANs with the previous data that isn't a NAN (ffill method). This way, from each data alocated, all future lines will have this exact data, until a new data (the most recent) appears, and the process repeats. ''' df.fillna(method='ffill', inplace=True) # Saving csv file df.to_csv('stock_dfs/{}.csv'.format(ticker)) #Showing any stocks with errors if there are any if error_stocks != []: print('\n ------ Inspect Errors ------- \n') print([c for c in error_stocks])
from selenium.webdriver import PhantomJS

driver = PhantomJS(
    executable_path=r'E:\Documents\Apps\phantomjs-2.1.1-windows\bin\phantomjs.exe')
url = 'http://cxwh.kexing100.com:82/?app_act=detail&id=328&from=groupmessage'
driver.get(url)
while True:
    driver.refresh()
    print(driver.find_element_by_xpath("//div[@class='xinfo']").text)
    # driver.execute_script("return localStorage.setItem('toupiao','0')")
    driver.execute_script("return localStorage.removeItem('toupiao')")
    driver.delete_all_cookies()
    driver.refresh()
    vote = driver.find_element_by_xpath("//span/input[@class='btn1']").click()
    # break
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount): retail_invoice_url = RETAIL_INVOICE_URL[retail] driver = PhantomJS() driver.get(retail_invoice_url) # 1 Set doc_type 'select' try: select_doc_type = Select(driver.find_element_by_name('txtTipoDte')) value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value'] select_doc_type.select_by_value(value) # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name'] # select_doc_type.select_by_visible_text(name) except Exception: print 'ERROR: set doc_type select as Boleta' driver.save_screenshot('screen.png') return '', '' time.sleep(5) # 2 Get recaptcha img url try: recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image') recaptcha_img_url = recaptcha_img.get_attribute('src') except Exception: print 'ERROR: get recaptcha image url' driver.save_screenshot('screen.png') return '', '' # 3 Solve recaptcha v = VisionApi() recaptcha_value = v.detect_text_from_url(recaptcha_img_url) if recaptcha_value is None: print 'ERROR: solving recaptcha image' driver.save_screenshot('screen.png') return '', '' # 4 Fill form script = u""" document.getElementsByName('txtFolio')[0].value = '{invoice_id}'; document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}'; document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}'; document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}'; """.format( invoice_id=invoice_id, invoice_date=invoice_date, invoice_amount=invoice_amount, recaptcha_value=recaptcha_value, ) driver.execute_script(script) # 5 Submit form try: driver.find_element_by_name('frmDatos').submit() except Exception: print 'ERROR: submitting form' driver.save_screenshot('screen.png') return '', '' # 6 Get url files try: xml_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]') pdf_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]') xml_url = xml_a_tag.get_attribute('href') pdf_url = pdf_a_tag.get_attribute('href') except Exception: print 'ERROR: getting url files' driver.save_screenshot('screen.png') return '', '' # 8 Delete driver session driver.close() driver.quit() return xml_url, pdf_url
'http://www.greatschools.org/north-carolina/morrisville/3360-Cedar-Fork-Elementary/', ] indicators = ['EquityRaceEthnicity', 'EquityLowIncome', 'EquityDisabilities'] wd = PhantomJS() output_cols = [ 'school', 'url', 'students_per_grade', 'teachers_to_student', 'counselors_to_student', 'reading', 'math', 'science' ] output_df = DataFrame(columns=output_cols) output_ind = 0 for url in urls: t1 = time() wd.get(url) school_name = wd.title.split(' -')[0] print school_name, school_info = wd.find_elements_by_class_name('school-info__item') for s in school_info: inner_html = sub(r'<.*?>|\n', ' ', s.get_attribute('innerHTML')) inner_html = sub(r'\s+', ' ', inner_html).strip() if 'grades' in inner_html.lower(): min_grade, max_grade = inner_html.split(' ')[-1].split('-') if min_grade.lower() == 'pk': min_grade = -1 elif min_grade.lower() == 'k': min_grade = 0 n_grades = int(max_grade) - int(min_grade) + 1 elif 'students' in inner_html.lower():
class PagesCrawler(BaseSpider): name = 'pages' link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[]) ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS]) def __init__(self, **kw): args = DEFAULT_INPUT.copy() args.update(kw) self.args = args self.start_urls = to_list(args['start_urls']) self.maxdepth = int(args['maxdepth']) self.follow_prefixes = to_list(args['follow_prefixes']) self.nofollow_prefixes = to_list(args['nofollow_prefixes']) self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']] self.resolved_links = {} self.user_agent = args['user_agent'] self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false" if self.phantom: self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT'])) self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT'])) self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT'])) self.errors = 0 dispatcher.connect(self.closed, spider_closed) dispatcher.connect(self.crashed, spider_error) def start_requests(self): self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO) self.log("ARGUMENTS : "+str(self.args), log.INFO) if self.phantom: self.init_phantom() for url in self.start_urls: yield self._request(url) def init_phantom(self): self.prefixfiles = os.path.join( scrapyd_config().get('logs_dir'), HYPHE_PROJECT, self.name, self.crawler.settings['JOBID'] ) self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO) phantom_args = [] if PROXY and not PROXY.startswith(':'): phantom_args.append('--proxy=%s' % PROXY) phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles) phantom_args.append('--ignore-ssl-errors=true') phantom_args.append('--load-images=false') self.capabilities = dict(DesiredCapabilities.PHANTOMJS) self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent self.capabilities['takesScreenshot'] = False self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False self.phantom = PhantomJS( executable_path=PHANTOM['PATH'], service_args=phantom_args, desired_capabilities=self.capabilities, service_log_path="%s-phantomjs.log" % self.prefixfiles ) self.phantom.implicitly_wait(10) self.phantom.set_page_load_timeout(60) self.phantom.set_script_timeout(self.ph_timeout + 15) def crashed(self, spider): self.errors += 1 self.closed("CRASH") def closed(self, reason): if self.errors: self.log("%s error%s encountered during the crawl." 
% (self.errors, 's' if self.errors > 1 else ''), log.ERROR) if self.phantom: self.phantom.quit() if not self.errors: for f in ["phantomjs-cookie.txt", "phantomjs.log"]: fi = "%s-%s" % (self.prefixfiles, f) if os.path.exists(fi) and not self.errors: os.remove(fi) def handle_response(self, response): lru = url_to_lru_clean(response.url) if self.phantom: self.phantom.get(response.url) # Collect whole DOM of the webpage including embedded iframes with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js: get_bod_w_iframes = js.read() bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Try to scroll and unfold page self.log("Start PhantomJS scrolling and unfolding", log.INFO) with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js: try: signal.signal(signal.SIGALRM, timeout_alarm) signal.alarm(self.ph_timeout + 30) timedout = self.phantom.execute_async_script( js.read(), self.ph_timeout, self.ph_idle_timeout, self.ph_ajax_timeout) signal.alarm(0) if timedout: raise SeleniumTimeout self.log("Scrolling/Unfolding finished", log.INFO) except SeleniumTimeout: self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING) self.errors += 1 except WebDriverException as e: err = json.loads(e.msg)['errorMessage'] self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR) self.errors += 1 except Exception as e: self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR) self.errors += 1 return self._make_raw_page(response, lru) bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses if response.status == 200 and not isinstance(response, HtmlResponse): try: flags = response.flags if "partial" in flags: flags.remove("partial") flags.append("cleaned") response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request) self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction") except: pass if 300 < response.status < 400 or isinstance(response, HtmlResponse): return self.parse_html(response, lru) else: return self._make_raw_page(response, lru) def handle_error(self, failure, response=None): if response: p = self._make_raw_page(response, failure.request.url) p['error'] = error_name(failure.value) return p elif not "://www" in failure.request.url: return self._request(failure.request.url.replace('://', '://www.')) self.log("ERROR : %s" % failure.getErrorMessage(), log.ERROR) self.errors += 1 return def parse_html(self, response, lru): lrulinks = [] # handle redirects realdepth = response.meta['depth'] if 300 < response.status < 400: redir_url = response.headers['Location'] if redir_url.startswith('/'): redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url) elif redir_url.startswith('./') or not redir_url.startswith('http'): redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:]) links = [{'url': redir_url}] response.meta['depth'] -= 1 else: try: links = self.link_extractor.extract_links(response) except Exception as e: self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR) links = [] self.errors += 1 for link in links: try: url = link.url except AttributeError: url = link['url'] try: lrulink = url_to_lru_clean(url) except ValueError, e: self.log("Error converting 
URL %s to LRU: %s" % (url, e), log.ERROR) continue lrulinks.append((url, lrulink)) if self._should_follow(response.meta['depth'], lru, lrulink) and \ not url_has_any_extension(url, self.ignored_exts): yield self._request(url) response.meta['depth'] = realdepth yield self._make_html_page(response, lru, lrulinks)
def get_ratio_data(): import re import time from bs4 import BeautifulSoup from app.models import Company, Indicators from app.utils import cash_to_float, depercentize # Dict item with list: element attribute, attribute value to look for, optional transform function indicators = {'roe': { 'attribute': 'data-reactid', 'value': re.compile(".*RETURN_ON_EQUITY\.1$"), 'transform': depercentize, }, 'fcf': { 'attribute': 'data-reactid', 'value': re.compile(".*LEVERED_FREE_CASH_FLOW\.1$"), 'transform': cash_to_float, }, 'ev2ebitda': { 'attribute': 'data-reactid', 'value': re.compile(".*ENTERPRISE_VALUE_TO_EBITDA\.1$"), }, } companies = Company.query.with_entities(Company.symbol).all() symbols = [company[0] for company in companies] print("Iterate through symbols") for symbol in symbols: print("{} Fetching {} :".format(time.strftime("%H:%M:%S"), symbol)) #driver = MyWebDriver() retry_current = 0 retry_limit = 5 while retry_current < retry_limit: try: driver = PhantomJS() except URLError: time.sleep(retry_current**2) retry_current += 1 driver.set_window_size(1120, 550) driver.get("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol)) try: #WebDriverWait(driver, 10).until(EC.title_contains("AAPL Key Statistics | Apple Inc. Stock - Yahoo Finance")) #element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "td[reactid]"))) #element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//td[@data-reactid[ends-with(., 'RETURN_ON_EQUITY.1')]]"))) # these two seem to work... element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[substring(@data-reactid, string-length(@data-reactid) - string-length('RETURN_ON_EQUITY.1') +1) = 'RETURN_ON_EQUITY.1']"))) #element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[contains(@data-reactid,'RETURN_ON_EQUITY.1')]"))) #"//input[@id[ends-with(.,'register')]]" except TimeoutException as e: print "Caught", e print driver.title continue #time.sleep(5) #with open("{}.out".format(symbol), "w") as f: # f.write(driver.page_source.encode('utf-8')) soup = BeautifulSoup(driver.page_source, "lxml") d = {'symbol': symbol} for indicator in indicators.keys(): curr_ind = indicators[indicator] s = soup.find_all(attrs={curr_ind['attribute']: curr_ind['value']}) print indicator, s for element in s: if curr_ind.has_key('transform'): f = curr_ind['transform'] #print(f(element.text)) d[indicator] = f(element.text) else: #print(element.text) d[indicator] = element.text try: db.session.add(Indicators.from_json(d)) db.session.commit() except (IntegrityError, UnmappedInstanceError) as e: print "Caught", e db.session.rollback() print "indicators", d
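XPath 1.0 (which Selenium locators use) has no ends-with(), which is why the working wait above emulates it with substring(). A small helper that builds the same expression, shown here as an illustrative sketch rather than part of the original:

def xpath_ends_with(attribute, suffix):
    """Build an XPath 1.0 expression matching elements whose attribute ends with suffix.

    XPath 1.0 lacks ends-with(), so compare the trailing substring instead.
    """
    return ("//*[substring(@{attr}, string-length(@{attr}) - string-length('{suf}') + 1) "
            "= '{suf}']").format(attr=attribute, suf=suffix)

# e.g. the wait in get_ratio_data() could then be written as:
# WebDriverWait(driver, 10).until(EC.presence_of_element_located(
#     (By.XPATH, xpath_ends_with('data-reactid', 'RETURN_ON_EQUITY.1'))))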
class Client(object):
    """HTTP client for functional testing of Strass.

    An adapter around the Selenium driver, with an interface inspired by
    Nightwatch.js and a few Strass-specific settings."""

    def __init__(self):
        self.driver = PhantomJS()
        self.driver.set_window_size(1120, 550)

    def __del__(self):
        self.driver.quit()

    def get(self, query=None):
        server = os.environ.get('STRASS_TEST_SERVER', 'http://localhost:8000')
        url = server + (query or '/')
        self.driver.get(url)
        return self

    def find(self, selector):
        return self.driver.find_element_by_css_selector(selector)

    def click(self, selector):
        self.find(selector).click()
        return self

    def fill(self, selector, value):
        if isinstance(value, datetime.date):
            self.fill(selector + ' input.day', str(value.day))
            self.fill(selector + ' input.month', str(value.month))
            self.fill(selector + ' input.year', str(value.year))
        else:
            control = self.find(selector)
            try:
                control.clear()
            except selexc.InvalidElementStateException:
                # We may be trying to clear an input[type=file]; just skip it.
                pass
            control.send_keys(value)
        return self

    def select(self, selector, value):
        Select(self.find(selector)).select_by_value(value)
        return self

    def submit(self, selector='#document button[type=submit]'):
        return self.click(selector)

    def close(self):
        self.driver.close()
        if self.driver.window_handles:
            self.driver.switch_to.window(self.driver.window_handles[0])
            self.driver.set_window_size(1120, 550)
        return self

    def screenshot(self, filename):
        self.driver.get_screenshot_as_file(filename)
        sys.stderr.write("Screenshot saved to %r\n" % (filename,))
        return self

    def save(self, filename):
        with open(filename, 'w') as fo:
            fo.write(self.driver.page_source)
        sys.stderr.write("HTML saved to %r\n" % (filename,))
        return self

    def __getattr__(self, name):
        return getattr(self.driver, name)
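Since every helper returns self, calls chain in the Nightwatch style; a usage sketch against a hypothetical login form (the /login path and selectors are made up for illustration):

client = Client()
(client
    .get('/login')
    .fill('#id_username', 'demo')
    .fill('#id_password', 'secret')
    .submit()
    .screenshot('/tmp/after-login.png'))
del client  # __del__ quits the PhantomJS driver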
pass outputfile = open(cityName + '-instagram-output.csv', 'a', 0) print colorama.Back.RED+colorama.Fore.YELLOW+str(len(setUrlDefined))+' URLs already defined! Lets Rock more now...'+colorama.Back.RESET+colorama.Fore.RESET driver = PhantomJS('./phantomjs') # in case of PhantomJS not available, we can use Firefox for line in tqdm(inputfile, total=numLines, desc='Crawling Instagram', leave=True): try: idtweet, url = line.replace('\n', '').split(',') if idtweet in setUrlDefined: continue except IndexError: print colorama.Fore.RED, 'Corrupted Line', colorama.Fore.RESET continue try: driver.get(url) placetag = driver.find_element_by_class_name('_kul9p') placeurl = placetag.get_attribute('href').encode('utf-8') placename = placetag.get_attribute('title').encode('utf-8') usernametag = driver.find_element_by_class_name('_4zhc5') username = usernametag.get_attribute('title').encode('utf-8') except selenium.common.exceptions.NoSuchElementException: try: error = driver.find_element_by_class_name('error-container') print colorama.Fore.RED, 'Sample Not Available Anymore', colorama.Fore.RESET outputfile.write(idtweet + ',' + url + ',404\n') continue except selenium.common.exceptions.NoSuchElementException: print colorama.Fore.RED, 'No Coords Available', colorama.Fore.RESET
class ProviderAdvancedViewTests(LiveServerTestCase): def setUp(self): self.driver = PhantomJS() self.user = User.objects.create_user('admin', '*****@*****.**', 'password') self.user.save() self.provider = Provider( name='provider', user=self.user, ) self.provider.save() self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider) self.login() def tearDown(self): self.driver.quit() def open(self, url): self.driver.get("%s%s" % (self.live_server_url, url)) def login(self): self.open(settings.LOGIN_URL) self.driver.find_element_by_id("id_username").send_keys("admin") self.driver.find_element_by_id("id_password").send_keys("password") self.driver.find_element_by_css_selector("button.btn.btn-default").click() self.assertEqual( self.driver.current_url, self.live_server_url + reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]), ) def test_can_login(self): """ Test that the user can login """ pass def test_provider_page_has_all_data(self): """ Test that the provider statistics page has all the correct data """ self.open(reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk])) self.assertEqual("Open Ads", self.driver.title) self.assertIn( "{0} advertisements".format(self.provider.name), self.driver.find_element_by_css_selector("h1.page-header").text ) self.assertIn( "{0} advertisements in rotation".format(20), self.driver.find_element_by_css_selector("h1.page-header").text ) def test_advertisement_page_has_all_data(self): """ Test that the advertisement page has all the correct data """ for advert in self.provider_adverts: self.open(reverse('advertisements.views.view_advert_statistics', args=[advert.pk])) self.assertIn( "ID number: {0}".format(advert.pk), self.driver.find_element_by_css_selector("h1.page-header").text, ) self.driver.find_element_by_css_selector("img") self.assertEqual("Active", self.driver.find_element_by_xpath("//td[2]/span").text) self.assertEqual(advert.url, self.driver.find_element_by_link_text(advert.url).text) self.driver.find_element_by_link_text("Edit URL").click() self.assertEqual(advert.url, self.driver.find_element_by_id("id_url").get_attribute("value"))
class WeixinPhantomjs(Base): def __init__(self): self.start_page = START_PAGE self.end_page = END_PAGE self.weixin_url = REFER_FIRST # self.driver = Firefox() if hasattr(config, 'PHANTOMJS_PATH'): self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH')) else: self.driver = PhantomJS() self.client = MongoClient(HOST, PORT) self.collection = self.client[DB][COLLECTION] self.all_uids = self.uids def open_weixin_browser(self, word): try: self.driver.get(self.weixin_url) self.driver.set_page_load_timeout(3) self.driver.find_element_by_id('upquery').send_keys(word) self.driver.find_element_by_class_name('swz').click() time.sleep(3) urls_uids = self.extract_urls_uids(word=word) Article(urls_uids=urls_uids, word=word).extract() except Exception as e: storage_word.append([word, 0]) self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e)) self.close_browser() return True return False def get_total_pages_to_word(self): pages = [] page_id_css = 'pagebar_container' try: e = self.driver.find_element_by_id(page_id_css) for _p in e.text.split(): _p = _p.strip() if not _p.isdigit(): return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1] else: pages.append(int(_p)) return 1 except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError): pass return 1 def get_query_words(self, word): query_words = [] for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]): w = docs['conp'] if w not in query_words: query_words.append(w) for item in docs['rel']: if item not in query_words: query_words.append(item) self.client.close() return self.query_index(query_words, word) @property def uids(self): return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs} def extract_urls_uids(self, word): urls_uids = [] timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')] urls_tits = [(t.get_attribute('href'), self.trim(t.text)) for t in self.driver.find_elements_by_css_selector('h4 a')] if len(urls_tits) != len(timestamp): return urls_uids for index, url_tit in enumerate(urls_tits): try: uid = self.md5(timestamp[index] + url_tit[1] + word) if uid not in self.all_uids: self.all_uids.add(uid) urls_uids.append({'url': url_tit[0], 'uid': uid}) except (TypeError, IndexError): pass return urls_uids @staticmethod def query_index(words, cut_word): temp_words = words[START_INDEX:END_INDEX] try: index = temp_words.index(cut_word) return temp_words[index:], index + START_INDEX except ValueError: pass return temp_words, START_INDEX @property def is_forbidden(self): css_id = 'seccodeForm' try: if self.driver.find_element_by_id(css_id): return True except NoSuchElementException: pass return False def appear_element(self, by): try: # Have `click` function to specified element tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by)) tag.click() return True except (TimeoutException, NoSuchWindowException, NoSuchElementException): pass return False def crawl(self, word=None, go=0): is_go = True is_break = False go_page = int(go) next_page_css = 'sogou_page_%s' query_words, ind = self.get_query_words(word) for index, word in enumerate(query_words, 1): next_ind = ind + index is_break = self.open_weixin_browser(word) pages = self.get_total_pages_to_word() for page in range(self.start_page + 1, (pages or self.end_page) + 1): if is_go and page < go_page: continue else: is_go = False if not self.appear_element(by=next_page_css % page): is_break = True msg = '\tNot 
appear next page element, will break' elif self.is_forbidden: is_break = True msg = '\tSpider was forbidden, crawling again after sleeping a moment!' if is_break: storage_word.append([word, page]) self.logger.info(msg) break urls_uids = self.extract_urls_uids(word=word) Article(urls_uids=urls_uids, word=word).extract() # self.driver.find_element_by_id(next_page_css % page).click() wt = randint(10, 40) if page % 3 == 0 else randint(5, 18) self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt)) # self.driver.implicitly_wait(wt) time.sleep(wt) if is_break: break in_client.close() self.close_browser() def close_browser(self): try: self.driver.close() except (NoSuchWindowException,): pass
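The class above does not show how a crawl is started. A minimal entry-point sketch, assuming the module-level constants (START_PAGE, storage_word, the Mongo client, etc.) are already configured; the seed word and resume page are placeholder values:

if __name__ == '__main__':
    # 'python' and go=0 are placeholders for the seed word and resume page.
    WeixinPhantomjs().crawl(word='python', go=0)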
class PagesCrawler(BaseSpider): name = 'pages' link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[]) ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS]) def __init__(self, **kw): args = DEFAULT_INPUT.copy() args.update(kw) self.args = args self.start_urls = to_list(args['start_urls']) self.maxdepth = int(args['maxdepth']) self.follow_prefixes = to_list(args['follow_prefixes']) self.nofollow_prefixes = to_list(args['nofollow_prefixes']) self.discover_prefixes = [ url_to_lru_clean( "http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's'] ] self.resolved_links = {} self.user_agent = args['user_agent'] self.phantom = 'phantom' in args and args[ 'phantom'] and args['phantom'].lower() != "false" if self.phantom: self.ph_timeout = int( args.get('phantom_timeout', PHANTOM['TIMEOUT'])) self.ph_idle_timeout = int( args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT'])) self.ph_ajax_timeout = int( args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT'])) self.errors = 0 dispatcher.connect(self.closed, spider_closed) dispatcher.connect(self.crashed, spider_error) def start_requests(self): self.log( "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO) self.log("ARGUMENTS : " + str(self.args), log.INFO) if self.phantom: self.init_phantom() for url in self.start_urls: yield self._request(url) def init_phantom(self): self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'), HYPHE_PROJECT, self.name, self.crawler.settings['JOBID']) self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO) phantom_args = [] if PROXY and not PROXY.startswith(':'): phantom_args.append('--proxy=%s' % PROXY) phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles) phantom_args.append('--ignore-ssl-errors=true') phantom_args.append('--load-images=false') self.capabilities = dict(DesiredCapabilities.PHANTOMJS) self.capabilities[ 'phantomjs.page.settings.userAgent'] = self.user_agent self.capabilities['takesScreenshot'] = False self.capabilities[ 'phantomjs.page.settings.javascriptCanCloseWindows'] = False self.capabilities[ 'phantomjs.page.settings.javascriptCanOpenWindows'] = False self.phantom = PhantomJS(executable_path=PHANTOM['PATH'], service_args=phantom_args, desired_capabilities=self.capabilities, service_log_path="%s-phantomjs.log" % self.prefixfiles) self.phantom.implicitly_wait(10) self.phantom.set_page_load_timeout(60) self.phantom.set_script_timeout(self.ph_timeout + 15) def crashed(self, spider): self.errors += 1 self.closed("CRASH") def closed(self, reason): if self.errors: self.log( "%s error%s encountered during the crawl." 
% (self.errors, 's' if self.errors > 1 else ''), log.ERROR) if self.phantom: self.phantom.quit() if not self.errors: for f in ["phantomjs-cookie.txt", "phantomjs.log"]: fi = "%s-%s" % (self.prefixfiles, f) if os.path.exists(fi) and not self.errors: os.remove(fi) def handle_response(self, response): lru = url_to_lru_clean(response.url) if self.phantom: self.phantom.get(response.url) # Collect whole DOM of the webpage including embedded iframes with open( os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js: get_bod_w_iframes = js.read() bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Try to scroll and unfold page self.log("Start PhantomJS scrolling and unfolding", log.INFO) with open( os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js: try: signal.signal(signal.SIGALRM, timeout_alarm) signal.alarm(self.ph_timeout + 30) timedout = self.phantom.execute_async_script( js.read(), self.ph_timeout, self.ph_idle_timeout, self.ph_ajax_timeout) signal.alarm(0) if timedout: raise SeleniumTimeout self.log("Scrolling/Unfolding finished", log.INFO) except SeleniumTimeout: self.log( "Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING) self.errors += 1 except WebDriverException as e: err = json.loads(e.msg)['errorMessage'] self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR) self.errors += 1 except Exception as e: self.log( "Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR) self.errors += 1 return self._make_raw_page(response, lru) bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses if response.status == 200 and not isinstance(response, HtmlResponse): try: flags = response.flags if "partial" in flags: flags.remove("partial") flags.append("cleaned") response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images( response.body), flags=flags, request=response.request) self.log( "WARNING: page with base64 embedded images was cleaned-up for links extraction" ) except: pass if 300 < response.status < 400 or isinstance(response, HtmlResponse): return self.parse_html(response, lru) else: return self._make_raw_page(response, lru) def handle_error(self, failure, response=None): if response: p = self._make_raw_page(response, failure.request.url) p['error'] = error_name(failure.value) return p elif not "://www" in failure.request.url: return self._request(failure.request.url.replace('://', '://www.')) error = failure.getErrorMessage() self.log("ERROR : %s" % error, log.ERROR) if PROXY and not PROXY.startswith( ':') and "OpenSSL.SSL.Error" in error: return self._request(failure.request.url, noproxy=True) self.errors += 1 return def parse_html(self, response, lru): lrulinks = [] # handle redirects realdepth = response.meta['depth'] if 300 < response.status < 400: redir_url = response.headers['Location'] if redir_url.startswith('/'): redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url) elif redir_url.startswith( './') or not redir_url.startswith('http'): redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:]) links = [{'url': redir_url}] response.meta['depth'] -= 1 else: try: links = self.link_extractor.extract_links(response) except Exception as e: self.log( "ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR) links = [] self.errors += 1 for link in 
links: try: url = link.url except AttributeError: url = link['url'] try: lrulink = url_to_lru_clean(url) except ValueError, e: self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR) continue lrulinks.append((url, lrulink)) if self._should_follow(response.meta['depth'], lru, lrulink) and \ not url_has_any_extension(url, self.ignored_exts): yield self._request(url) response.meta['depth'] = realdepth yield self._make_html_page(response, lru, lrulinks)
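handle_response arms a SIGALRM before execute_async_script and treats SeleniumTimeout as the sign that scrolling/unfolding overran its budget; both names are defined elsewhere in the project. A minimal sketch of what they could look like, offered as an assumption rather than the project's actual definitions:

import signal

class SeleniumTimeout(Exception):
    """Raised when the PhantomJS scroll/unfold script overruns its alarm."""

def timeout_alarm(signum, frame):
    # Installed via signal.signal(signal.SIGALRM, timeout_alarm) in handle_response.
    raise SeleniumTimeout("PhantomJS async script exceeded ph_timeout")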
def get_ratio_data(): import socket import re import time import dryscrape import webkit_server from random import randint from fake_useragent import UserAgent from bs4 import BeautifulSoup from selenium.webdriver import PhantomJS from app.models import Company, Indicators from app.utils import cash_to_float, depercentize # Dict item with list: element attribute, attribute value to look for, optional transform function indicators = {'roe': { 'attribute': 'data-reactid', 'value': re.compile(".*RETURN_ON_EQUITY\.1$"), 'transform': depercentize, }, 'fcf': { 'attribute': 'data-reactid', 'value': re.compile(".*LEVERED_FREE_CASH_FLOW\.1$"), 'transform': cash_to_float, }, } ua = UserAgent() #with open("10.csv", "r") as f: #with open("sp500-2.csv", "r") as f: with open("10_stocks", "r") as f: data = f.read() symbols = [] for i in data.split("\n"): if i: symbols.append(i.split(",")[0]) print("Iterate through symbols") ## dryscrape #session = dryscrape.Session() #session.set_header('User-Agent', ua.random) #session.set_timeout(5) for symbol in symbols: print("{} Fetching {} :".format(time.strftime("%H:%M:%S"), symbol)) import pdb; pdb.set_trace() #driver = MyWebDriver() driver = PhantomJS() driver.set_window_size(1120, 550) driver.get("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol)) ##try: ## session = dryscrape.Session() ##except socket.error as e: ## print("Failed to configure session {}".format(e)) ## continue ##session.set_header('User-Agent', ua.random) ##session.set_timeout(30) #try: # #session.visit("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol)) #except Exception as e: # print e, "try once more......" # session.reset() # time.sleep(5) # session = dryscrape.Session() # #session.set_header('User-Agent', ua.random) # try: # session.set_timeout(5) # session.visit("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol)) # except Exception as e: # print e, "done trying..." # session.reset() # time.sleep(2) # session = dryscrape.Session() # continue #except socket.error as e: # print("Failed to get {}, {} (1)".format(symbol, e)) # continue #except webkit_server.EndOfStreamError as e: # print("Failed to get {}, {}, breaking (2)".format(symbol, e)) # continue #except webkit_server.InvalidResponseError as e: # print("Failed to get {}, {}, breaking (3)".format(symbol, e)) # continue #response = session.body() #soup = BeautifulSoup(response, "lxml") with open("{}.out".format(symbol), "w") as f: f.write(driver.page_source.encode('utf-8')) soup = BeautifulSoup(driver.page_source, "lxml") d = {'symbol': symbol} for indicator in indicators.keys(): curr_ind = indicators[indicator] s = soup.find_all(attrs={curr_ind['attribute']: curr_ind['value']}) print indicator, s for element in s: if curr_ind.has_key('transform'): f = curr_ind['transform'] #print(f(element.text)) d[indicator] = f(element.text) else: #print(element.text) d[indicator] = element.text try: db.session.add(Indicators.from_json(d)) db.session.commit() except (IntegrityError, UnmappedInstanceError) as e: print "Caught", e db.session.rollback() print "indicators", d
def selenium_opener(url): driver = PhantomJS(executable_path='/path/to/phantomjs') driver.get(url) html = driver.page_source driver.quit() return html
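A short usage sketch for the helper above; the URL and the phantomjs path baked into the function are placeholders:

html = selenium_opener('https://example.com/')  # placeholder URL
print(len(html))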
def main(): os.makedirs(dlDir, exist_ok=True) startCatIdx = int(sys.argv[1]) if len(sys.argv) > 1 else 0 startFamIdx = int(sys.argv[2]) if len(sys.argv) > 2 else 0 startPrdIdx = int(sys.argv[3]) if len(sys.argv) > 3 else 0 executor = ThreadPoolExecutor() PhantomJS.waitClickable = waitClickable driver = PhantomJS() # harvest_utils.driver = driver with open('netgear_filelist.csv', 'w') as fout: cw = csv.writer(fout) cw.writerow([ 'model', 'fw_ver', 'fileName', 'fw_url', 'fw_date', 'fileSize', 'sha1', 'md5' ]) driver.get('http://downloadcenter.netgear.com/') # click DrillDown driver.waitClickable( '#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_BasicSearchPanel_btnAdvancedSearch' ).click() # noqa ctl00 = "#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_adsPanel_" # noqa ignore=E501 # # wait Page2 try: catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory")) numCat = len(catSel.options) for catIdx in range(startCatIdx, numCat): catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory")) print('catIdx=', catIdx) catTxt = catSel.options[catIdx].text uprint('catTxt= ' + catTxt) oldText = driver.getText(ctl00 + "lbProductFamily") catSel.select_by_index(catIdx) driver.waitTextChanged(ctl00 + "lbProductFamily", oldText) famSel = Select(driver.waitClickable(ctl00 + "lbProductFamily")) numFam = len(famSel.options) for famIdx in range(startFamIdx, numFam): famSel = Select( driver.waitClickable(ctl00 + "lbProductFamily")) # noqa print('famIdx=', famIdx) startFamIdx = 0 famTxt = famSel.options[famIdx].text uprint('famTxt= ' + famTxt) oldText = driver.getText(ctl00 + "lbProduct") famSel.select_by_index(famIdx) driver.waitTextChanged(ctl00 + "lbProduct", oldText) prdSel = Select(driver.waitClickable(ctl00 + "lbProduct")) numPrd = len(prdSel.options) for prdIdx in range(startPrdIdx, numPrd): prdSel = Select(driver.waitClickable(ctl00 + "lbProduct")) startPrdIdx = 0 print("catIdx,famIdx,prdIdx=%d, %d, %d" % (catIdx, famIdx, prdIdx)) prdTxt = prdSel.options[prdIdx].text uprint('cat,fam,prd="%s","%s","%s"' % (catTxt, famTxt, prdTxt)) # noqa ignore=E501 prdWaiting = driver.waitElem( ctl00 + "upProgProductLoader > div > img") # noqa ignore=E501 prdSel.select_by_index(prdIdx) try: WebDriverWait(driver, 1, 0.5).\ until(lambda x: prdWaiting.is_displayed() is True) except TimeoutException: pass try: WebDriverWait(driver, 5, 0.5).\ until(lambda x: prdWaiting.is_displayed() is False) except TimeoutException as ex: pass numResults = driver.waitText( ctl00 + "lvwAllDownload_lblAllDownloadResult", 3, 0.5) # noqa ignore=E501 if numResults is None: continue numResults = int(re.search(r"\d+", numResults).group(0)) print('numResults=', numResults) if numResults > 10: driver.waitClickable("#lnkAllDownloadMore", 3).click() try: erItems = driver.getElems( 'a.register-product.navlistsearch', 3, 0.5) # noqa except TimeoutException: erItems = driver.getElems( 'div#LargeFirmware > ul > li > div > p > a.navlistsearch', 3) # noqa ignore=E501 if len(erItems) != numResults: print('Error, numResults=%d, but len(erItems)=%d' % (numResults, len(erItems))) for itemIdx, erItem in enumerate(erItems): if not erItem.is_displayed(): print('itemIdx=%d is not displayed()' % itemIdx) continue erItem.getItemText = getItemText desc = erItem.getElemText(erItem) uprint('desc="%s"' % desc) if 'firmware' not in desc.lower(): continue fw_url = erItem.get_attribute('data-durl') if not fw_url: fw_url = erItem.get_attribute('fw_url') print('fw_url=', fw_url) if not fw_url: continue if not fw_url.startswith('http'): 
print('Error: fw_url=', fw_url) continue executor.submit(download_file, prdTxt, desc, fw_url) except BaseException as ex: traceback.print_exc() import pdb pdb.set_trace() driver.save_screenshot("netgear_crawler2") finally: driver.quit() executor.shutdown(True)
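main() relies on helpers such as waitClickable, waitTextChanged, getText and waitElem that are monkey-patched onto the PhantomJS driver (PhantomJS.waitClickable = waitClickable); their definitions are not shown here. A minimal sketch of what a waitClickable bound to the driver might look like, assuming CSS selectors and a default 10-second timeout (both assumptions):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def waitClickable(self, css_selector, timeout=10):
    """Wait until the element matching css_selector is clickable, then return it."""
    return WebDriverWait(self, timeout).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector))
    )

# Bound onto the driver class before use, as in main():
# PhantomJS.waitClickable = waitClickable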
class plugin: def __init__(self): APP_ROOT = os.path.dirname(os.path.abspath(__file__)) print(APP_ROOT) self.req = 0 self.driver = PhantomJS(APP_ROOT + "/phantomjs", service_log_path=os.path.devnull) self.driver.implicitly_wait(3) def restart(self): self.__init__() def frame_search(self, path): framedict = {} for child_frame in self.driver.find_elements_by_tag_name('frame'): child_frame_name = child_frame.get_attribute('name') framedict[child_frame_name] = {'framepath': path, 'children': {}} xpath = '//frame[@name="{}"]'.format(child_frame_name) self.driver.switch_to.frame( self.driver.find_element_by_xpath(xpath)) framedict[child_frame_name]['children'] = self.frame_search( framedict[child_frame_name]['framepath'] + [child_frame_name]) self.driver.switch_to.default_content() if len(framedict[child_frame_name]['framepath']) > 0: for parent in framedict[child_frame_name]['framepath']: parent_xpath = '//frame[@name="{}"]'.format(parent) self.driver.switch_to.frame( self.driver.find_element_by_xpath(parent_xpath)) return framedict def tmon(self): self.driver.get( "https://login.ticketmonster.co.kr/user/loginform?return_url=") self.driver.find_element_by_name('userid').send_keys( config['ACCOUNT']['tmon_id']) self.driver.find_element_by_name('password').send_keys( config['ACCOUNT']['tmon_pw']) self.driver.find_element_by_xpath('//*[@id="loginFrm"]/a[2]').click() self.driver.get( 'http://m.benefit.ticketmonster.co.kr/promotions/page/attendance?view_mode=app' ) self.driver.find_element_by_xpath( '//*[@id="attn_wrap"]/div/div/div[3]/div[2]/div[1]/button').click( ) print(self.driver.find_element_by_class_name('content').text) self.tmon_ret = self.driver.find_element_by_class_name('content').text def ondisk(self): try: self.driver.get("http://ondisk.co.kr/index.php") self.driver.implicitly_wait(3) self.driver.find_element_by_xpath('//*[@id="mb_id"]').send_keys( config['ACCOUNT']['ondisk_id']) self.driver.find_element_by_xpath( '//*[@id="page-login"]/form/div[2]/p[2]/input').send_keys( config['ACCOUNT']['ondisk_pw']) self.driver.find_element_by_xpath( '//*[@id="page-login"]/form/div[2]/p[3]/input').click() self.driver.get( "http://ondisk.co.kr/index.php?mode=eventMarge&sm=event&action=view&idx=746&event_page=1" ) self.driver.switch_to_frame(1) self.driver.execute_script( "window.alert = function(msg){ window.msg = msg; };") self.driver.find_element_by_class_name('button').click() alert_text = self.driver.execute_script("return window.msg;") print(alert_text) except: print("ERR") print(self.driver.page_source) self.ondisk_ret = alert_text def ok_cash_bag(self): today = datetime.datetime.now().strftime("%Y%m%d") sess = requests.session() getdata = sess.get( "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101" ) param = { "lsd": "AVpmy4vJ", "api_key": "645711852239977", "cancel_url": "https://member.okcashbag.com/ocb/socialId/facebookProcessor?error=access_denied&error_code=200&error_description=Permissions+error&error_reason=user_denied#_=_", "display": "page", "enable_profile_selector": "", "isprivate": "", "legacy_return": "0", "profile_selector_ids": "", "return_session": "", "skip_api_login": "******", "signed_next": "1", "trynum": "1", "timezone": "-540", "lgndim": 
"eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNDAsImMiOjI0fQ==", "lgnrnd": "173648_UqkK", "lgnjs": "1528418208", "email": config['ACCOUNT']['fb_id'], "pass": config['ACCOUNT']['fb_pw'], "prefill_contact_point": config['ACCOUNT']['fb_id'], "prefill_source": "last_login", "prefill_type": "contact_point", "first_prefill_source": "last_login", "first_prefill_type": "contact_point", "had_cp_prefilled": "true", "had_password_prefilled": "false" } postdata = sess.post( "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101", data=param) # print(postdata.text) postdata = sess.post( "https://member.okcashbag.com//ocb/socialId/socialIdLoginProcess/42100/687474703A2F2F7777772e6f6b636173686261672e636f6d2F696e6465782e646f3F6c6f67696e3D59" ) samlResponse = postdata.text.split("samlResponse.value = \"")[1].split( "\"")[0] # print(samlResponse) param = {"samlResponse": samlResponse, "sst_cd": "", "return_url": ""} postdata = sess.post("http://www.okcashbag.com/index.do?login=Y", data=param) print( postdata.text.split('<span id="profileNickname" class="name">') [1].split("</span>")[0] + "님 로그인") print( postdata.text.split('<span id="spanUsablePoint">')[1].split( '</span>')[0] + "포인트") getdata = sess.get( "http://www.okcashbag.com/life/event/attend/attendMain.do") param = {"method": "", "myUrl": "", "recommUser": "", "today": today} postdata = sess.post( "http://www.okcashbag.com/life/event/attend/attend.do", data=param) print(postdata.text) if len(postdata.text.split('<i class="win-point">')) > 1: print(postdata.text.split('<i class="win-point">')[1] + "포인트 적립") elif len(postdata.text.split("success")) > 1: print("출석체크 완료 ") self.ok_ret = "출석체크 완료" else: print('이미 출석체크 완료') self.ok_ret = "이미 출석체크 완료"
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount): retail_invoice_url = RETAIL_INVOICE_URL[retail] driver = PhantomJS() driver.get(retail_invoice_url) # 1 Set doc_type 'select' try: select_doc_type = Select(driver.find_element_by_name('txtTipoDte')) value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value'] select_doc_type.select_by_value(value) # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name'] # select_doc_type.select_by_visible_text(name) except Exception: print 'ERROR: set doc_type select as Boleta' driver.save_screenshot('screen.png') return '', '' time.sleep(5) # 2 Get recaptcha img url try: recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image') recaptcha_img_url = recaptcha_img.get_attribute('src') except Exception: print 'ERROR: get recaptcha image url' driver.save_screenshot('screen.png') return '', '' # 3 Solve recaptcha v = VisionApi() recaptcha_value = v.detect_text_from_url(recaptcha_img_url) if recaptcha_value is None: print 'ERROR: solving recaptcha image' driver.save_screenshot('screen.png') return '', '' # 4 Fill form script = u""" document.getElementsByName('txtFolio')[0].value = '{invoice_id}'; document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}'; document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}'; document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}'; """.format( invoice_id=invoice_id, invoice_date=invoice_date, invoice_amount=invoice_amount, recaptcha_value=recaptcha_value, ) driver.execute_script(script) # 5 Submit form try: driver.find_element_by_name('frmDatos').submit() except Exception: print 'ERROR: submitting form' driver.save_screenshot('screen.png') return '', '' # 6 Get url files try: xml_a_tag = driver.find_element_by_xpath( '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]') pdf_a_tag = driver.find_element_by_xpath( '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]') xml_url = xml_a_tag.get_attribute('href') pdf_url = pdf_a_tag.get_attribute('href') except Exception: print 'ERROR: getting url files' driver.save_screenshot('screen.png') return '', '' # 8 Delete driver session driver.close() driver.quit() return xml_url, pdf_url
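A usage sketch for get_url_files; the retail key, document type, and invoice values below are placeholders and must match entries in RETAIL_INVOICE_URL and RETAIL_INVOICE_DOC_TYPES:

xml_url, pdf_url = get_url_files(
    retail='some_retail',          # placeholder key into RETAIL_INVOICE_URL
    invoice_doc_type='boleta',     # placeholder key into RETAIL_INVOICE_DOC_TYPES
    invoice_id='12345678',
    invoice_date='01-01-2017',
    invoice_amount='19990',
)
print(xml_url)
print(pdf_url)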
class WeixinPhantomjs(Base): all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs} def __init__(self): self.start_page = START_PAGE self.end_page = END_PAGE self.weixin_url = REFER_FIRST # self.driver = Firefox() if hasattr(config, 'PHANTOMJS_PATH'): self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH')) else: self.driver = PhantomJS() def open_weixin_browser(self, word): try: self.driver.get(self.weixin_url) self.driver.set_page_load_timeout(3) self.driver.find_element_by_id('upquery').send_keys(word) self.driver.find_element_by_class_name('swz').click() time.sleep(3) urls_uids = self.extract_urls_uids(word=word) Article(urls_uids=urls_uids, word=word).extract() except Exception as e: storage_word.append([word, 0]) self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e)) self.close_browser() return True return False def get_total_pages_to_word(self): pages = [] page_id_css = 'pagebar_container' try: e = self.driver.find_element_by_id(page_id_css) for _p in e.text.split(): _p = _p.strip() if not _p.isdigit(): return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1] else: pages.append(int(_p)) return 1 except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError): pass return 1 def extract_urls_uids(self, word): urls_uids = [] timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')] urls_tits = [(t.get_attribute('href'), self.trim(t.text)) for t in self.driver.find_elements_by_css_selector('h4 a')] if len(urls_tits) != len(timestamp): return urls_uids for index, url_tit in enumerate(urls_tits): try: uid = self.md5(timestamp[index] + url_tit[1] + word) if uid not in self.__class__.all_uids: self.__class__.all_uids.add(uid) urls_uids.append({'url': url_tit[0], 'uid': uid}) except (TypeError, IndexError): pass return urls_uids @property def is_forbidden(self): css_id = 'seccodeForm' try: if self.driver.find_element_by_id(css_id): return True except NoSuchElementException: pass return False def appear_element(self, by): try: # Have `click` function to specified element tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by)) tag.click() return True except (TimeoutException, NoSuchWindowException, NoSuchElementException): pass return False def crawl_single(self, word=None, go=0): is_go = True go_page = int(go) next_page_css = 'sogou_page_%s' is_break = self.open_weixin_browser(word) pages = self.get_total_pages_to_word() for page in range(self.start_page + 1, (pages or self.end_page) + 1): if is_go and page < go_page: continue else: is_go = False if not self.appear_element(by=next_page_css % page): is_break = True msg = '\tNot appear next page element, will break' elif self.is_forbidden: is_break = True msg = '\tSpider was forbidden, crawling again after sleeping a moment!' 
if is_break: storage_word.append([word, page]) self.logger.info(msg) break urls_uids = self.extract_urls_uids(word=word) Article(urls_uids=urls_uids, word=word).extract() # self.driver.find_element_by_id(next_page_css % page).click() # wt = randint(10, 40) if page % 5 == 0 else randint(5, 18) wt = randint(1, 5) self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt)) # self.driver.implicitly_wait(wt) time.sleep(wt) self.close_browser() @classmethod def crawl_with_threads(cls): pool = ThreadPool(4) total_words = QueryWords().get_query_words() for bulk_words in total_words: try: pool.map(lambda w: cls().crawl_single(w), bulk_words) except Exception as e: cls.logger.info('Threads crawl error: type <{}>, msg <{}>'.format(e.__class__, e)) pool.close() pool.join() in_client.close() def close_browser(self): try: self.driver.close() except (NoSuchWindowException,): pass
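This variant is driven through the crawl_with_threads classmethod rather than a single crawl loop. A minimal entry-point sketch, assuming QueryWords and the module-level Mongo client are already configured:

if __name__ == '__main__':
    # Spins up a ThreadPool of 4 workers, one PhantomJS instance per queried word.
    WeixinPhantomjs.crawl_with_threads()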
from selenium.webdriver import PhantomJS as Browser import json import time import re proxy_list_url = "http://spys.one/socks/" proxies = [] br = Browser() br.get(proxy_list_url) sizes = [25, 50, 100, 200, 300, 500] pattern = re.compile(r"[.\s]+\((\d+)\)") for country_id in range(1, 171): try_counter = 0 count = 0 while (elm := br.find_element_by_id('tldc')).find_element_by_xpath( f"./option[@selected]").get_attribute("value") != str(country_id): elm = elm.find_element_by_xpath(f'./option[@value="{country_id}"]') elm.click() try_counter += 1 if try_counter >= 2: break if try_counter >= 2: continue count = int(pattern.findall(elm.text)[0]) key = 0 for key, size in enumerate(sizes): if int(size) > count: break try_counter = 0 while (elm := br.find_element_by_id("xpp")).find_element_by_xpath( "./option[@selected]").get_attribute("value") != str(key):
def get_applications_in_page(self, scroll_script): applications = [] driver = None try: desired_capabilities = dict(DesiredCapabilities.PHANTOMJS) desired_capabilities["phantomjs.page.settings.userAgent"] = useragent.get_random_agent(google_prop.user_agent_list_url) service_args = ['--load-images=no', '--proxy=%s' % (proxy.get_random_proxy(google_prop.proxy_list_url))] driver = PhantomJS(desired_capabilities=desired_capabilities, service_args=service_args) # driver = Firefox(firefox_profile=self.fp, proxy=self.proxy) if self.proxy_test: driver.get('http://curlmyip.com/') ip = driver.find_element_by_xpath('//body//pre').text print('ip : [ ' + ip + ' ]') pass else: driver.get(self.url) driver.execute_script(scroll_script) acknowledge = 0 done = False while not done: scroll_finished = driver.execute_script("return scraperLoadCompleted") if scroll_finished: if acknowledge == self.acknowledgements: done = driver.execute_script("return scraperLoadCompleted") pass else: acknowledge += 1 pass pass else: acknowledge = 0 pass time.sleep(5) # Wait before retry pass product_matrix = driver.find_elements_by_class_name("card") for application in product_matrix: extracted_application = self.extract_application_data(application) # if extracted_application['app_price'] != -1: applications.append(extracted_application) #pass pass pass driver.quit() pass except Exception as e: if driver is not None: driver.quit() pass if self.attempt < self.retries: self.attempt += 1 time.sleep(10) print 'retry : url [ ' + self.url + ' ] + | attempt [ ' + str(self.attempt) + ' ] | error [ ' + str(e) + ' ]' applications = self.get_applications_in_page(scroll_script) pass else: print('fail : url [ ' + self.url + ' ] | error [ ' + str(e) + ' ]') pass pass return applications pass
class RequestUtil: __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0' def __init__(self): self.cookies = '' self._lock = threading.RLock() def http_get_request(self, url, referer, timeout=''): self._lock.acquire() cookie = cookielib.CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie), SmartRedirectHandler()) urllib2.install_opener(opener) headers = { 'User-Agent': self.__browserAgent, 'Referer': referer, 'Cache-Control': 'max-age=0', 'Accept': '*/*', 'Connection': 'Keep-Alive', 'Accept-encoding': 'gzip' } req = urllib2.Request(url=url, headers=headers) if timeout == '': open = urllib2.urlopen(req) else: open = urllib2.urlopen(req, timeout=timeout) if self.cookies == '': for item in cookie: self.cookies = self.cookies + item.name + '=' + item.value + ';' self.cookies = self.cookies[:-1] if url != open.url: req = urllib2.Request(url=open.url, headers=headers) self._lock.release() return (open, req) def http_post_request(self, url, datas, referer, timeout=''): self._lock.acquire() postdata = urllib.urlencode(datas) headers = { 'User-Agent': self.__browserAgent, 'Referer': referer, 'Content-Type': 'application/x-www-form-urlencoded', 'Cache-Control': 'no-cache', 'Accept': '*/*', 'Connection': 'Keep-Alive', 'Accept-encoding': 'gzip', 'Cookie': self.cookies } req = urllib2.Request(url=url, data=postdata, headers=headers) req.get_host() if timeout == '': open = urllib2.urlopen(req) else: open = urllib2.urlopen(req, timeout=timeout) if url != open.url: req = urllib2.Request(url=open.url, headers=headers) self._lock.release() return (open, req) def http_get(self, url, refer='https://www.baidu.com'): return self.http_get_request(url, refer, 60) def http_post(self, url, datas, refer='https://www.baidu.com'): return self.http_post_request(url, datas, refer, 60) def http_post_request2(self, url, datas, timeout=''): if timeout == '': open = urllib2.urlopen(url, datas) else: open = urllib2.urlopen(url, datas, timeout=timeout) data = open.read() return data def http_post2(self, url, datas): return self.http_post_request2(url, datas, 300) def create_phandomjs(self, service_args, caps, timeout=30): self.driver = PhantomJS(desired_capabilities=caps, service_args=service_args) self.driver.set_page_load_timeout(timeout) self.driver.set_script_timeout(timeout) self.driver.implicitly_wait(timeout) def close_phandomjs(self): try: self.driver.quit() except: pass def http_get_phandomjs(self, url, refer='https://www.baidu.com', timeout=1000): caps = dict(DesiredCapabilities.PHANTOMJS) caps['browserName'] = 'chrome' caps["phantomjs.page.settings.resourceTimeout"] = timeout caps["phantomjs.page.settings.loadImages"] = False caps["phantomjs.page.settings.userAgent"] = (self.__browserAgent) caps["phantomjs.page.customHeaders.Referer"] = (refer) service_args = [] service_args.append('--load-images=no') service_args.append('--disk-cache=yes') service_args.append('--cookies-file=') self.create_phandomjs(timeout=timeout, service_args=service_args, caps=caps) self.driver.get(url) return self.driver.page_source
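A usage sketch contrasting the plain urllib2 path with the PhantomJS path of RequestUtil; the target URLs are placeholders:

util = RequestUtil()

# Plain HTTP GET through urllib2: returns the opened response and the request.
response, request = util.http_get('http://example.com/')
print(response.read()[:200])

# JavaScript-rendered page through PhantomJS; release the driver when done.
html = util.http_get_phandomjs('http://example.com/js-heavy-page')
util.close_phandomjs()
print(len(html))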
class SaveFromScraper(object): """ Web scraper for savefrom.net """ def __init__(self): """ Default constructor ARGS: None RETURNS: None """ self.browser = PhantomJS(executable_path='./drivers/phantomjs', port=free_port()) # Optional argument, if not specified will search path. self.timeout = 5 # seconds def get_video(self, video_id, url, quality): """ Main function that does heavy lifting Select video quality in this function. """ pass def _get_html(self, video_id): """ For a given video, returns the HTML for the download page ARGS: video_id: unique video identifier RETURNS: Tuple: whether page has loaded, and HTML for page """ url = 'https://www.ssyoutube.com/watch?v=' + video_id ## TODO: remove - this is only for testing self.browser.get(url) try: class_name = "link-download" WebDriverWait(self.browser, self.timeout).until(EC.presence_of_element_located((By.CLASS_NAME, 'link-download'))) has_loaded = True print "Page is ready!" except TimeoutException: has_loaded = False print "Loading took too much time!" if has_loaded: html = self.browser.page_source else: html = "" return (has_loaded, html) def _parse_html(self, html): """ Find the links for downloading the video in the HTML ARGS: html: web page source RETURNS: Dictionary containing all links for downloading the video """ link_dict = dict() soup = BeautifulSoup(html, 'html.parser') for link in soup.findAll("a", { "class" : "link-download" }): if 'title' in link.attrs: title = link.attrs['title'].split(': ')[1] url = link.attrs['href'] link_dict[title] = url else: pass return link_dict def _download_file(self, video_id, download_url): """ Download video file from SaveFromNet ARGS: video_id: unique video identifier RETURNS: None """ f = urllib2.urlopen(download_url) with open(video_id + ".mp4", "wb") as code: code.write(f.read()) def quit(self): """ Quit browser session ARGS: None RETURNS: None """ self.browser.close() self.browser.quit()
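A usage sketch for the scraper above; the video id is a placeholder, and because get_video is still a stub, the private helpers are called directly here:

scraper = SaveFromScraper()
loaded, html = scraper._get_html('dQw4w9WgXcQ')  # placeholder video id
if loaded:
    links = scraper._parse_html(html)
    for quality, url in links.items():
        print(quality + ' -> ' + url)
scraper.quit()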