class Parser(object):
    def __init__(self):
        self.browser = PhantomJS()

    def cleanup(self):
        self.browser.quit()
def selenium(self, webdriverOption=0):
    """Download the page through a real browser; works for any kind of page.

    :return: the page source, or None on failure
    """
    if not self.url[:4] == "http":
        return None
    driver = None
    if webdriverOption == 0:
        from selenium.webdriver import PhantomJS
        driver = PhantomJS()
    elif webdriverOption == 1:
        from selenium.webdriver import Chrome
        driver = Chrome()
    elif webdriverOption == 2:
        from selenium.webdriver import Firefox
        driver = Firefox()
    if not driver:
        print(u"-->DownLoader->Selenium driver initialization failed; "
              u"check the environment or the webdriverOption argument")
        return None
    driver.get(self.url)
    src = driver.page_source
    driver.quit()
    self.pageSource = src
    return src
def main():
    steam_id, api, return_amount, user_categories = read_config_values()
    print("SteamID:", steam_id)
    print("API key:", api)
    print("Return amount:", return_amount)
    if len(user_categories):  # > 0
        check_user_categories_validity(user_categories)
        print("Categories:", "; ".join(user_categories))
    print()
    print("Fetching your Steam library..")
    user_library = fetch_user_library(api, steam_id)
    print("Found {} in your library.".format(len(user_library)))
    print("Opening PhantomJS..")
    driver = PhantomJS(cwd + r"\dependencies\phantomJS\phantomjs.exe",
                       service_log_path=cwd + r"\dependencies\phantomJS\ghostdriver.log")
    print("Opening SteamDB..")
    output = fetch_sales(driver, user_library, return_amount, user_categories)
    driver.quit()
    with open("games.txt", 'w', encoding='utf-8') as file:
        file.write(output)
    input("\nDone. I also wrote the games to a text file.")
class SeleniumTestCase(LiveServerTestCase):
    def _pre_setup(self):
        super(SeleniumTestCase, self)._pre_setup()
        self.driver = PhantomJS()

    def _post_teardown(self):
        self.driver.quit()
        super(SeleniumTestCase, self)._post_teardown()

    def login(self, username='******', password='******', url='login'):
        """Log in to the server and get authenticated."""
        self.open(reverse(url))
        self.driver.find_element_by_id("id_username").clear()
        self.driver.find_element_by_id("id_username").send_keys(username)
        self.driver.find_element_by_id("id_password").clear()
        self.driver.find_element_by_id("id_password").send_keys(password)
        self.driver.find_element_by_id("submit-id-login").click()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def is_element_present(self, how, what):
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True
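A minimal sketch of how this base class might be used; the 'dashboard' URL name, the credentials, and the 'user-menu' element id are hypothetical, purely to illustrate the login(), open() and is_element_present() helpers.

from selenium.webdriver.common.by import By


class DashboardTests(SeleniumTestCase):
    def test_login_shows_user_menu(self):
        # Hypothetical flow: log in, open a page, check that a marker element exists.
        self.login(username='admin', password='secret')
        self.open(reverse('dashboard'))  # assumes a 'dashboard' URL name is defined
        self.assertTrue(self.is_element_present(By.ID, 'user-menu'))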
class Translator(threading.Thread):
    def __init__(self, queue, executable_path=None, desired_capabilities=None,
                 service_args=None, google_translate_url=config['google_translate_url'],
                 window_size=config['window_size']):
        super(self.__class__, self).__init__()
        self._queue = queue
        kwargs = {}
        if executable_path is not None:
            kwargs['executable_path'] = executable_path
        if desired_capabilities is not None:
            kwargs['desired_capabilities'] = desired_capabilities
        if service_args is not None:
            kwargs['service_args'] = service_args
        self._driver = PhantomJS(**kwargs)
        self._driver.set_window_size(*window_size)
        self._driver.get(google_translate_url)

    def run(self):
        while True:
            task = self._queue.get()
            if task is None:
                self._queue.task_done()
                self._driver.quit()
                break
            task.do(self._driver)
            self._queue.task_done()
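A sketch of how the worker above might be driven, assuming the surrounding module's config defaults. The run() loop accepts any object exposing a do(driver) method and stops on a None sentinel; PrintTitleTask here is hypothetical.

import queue


class PrintTitleTask:
    """Hypothetical task: anything with a do(driver) method fits Translator.run()."""
    def do(self, driver):
        print(driver.title)


task_queue = queue.Queue()
worker = Translator(task_queue)
worker.start()

task_queue.put(PrintTitleTask())
task_queue.put(None)   # sentinel: run() quits the driver and exits
task_queue.join()      # block until both queue items have been marked done
worker.join()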
def main():
    global HEAD
    if len(sys.argv) > 1:
        try:
            HEAD = int(sys.argv[1])
        except:
            HEAD = 10

    # test mirror list
    mirror_list = read_mirrors()
    for i in mirror_list:
        try:
            cururl = i
            print("Testing:", i)
            res = request.urlopen(i)
        except:
            print("Testing on", i, "failed")
            continue
        try:
            update_mirrors(cururl)
            break
        except:
            continue
    try:
        res
    except:
        raise Warning('All mirrors unavailable!')
    print('Available mirror:', cururl)

    # get vpn table
    countries = dict()
    dr = PhantomJS()
    dr.get(cururl)
    page = Selector(text=dr.page_source)\
        .xpath('.//td[@id="vpngate_inner_contents_td"]/'
               'table[@id="vg_hosts_table_id"]//tr')
    if HEAD < len(page):
        page = page[:HEAD]
    print('Pagelen:', len(page))
    for vpn in page:
        if len(vpn.xpath('./td[@class="vg_table_header"]')) > 0:
            continue
        row = vpn.xpath('./td')
        country = row[0].xpath('./text()').extract_first()
        country = '_'.join(country.split(' '))
        ovpn = row[6].xpath('./a/@href').extract_first()
        if ovpn:
            if country in countries:
                countries[country] += 1
                get_ovpn(url=cururl + ovpn,
                         save_to=country + '/' + str(countries[country]))
            else:
                countries[country] = 0
                if not os.path.exists(country):
                    os.mkdir(country)
                get_ovpn(url=cururl + ovpn,
                         save_to=country + '/' + str(countries[country]))
    dr.quit()
class Crawler:
    def __init__(self, timeout=20,
                 phantomjs_cfg_file='python-utils/config/phantomjs_cfg.json',
                 use_cfg_file=False,
                 proxy_pool_server='http://127.0.0.1:15110'):
        self.timeout = timeout
        if use_cfg_file:
            phantomjs_service_args = ['--config={}'.format(phantomjs_cfg_file)]
        else:
            _, proxy_type, proxy, proxy_auth = get_proxy(proxy_pool_server)
            phantomjs_service_args = [
                '--proxy-type={}'.format(proxy_type),
                '--proxy={}'.format(proxy),
                '--proxy-auth={}'.format(proxy_auth),
            ]
        self.driver = PhantomJS(
            desired_capabilities=self.new_desired_capabilities(),
            service_args=phantomjs_service_args)
        self.check_client_info()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self.driver.quit()

    @contextmanager
    def wait_for_page_load(self, old_element):
        yield WebDriverWait(self.driver, self.timeout).until(EC.staleness_of(old_element))

    def new_desired_capabilities(self, user_agent=default_ua):
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        if not user_agent:
            user_agent = ua.random
        desired_capabilities["phantomjs.page.settings.userAgent"] = user_agent
        return desired_capabilities

    def check_client_info(self):
        url = 'http://www.whoishostingthis.com/tools/user-agent/'
        self.driver.get(url)
        ip_addr = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[2]/span').text.strip()
        user_agent = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[1]').text.strip()
        logger.info('IP: {}, User-Agent: {}'.format(ip_addr, user_agent))
        if self.wrong_ip(ip_addr):
            logger.error('Proxy not set correctly!')
            sys.exit(-1)

    def wrong_ip(self, ip_addr):
        if (ip_addr.startswith('166.111.') or ip_addr.startswith('59.66.')
                or ip_addr.startswith('101.5.') or ip_addr.startswith('101.6.')):
            return True
        else:
            return False
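A sketch of how the wait_for_page_load context manager above behaves: because the staleness wait is the yielded expression, the block is entered only after the old element has gone stale, so the triggering click happens before the with statement. The URL, link text, and constructor arguments are illustrative only.

with Crawler(use_cfg_file=True) as crawler:
    crawler.driver.get('http://example.com/')
    old_page = crawler.driver.find_element_by_tag_name('html')
    crawler.driver.find_element_by_partial_link_text('More').click()
    with crawler.wait_for_page_load(old_page):
        # Entered once the old <html> element is stale, i.e. a new document has loaded.
        print(crawler.driver.current_url)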
def main():
    driver = PhantomJS()
    scraper = NLScraper(driver, year=2014)
    print(sys.argv[1])
    writer = unicodecsv.DictWriter(open(sys.argv[1], 'w'),
                                   ('amount', 'scheme', 'year', 'country',
                                    'currency', 'recipient_name', 'recipient_postcode',
                                    'recipient_id', 'recipient_location'))
    writer.writeheader()
    try:
        scraper.start(writer)
    finally:
        driver.quit()
def onegoogolePR(self, url):
    '''Return the Google PageRank for a single URL.'''
    prUrl = 'http://pr.chinaz.com'  # Google PR lookup service
    driver = PhantomJS()
    driver.get(prUrl)
    driver.find_element_by_id('PRAddress').send_keys(url)
    driver.find_element_by_class_name('search-write-btn').click()
    try:
        imgsrc = driver.find_element_by_css_selector('span#pr>img').get_attribute('src')
        pr = search(r'\d', imgsrc).group()
    except:
        pr = '暂无数据'  # literal means "no data available"
    driver.quit()
    return pr
def catalog_url(url='http://www.meitun.com/'):
    # The catalog is loaded via AJAX, so render the page with PhantomJS.
    driver = PhantomJS()
    driver.get(url)
    driver.maximize_window()
    mov_ele = driver.find_element_by_css_selector('.nav>ul>li:nth-child(1)')
    # Hover over the lazily loaded menu element so its submenu gets rendered.
    ActionChains(driver).move_to_element(mov_ele).perform()
    time.sleep(3)
    response = driver.page_source
    driver.quit()
    # Parse the page source with pyquery, which is quicker.
    d = pq(response)
    return map(lambda x: 'http:' + pq(x).attr('href'), d.find('.cg-pdts a'))
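Note that in Python 3 the map object returned above is lazy and single-pass; a hypothetical caller would typically materialise it before reuse.

# Hypothetical caller: list() forces the lazy map so the links can be iterated more than once.
category_links = list(catalog_url())
for href in category_links[:5]:
    print(href)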
class AdvertisementAdvancedViewTests(LiveServerTestCase):
    def setUp(self):
        self.driver = PhantomJS()
        self.user = User.objects.create_user('admin', '*****@*****.**', 'pass')
        self.user.save()
        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()
        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)

    def tearDown(self):
        self.driver.quit()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def test_side_ad_display(self):
        """
        Test that the side ads display properly
        """
        self.open(reverse('advertisements.views.side_ads'))
        self.assertEqual(len(self.driver.find_elements_by_xpath("//a")), 4)
        self.driver.find_element_by_xpath("//a[1]/img")
        self.driver.find_element_by_xpath("//a[2]/img")
        self.driver.find_element_by_xpath("//a[3]/img")
        self.driver.find_element_by_xpath("//a[4]/img")
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[1]").get_attribute("href"), '')
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[2]").get_attribute("href"), '')
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[3]").get_attribute("href"), '')
        self.assertNotEqual(self.driver.find_element_by_xpath("//a[4]").get_attribute("href"), '')

    def test_top_ad_display(self):
        """
        Test that the top ad displays properly
        """
        self.open(reverse('advertisements.views.top_ad'))
        self.assertEqual(len(self.driver.find_elements_by_xpath("//a")), 1)
        self.driver.find_element_by_xpath("//a/img")
        self.assertNotEqual(self.driver.find_element_by_xpath("//a").get_attribute("href"), '')
def on_start_again(self, url):
    driver = PhantomJS()
    driver.get(url)
    time.sleep(2)
    driver.maximize_window()
    t = driver.find_element_by_css_selector('.page-txt').text
    res_t = []
    if t:
        # Total page count; the number of "next" clicks needed is one less than that.
        t = int(t.split('/')[1][:-1]) - 1
        while t:
            t -= 1
            move_ele = driver.find_element_by_css_selector('#next')
            # perform() is required, otherwise the queued click is never executed
            ActionChains(driver).move_to_element(move_ele).click().perform()
            time.sleep(1)
            res_t.append(driver.page_source)
    driver.quit()
    for item in res_t:
        self.step_first(item)
class Premiumgeneratorlink(object):
    def __init__(self, url):
        self.url = url
        self.browser = PhantomJS()

    def get_link(self):
        try:
            self.browser.get('http://premiumgeneratorlink.com/')
            self.browser.find_element_by_name('link').send_keys(self.url)
            self.browser.find_element_by_xpath('//a[@class="input"]').click()
            wdw = WebDriverWait(self.browser, 10)
            wdw.until(EC.element_to_be_clickable((By.ID, 'check'))).click()
            wdw.until(EC.element_to_be_clickable((By.ID, 'generate'))).click()
            link = wdw.until(EC.visibility_of_element_located(
                (By.XPATH, '//form[@class="center"]'))).get_attribute('action')
        except (WebDriverException, NoSuchElementException, TimeoutException):
            return False
        finally:
            self.browser.quit()
        return link
def check_agree(link, soup):
    # Agree if asked to (click on accept)
    if soup.find('input', {'id': 'ctl00_mainContentArea_disclaimerContent_yesButton'}):
        print("Agreeing to the terms of use - please wait...")
        driver = PhantomJS(r'.\phantomjs.exe' if platform.startswith('win32') else './phantomjs')
        driver.get(link)
        driver.find_element_by_id(
            'ctl00_mainContentArea_disclaimerContent_yesButton').click()
        for cookie in driver.get_cookies():
            s.cookies.set(cookie['name'], cookie['value'])
        driver.quit()
        resp_inner = s.get(link)
        soup = Soup(resp_inner.text, features="lxml")
        print("Done, now let's get back to the scraping process.")
    return soup
class Leecherus(object):
    def __init__(self, url):
        self.url = url
        self.browser = PhantomJS()

    def get_link(self):
        try:
            self.browser.get('http://leecher.us')
            wdw = WebDriverWait(self.browser, 10)
            wdw.until(EC.visibility_of_element_located((By.NAME, 'link'))).send_keys(self.url)
            wdw.until(EC.element_to_be_clickable((By.XPATH, '//button[@class="subscribe"]'))).click()
            wdw.until(EC.element_to_be_clickable((By.XPATH, '//input[@class="subscribe"]'))).click()
            self.browser.switch_to_window(self.browser.window_handles[1])
            onclick = wdw.until(EC.element_to_be_clickable((By.ID, 'get_link'))).get_attribute('onclick')
        except (WebDriverException, NoSuchElementException, TimeoutException, IndexError):
            return False
        finally:
            self.browser.quit()
        m = re.search("'(http://[^']+)'", onclick)
        return m.group(1) if m else False
class PagesCrawler(BaseSpider): name = 'pages' link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[]) ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS]) def __init__(self, **kw): args = DEFAULT_INPUT.copy() args.update(kw) self.args = args self.start_urls = to_list(args['start_urls']) self.maxdepth = int(args['maxdepth']) self.follow_prefixes = to_list(args['follow_prefixes']) self.nofollow_prefixes = to_list(args['nofollow_prefixes']) self.discover_prefixes = [ url_to_lru_clean( "http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's'] ] self.resolved_links = {} self.user_agent = args['user_agent'] self.phantom = 'phantom' in args and args[ 'phantom'] and args['phantom'].lower() != "false" if self.phantom: self.ph_timeout = int( args.get('phantom_timeout', PHANTOM['TIMEOUT'])) self.ph_idle_timeout = int( args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT'])) self.ph_ajax_timeout = int( args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT'])) self.errors = 0 dispatcher.connect(self.closed, spider_closed) dispatcher.connect(self.crashed, spider_error) def start_requests(self): self.log( "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO) self.log("ARGUMENTS : " + str(self.args), log.INFO) if self.phantom: self.init_phantom() for url in self.start_urls: yield self._request(url) def init_phantom(self): self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'), HYPHE_PROJECT, self.name, self.crawler.settings['JOBID']) self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO) phantom_args = [] if PROXY and not PROXY.startswith(':'): phantom_args.append('--proxy=%s' % PROXY) phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles) phantom_args.append('--ignore-ssl-errors=true') phantom_args.append('--load-images=false') self.capabilities = dict(DesiredCapabilities.PHANTOMJS) self.capabilities[ 'phantomjs.page.settings.userAgent'] = self.user_agent self.capabilities['takesScreenshot'] = False self.capabilities[ 'phantomjs.page.settings.javascriptCanCloseWindows'] = False self.capabilities[ 'phantomjs.page.settings.javascriptCanOpenWindows'] = False self.phantom = PhantomJS(executable_path=PHANTOM['PATH'], service_args=phantom_args, desired_capabilities=self.capabilities, service_log_path="%s-phantomjs.log" % self.prefixfiles) self.phantom.implicitly_wait(10) self.phantom.set_page_load_timeout(60) self.phantom.set_script_timeout(self.ph_timeout + 15) def crashed(self, spider): self.errors += 1 self.closed("CRASH") def closed(self, reason): if self.errors: self.log( "%s error%s encountered during the crawl." 
% (self.errors, 's' if self.errors > 1 else ''), log.ERROR) if self.phantom: self.phantom.quit() if not self.errors: for f in ["phantomjs-cookie.txt", "phantomjs.log"]: fi = "%s-%s" % (self.prefixfiles, f) if os.path.exists(fi) and not self.errors: os.remove(fi) def handle_response(self, response): lru = url_to_lru_clean(response.url) if self.phantom: self.phantom.get(response.url) # Collect whole DOM of the webpage including embedded iframes with open( os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js: get_bod_w_iframes = js.read() bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Try to scroll and unfold page self.log("Start PhantomJS scrolling and unfolding", log.INFO) with open( os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js: try: signal.signal(signal.SIGALRM, timeout_alarm) signal.alarm(self.ph_timeout + 30) timedout = self.phantom.execute_async_script( js.read(), self.ph_timeout, self.ph_idle_timeout, self.ph_ajax_timeout) signal.alarm(0) if timedout: raise SeleniumTimeout self.log("Scrolling/Unfolding finished", log.INFO) except SeleniumTimeout: self.log( "Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING) self.errors += 1 except WebDriverException as e: err = json.loads(e.msg)['errorMessage'] self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR) self.errors += 1 except Exception as e: self.log( "Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR) self.errors += 1 return self._make_raw_page(response, lru) bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses if response.status == 200 and not isinstance(response, HtmlResponse): try: flags = response.flags if "partial" in flags: flags.remove("partial") flags.append("cleaned") response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images( response.body), flags=flags, request=response.request) self.log( "WARNING: page with base64 embedded images was cleaned-up for links extraction" ) except: pass if 300 < response.status < 400 or isinstance(response, HtmlResponse): return self.parse_html(response, lru) else: return self._make_raw_page(response, lru) def handle_error(self, failure, response=None): if response: p = self._make_raw_page(response, failure.request.url) p['error'] = error_name(failure.value) return p elif not "://www" in failure.request.url: return self._request(failure.request.url.replace('://', '://www.')) error = failure.getErrorMessage() self.log("ERROR : %s" % error, log.ERROR) if PROXY and not PROXY.startswith( ':') and "OpenSSL.SSL.Error" in error: return self._request(failure.request.url, noproxy=True) self.errors += 1 return def parse_html(self, response, lru): lrulinks = [] # handle redirects realdepth = response.meta['depth'] if 300 < response.status < 400: redir_url = response.headers['Location'] if redir_url.startswith('/'): redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url) elif redir_url.startswith( './') or not redir_url.startswith('http'): redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:]) links = [{'url': redir_url}] response.meta['depth'] -= 1 else: try: links = self.link_extractor.extract_links(response) except Exception as e: self.log( "ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR) links = [] self.errors += 1 for link in 
links: try: url = link.url except AttributeError: url = link['url'] try: lrulink = url_to_lru_clean(url) except ValueError, e: self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR) continue lrulinks.append((url, lrulink)) if self._should_follow(response.meta['depth'], lru, lrulink) and \ not url_has_any_extension(url, self.ignored_exts): yield self._request(url) response.meta['depth'] = realdepth yield self._make_html_page(response, lru, lrulinks)
def selenium_opener(url):
    driver = PhantomJS(executable_path='path/to/phantomjs')  # path to the PhantomJS binary
    driver.get(url)
    html = driver.page_source
    driver.quit()
    return html
def get_tickers(stock_list): list = pd.read_csv(stock_list) try: tickers = [c for c in list[list.columns[1]]] except Exception as e: print(e) sys.exit(1) #List for the stocks that had some error being collected error_stocks = [] # Creates directory for the stock data CSV files if not os.path.exists('stock_dfs'): os.makedirs('stock_dfs') global source # When data collecting will start and end for the Dates global start global end print(f'>>>Getting Stock Data from {source} from {end}') #Iterating through each ticker for ticker in tqdm.tqdm(tickers): # Reading data on the stock. If grabbing todays data failes, tries to grab data from yesterday try: df = web.DataReader(ticker, source, start, end) except: #Changing end date to yesterday end = (dt.datetime.now() - dt.timedelta(1)).strftime('%Y-%m-%d') df = web.DataReader(ticker, source, start, end) #High/Low Open/Close percentage df['HL_pct'] = ((df['High'] - df['Low']) / df['Low']) * 100 df['OC_pct'] = ((df['Close'] - df['Open']) / df['Open']) * 100 #Boolinger Band df['Middle Boolinger'] = df['Adj Close'].rolling(20).mean() df['Sup_Boolinger'] = df['Middle Boolinger'] + ( 2 * df['Adj Close'].rolling(20).std()) df['Inf_Boolinger'] = df['Middle Boolinger'] - ( 2 * df['Adj Close'].rolling(20).std()) #Exponential Moving Mean df['Exp20_Close'] = df['Adj Close'].ewm(span=20, adjust=False).mean() #Expantion/Contraction of stock price df['Deviation_band'] = df['Adj Close'].rolling(20).std() #RSI change = df['Adj Close'].diff(1) gain = change.mask(change < 0, 0) loss = change.mask(change > 0, 0) avg_gain = gain.ewm(min_periods=rsi_period, com=rsi_period - 1).mean() avg_loss = loss.ewm(min_periods=rsi_period, com=rsi_period - 1).mean() rs = abs(avg_gain / avg_loss) df['RSI'] = 100 - (100 / (1 + rs)) ''' Now the code will do a webscrape on some pages on yahoo finance to get more details and info. It will do this by table reading or span-string reading since some pages don't have tables. With table reading it's straight up but with span reading we need to get the reactID of each line we want. And for that it's kind of hardcoded, I read through all the span lines and wrote down the useful ones. 
''' #Reading into page resp = requests.get( f'https://finance.yahoo.com/quote/{ticker}/financials') #BeautifulSoup scrapes the page in TXT form soup = bs.BeautifulSoup(resp.text, 'lxml') #Number of span lines we got length = int(np.array(soup.find_all('span')).shape[0]) #All lines with the span class, which has the info we want lines = np.array(soup.find_all('span')) #List to store the span lines that have the reactID codes we want spans = [] #Dates we want to find find_dates = ['12/30/2019', '12/30/2018', '12/30/2017', '12/30/2016'] #List for the dates we actually find dates = [] #Iterating through the lines and grabbing all lines from the span class for line in range(0, length): spans.append(BeautifulSoup(str(lines[line]), features='lxml').span) #Iterating through each date we want to find in the website for date in find_dates: #Iterating through each span-class line for line in range(0, length): #If the text line and date match then put the date in the found dates list if spans[line].string == date: dates.append(spans[line].string) break #Changes date format for indexing with the webreader dataframe for index, date in enumerate(dates): #If any string dpesn't match the format than it's not a date and will be removed try: dates[index] = dt.datetime.strptime( date, "%m/%d/%Y").strftime("%Y-%m-%d") except: #dates.remove will raise exception when there is no more of such content in the list, stopping the loop removed = False while (removed == False): try: dates.remove(dates[index]) except: removed = True #Adding 3 days to the dates, because most stocks don't opperate on the last day of the year. Which is #the date time for the data to appear on the website. for index, date in enumerate(dates): dates[index] = (dt.datetime.strptime(date, '%Y-%m-%d') + dt.timedelta(3)).strftime('%Y-%m-%d') #Info we want to get from the webiste interesting_lines = [ 'Total Revenue', 'Cost of Revenue', 'Gross Profit', 'Selling General and Administrative', 'Total Operating Expenses', 'Operating Income or Loss', 'Interest Expense', 'Total Other Income/Expenses Net', 'Income Before Tax', 'Income Tax Expense', 'Income from Continuing Operations', 'Net Income', 'Net Income available to common shareholders', 'EBITDA' ] #List for the info we actually find on the website infos = [] #List for the ReactIDs of the lines that have the data about the infos above number_ids = [] #Column renaming column_names = [ 'Total Revenue (TTM)', 'Cost of Revenue (TTM)', 'Gross Profit (TTM)', 'Selling General and Administrative Expenses (TTM)', 'Total Operating Expenses (TTM)', 'Operating Income or Loss (TTM)', 'Interest Expense (TTM)', 'Total Other Income/Expenses Net', 'Income Before Tax (TTM)', 'Income Tax Expense (TTM)', 'Income from Coninuing Operations (TTM)', 'Net Income (TTM)', 'Net Income available to Shareholders (TTM)', 'EBITDA (TTM)' ] #Iterating through the informations we want for index, info in enumerate(interesting_lines): #Boolean for if the information was found check = False #Iterating through the span lines for line in range(0, length): #If line contains the information we want, appends it to the found infos list. 
if spans[line].string == info: infos.append(spans[line].string) #Appends the info's reactID +5, one line below, where the numbers and data are number_ids.append( str(int(spans[line]['data-reactid']) + 5)) check = True pass #In case the information isn't found, the respective column name is changed to a NAN, to be removed later if check == False: column_names[index] = np.nan #Removing NANs from column name list column_names = [c for c in column_names if str(c) != 'nan'] #Creating the columns for the information for column in column_names: df[f'{column}'] = np.nan #Iterating through dates, with indexing for index, date in enumerate(dates): #Iterating through new columns, with indexing for column, string in enumerate(column_names): #Iterating through span lines for line in range(0, length): #Fetching data for the respective information column in order if spans[line]['data-reactid'] == number_ids[column]: #Locates the date in dataframe index, formats the string of the data, turns it into a Integer and #puts the data in it's correct place in time. try: df[f'{string}'].loc[dates[index]] = int( (spans[line].string).replace(',', '')) except Exception as e: print(e) print( f'Error formating/alocating string to int for stock {ticker}' ) #Appending to stocks with errors list error_stocks.append(ticker) continue #Adding 2 to the IDs for each iteration so we get the lines of previous dates for the information number_ids = [int(c) for c in number_ids] number_ids = [c + 2 for c in number_ids] number_ids = [str(c) for c in number_ids] #Page URL that we will pass to PhantomJS url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics' #Initiating PhantomJS driver = PhantomJS(executable_path=r'phantomjs.exe') #Opening URL with PhantomJS to fully load the page driver.get(url) #Returning page source after all the JavaScript codes have been loaded resp = driver.page_source #Closing PhantomJS driver.quit() #List of tables that Pandas found in the web page dfs = pd.read_html(resp) #Dataframe to put all the tables in just one key_stats = pd.DataFrame() #Iterating through the tables for dframe in dfs: #If dataframe is empty, passes the first table if key_stats.empty: key_stats = dframe #If it already has a table, appends the new ones else: key_stats = key_stats.append(dframe) #Fixing dataframe index, with numbers from 0 to length of dataframe key_stats.index = [c for c in range(0, key_stats.shape[0])] #There´s some info that we don´t have interest so we drop what we don´t need stats = key_stats.loc[:8] #Removing columns 0 and 1 stats = stats.drop([0, 1], axis=1) #Passing the information names as the dataframe index stats.index = [c for c in stats['Unnamed: 0'].values] #Removing the column with information names, since it´s all in the index stats = stats.drop(['Unnamed: 0'], axis=1) #Transposing the dataframe, so that the Dates become the index and the information names become the column stats = stats.transpose() #Criating the new columns in the main dataframe for column in stats.columns: df[f'{column}'] = np.nan #Putting all the dates in a list dates = [c for c in stats.index] #Iterating through the dates for index, date in enumerate(dates): #Changing date format try: dates[index] = dt.datetime.strptime( date, "%m/%d/%Y").strftime("%Y-%m-%d") except: #One of the dates actually has more things than the date so we remove all that date = date.replace('As of Date: ', '') date = date.replace('Current', '') dates[index] = dt.datetime.strptime( date, "%m/%d/%Y").strftime("%Y-%m-%d") #Adding 3 days because stocks 
don´t opperate in the last day of the year for index, date in enumerate(dates): dates[index] = (dt.datetime.strptime(date, '%Y-%m-%d') + dt.timedelta(3)).strftime('%Y-%m-%d') #Passing changed dates back into the dataframe´s index stats.index = dates #Iterating through dates again for date in stats.index: #Iterating through the new columns for column in stats.columns: #Locating the dates and columns in the main dataframe and putting the respetive data in it´s place try: df[f'{column}'].loc[date] = stats[f'{column}'].loc[date] #If any errr occurs in this process, shows the error for the respective stock and adds it to the #stocks-with-error list except Exception as e: print(e) print( f'Error formating/alocating string to int for stock {ticker}' ) #Appending to stocks with errors list error_stocks.append(ticker) ''' Since we only have info year by year and the .loc funtion only puts the data in the specific index, we need to fill the NANs with the previous data that isn't a NAN (ffill method). This way, from each data alocated, all future lines will have this exact data, until a new data (the most recent) appears, and the process repeats. ''' df.fillna(method='ffill', inplace=True) # Saving csv file df.to_csv('stock_dfs/{}.csv'.format(ticker)) #Showing any stocks with errors if there are any if error_stocks != []: print('\n ------ Inspect Errors ------- \n') print([c for c in error_stocks])
class gmail(Thread): def __init__(self, account): name = account['name'] super().__init__(name=name) # Thread __init__ lg.warning('{0[name]}, proxy: {0[Proxy]}'.format(account)) self.account = account self.solved = 0 if 0: # Getting cookies snippet print(self.driver.get_cookies()) cookies = { _['name']: _['value'] for _ in self.driver.get_cookies() } with open('cookies.json', 'w') as f: dump(cookies, f, indent=4) def verify(self, el): '''Verifies the account. May be untrivial:(''' text = el.text # get_attribute('value') lg.info('Text: {}'.format(text)) if text == "Verify it's you": lg.debug('Verify') #el=self.driver.find_element_by_id('identifierNext') el = self.driver.find_element_by_xpath( '//div[.="Confirm your recovery email"]') print(el) el.click() el = WebDriverWait(self.driver, 3).until( EC.visibility_of_element_located( (By.NAME, 'knowledgePreregisteredEmailResponse'))) el.send_keys(account[2]) # recovery email def login(self): if 0: # to test #'https://www.whoishostingthis.com/tools/user-agent/' self.driver.get('about:about') sleep(1000) #self.driver.get('https://mail.google.com') self.driver.get( 'https://accounts.google.com/signin/v2/identifier?continue=https%3A%2F%2Fmail.google.com%2Fmail%2F&service=mail&sacu=1&rip=1&flowName=GlifWebSignIn&flowEntry=ServiceLogin' ) prefilled = False lg.debug('Logging in with {}'.format(self.account)) try: el = WebDriverWait(self.driver, 2).until( EC.visibility_of_element_located((By.ID, 'identifierId'))) except TimeoutException: prefilled = True if prefilled: lg.info('Username prefilled already') else: lg.debug('Entering username') el.send_keys(self.account['name']) # username nxt = self.driver.find_element_by_id('identifierNext') nxt.click() logged_in = False try: el = WebDriverWait(self.driver, 20).until( EC.visibility_of_element_located((By.NAME, 'password'))) except TimeoutException: # We're logged in? # TODO: Check for something visible after being logged in # Because we may genuinely be in timeout logged_in = True if logged_in: lg.info('Logged in already') else: lg.debug('Entering password') el.send_keys(self.account['Second Password']) nxt = WebDriverWait(self.driver, 5).until( EC.element_to_be_clickable((By.ID, 'passwordNext'))) nxt.click() # WebDriverWait(self.driver, 60).until( # EC.frame_to_be_available_and_switch_to_it((By.ID, 'tab1_1')) # ) try: el = WebDriverWait(self.driver, 3).until( EC.visibility_of_element_located((By.ID, 'headingText'))) #open('1.html','w').write(self.driver.page_source) self.verify(el) except TimeoutException: # We're in pass def screenshot(self, name): self.driver.save_screenshot('{}/{}-{}.png'.format( getcwd(), self.account['name'], name)) def solve(self): '''Solve the captcha one time''' WebDriverWait(self.driver, 30).until( EC.frame_to_be_available_and_switch_to_it( (By.XPATH, '//iframe[@title="recaptcha widget"]'))) el = WebDriverWait(self.driver, 20).until( EC.element_to_be_clickable( (By.CSS_SELECTOR, 'div.recaptcha-checkbox-checkmark'))) #lg.info(el el.click() lg.debug('Clicked solve box') def check_style(driver, el): '''Now need to see what happened there. 
Check an attribute to see if we're successful.''' attr = el.get_attribute('aria-checked') lg.debug(attr) return attr == 'true' lg.debug('Before check_style') timeout = False try: WebDriverWait(self.driver, 20).until(lambda driver: check_style( driver, self.driver.find_element_by_id('recaptcha-anchor'))) except TimeoutException: timeout = True # Next (very soon) we'll see what happened lg.debug('Final: ' + self.driver.find_element_by_id( 'recaptcha-anchor').get_attribute('aria-checked')) self.driver.switch_to.default_content() if timeout: lg.warning('Timeout') self.screenshot('timeout') el = self.driver.find_element_by_xpath( '//iframe[@title="recaptcha challenge"]') #set_trace() self.driver.switch_to.frame(el) l = len(self.driver.page_source) lg.debug(l) with open('recaptcha_main.html', 'w') as f: f.write(self.driver.page_source) if l > 10000: lg.warning('Captcha') self.screenshot('captcha') return True # Need to quit self.driver.switch_to.default_content() self.driver.refresh() else: el = self.driver.find_element_by_id('submit') el.click() # Submit button lg.info('Clicked submit') lg.debug('Before staleness') WebDriverWait(self.driver, 10, poll_frequency=0.1).until(EC.staleness_of(el)) lg.debug('After staleness') def create_driver(self): if 1: caps = DesiredCapabilities().FIREFOX.copy() profile_path = path.expanduser( '~') + '/.mozilla/firefox/' + self.account['name'] # caps['proxy'] = { caps['moz:firefoxOptions'] = { "args": ["-profile", profile_path], # geckodriver 0.18+ } profile = FirefoxProfile(profile_path) #profile.set_preference("general.useragent.override", 'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0') self.driver = Firefox(profile, capabilities=caps) #self.driver = Firefox(profile) else: # PhantomJS # https://github.com/detro/ghostdriver caps = DesiredCapabilities().PHANTOMJS caps["phantomjs.page.settings.userAgent"] = \ 'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0' service_args = [ '--proxy={}'.format(':'.join( self.account['Proxy'].split(':')[:2])), '--proxy-type=http', ] print(service_args) self.driver = PhantomJS(service_args=service_args, capabilities=caps) self.driver.set_window_size(1120, 550) #profile.set_preference("general.useragent.override","Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16") #profile.set_preference("general.useragent.override","Mozilla/5.0 (X11; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0") # profile.set_preference("browser.startup.homepage_override.mstone", "ignore"); # profile.set_preference("startup.homepage_welcome_url.additional", "about:blank"); # profile.set_preference("xpinstall.signatures.required", "false"); # profile.set_preference("toolkit.telemetry.reportingpolicy.firstRun", "false"); def run(self): '''Login and run in cycle''' self.create_driver() try: self.login() tosleep=datetime.combine( date.today(), dt_time(drophour,00,5,tzinfo=timezone.utc))-\ datetime.now(timezone.utc) tosleep = tosleep.seconds lg.info('Sleeping for {}'.format(tosleep)) if '/pooh/' in path.expanduser('~'): tosleep = 0 # don't sleep on developer's host if not debug: sleep(tosleep) # Creating new window to work in (otherwise sometimes the page will ask whether we're ok to leave it) self.driver.execute_script( '''window.open('{}',"_blank");'''.format(solve_url)) self.driver.switch_to.window(self.driver.window_handles[-1]) lg.debug('Created new window') # Cycle here getting tokens until there are no more nocaptcha 
start_time = end_time = time() # In case we have exception while True: #for i in range(1): if self.solve(): break self.solved += 1 end_time = time() except: lg.exception('In run') self.screenshot('exception') finally: lg.warning('Closing driver') with suppress(WebDriverException): self.driver.quit() rate = (end_time - start_time) / self.solved if self.solved else 0 lg.warning('Solved: {} ({:.2f})'.format(self.solved, rate))
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount):
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    driver.get(retail_invoice_url)

    # 1 Set doc_type 'select'
    try:
        select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
        value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
        select_doc_type.select_by_value(value)
        # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
        # select_doc_type.select_by_visible_text(name)
    except Exception:
        print('ERROR: set doc_type select as Boleta')
        driver.save_screenshot('screen.png')
        return '', ''

    time.sleep(5)

    # 2 Get recaptcha img url
    try:
        recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
        recaptcha_img_url = recaptcha_img.get_attribute('src')
    except Exception:
        print('ERROR: get recaptcha image url')
        driver.save_screenshot('screen.png')
        return '', ''

    # 3 Solve recaptcha
    v = VisionApi()
    recaptcha_value = v.detect_text_from_url(recaptcha_img_url)
    if recaptcha_value is None:
        print('ERROR: solving recaptcha image')
        driver.save_screenshot('screen.png')
        return '', ''

    # 4 Fill form
    script = u"""
        document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
        document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
        document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
        document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
    """.format(
        invoice_id=invoice_id,
        invoice_date=invoice_date,
        invoice_amount=invoice_amount,
        recaptcha_value=recaptcha_value,
    )
    driver.execute_script(script)

    # 5 Submit form
    try:
        driver.find_element_by_name('frmDatos').submit()
    except Exception:
        print('ERROR: submitting form')
        driver.save_screenshot('screen.png')
        return '', ''

    # 6 Get url files
    try:
        xml_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
        pdf_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')
        xml_url = xml_a_tag.get_attribute('href')
        pdf_url = pdf_a_tag.get_attribute('href')
    except Exception:
        print('ERROR: getting url files')
        driver.save_screenshot('screen.png')
        return '', ''

    # 8 Delete driver session
    driver.close()
    driver.quit()

    return xml_url, pdf_url
class ProviderAdvancedViewTests(LiveServerTestCase):
    def setUp(self):
        self.driver = PhantomJS()
        self.user = User.objects.create_user('admin', '*****@*****.**', 'password')
        self.user.save()
        self.provider = Provider(
            name='provider',
            user=self.user,
        )
        self.provider.save()
        self.provider_adverts = mommy.make(Advertisement, _quantity=20, provider=self.provider)
        self.login()

    def tearDown(self):
        self.driver.quit()

    def open(self, url):
        self.driver.get("%s%s" % (self.live_server_url, url))

    def login(self):
        self.open(settings.LOGIN_URL)
        self.driver.find_element_by_id("id_username").send_keys("admin")
        self.driver.find_element_by_id("id_password").send_keys("password")
        self.driver.find_element_by_css_selector("button.btn.btn-default").click()
        self.assertEqual(
            self.driver.current_url,
            self.live_server_url + reverse('advertisements.views.view_provider_statistics',
                                           args=[self.provider.pk]),
        )

    def test_can_login(self):
        """
        Test that the user can login
        """
        pass

    def test_provider_page_has_all_data(self):
        """
        Test that the provider statistics page has all the correct data
        """
        self.open(reverse('advertisements.views.view_provider_statistics', args=[self.provider.pk]))

        self.assertEqual("Open Ads", self.driver.title)
        self.assertIn(
            "{0} advertisements".format(self.provider.name),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )
        self.assertIn(
            "{0} advertisements in rotation".format(20),
            self.driver.find_element_by_css_selector("h1.page-header").text
        )

    def test_advertisement_page_has_all_data(self):
        """
        Test that the advertisement page has all the correct data
        """
        for advert in self.provider_adverts:
            self.open(reverse('advertisements.views.view_advert_statistics', args=[advert.pk]))

            self.assertIn(
                "ID number: {0}".format(advert.pk),
                self.driver.find_element_by_css_selector("h1.page-header").text,
            )
            self.driver.find_element_by_css_selector("img")
            self.assertEqual("Active", self.driver.find_element_by_xpath("//td[2]/span").text)
            self.assertEqual(advert.url, self.driver.find_element_by_link_text(advert.url).text)

            self.driver.find_element_by_link_text("Edit URL").click()
            self.assertEqual(advert.url, self.driver.find_element_by_id("id_url").get_attribute("value"))
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = PhantomJS()
driver.set_window_size(1120, 550)
driver.get("https://duckduckgo.com/")
driver.find_element_by_id('search_form_input_homepage').send_keys("realpython")
driver.find_element_by_id("search_button_homepage").click()
print(driver.current_url)
driver.quit()
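PhantomJS support was later deprecated in Selenium, so a maintained headless browser can stand in for the snippet above. A rough equivalent with headless Chrome, assuming chromedriver is installed and on PATH:

from selenium.webdriver import Chrome, ChromeOptions

options = ChromeOptions()
options.add_argument("--headless")   # run without a visible window, like PhantomJS
driver = Chrome(options=options)     # assumes chromedriver is available on PATH
driver.set_window_size(1120, 550)
driver.get("https://duckduckgo.com/")
driver.find_element_by_id('search_form_input_homepage').send_keys("realpython")
driver.find_element_by_id("search_button_homepage").click()
print(driver.current_url)
driver.quit()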
def main(): os.makedirs(dlDir, exist_ok=True) startCatIdx = int(sys.argv[1]) if len(sys.argv) > 1 else 0 startFamIdx = int(sys.argv[2]) if len(sys.argv) > 2 else 0 startPrdIdx = int(sys.argv[3]) if len(sys.argv) > 3 else 0 executor = ThreadPoolExecutor() PhantomJS.waitClickable = waitClickable driver = PhantomJS() # harvest_utils.driver = driver with open('netgear_filelist.csv', 'w') as fout: cw = csv.writer(fout) cw.writerow([ 'model', 'fw_ver', 'fileName', 'fw_url', 'fw_date', 'fileSize', 'sha1', 'md5' ]) driver.get('http://downloadcenter.netgear.com/') # click DrillDown driver.waitClickable( '#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_BasicSearchPanel_btnAdvancedSearch' ).click() # noqa ctl00 = "#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_adsPanel_" # noqa ignore=E501 # # wait Page2 try: catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory")) numCat = len(catSel.options) for catIdx in range(startCatIdx, numCat): catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory")) print('catIdx=', catIdx) catTxt = catSel.options[catIdx].text uprint('catTxt= ' + catTxt) oldText = driver.getText(ctl00 + "lbProductFamily") catSel.select_by_index(catIdx) driver.waitTextChanged(ctl00 + "lbProductFamily", oldText) famSel = Select(driver.waitClickable(ctl00 + "lbProductFamily")) numFam = len(famSel.options) for famIdx in range(startFamIdx, numFam): famSel = Select( driver.waitClickable(ctl00 + "lbProductFamily")) # noqa print('famIdx=', famIdx) startFamIdx = 0 famTxt = famSel.options[famIdx].text uprint('famTxt= ' + famTxt) oldText = driver.getText(ctl00 + "lbProduct") famSel.select_by_index(famIdx) driver.waitTextChanged(ctl00 + "lbProduct", oldText) prdSel = Select(driver.waitClickable(ctl00 + "lbProduct")) numPrd = len(prdSel.options) for prdIdx in range(startPrdIdx, numPrd): prdSel = Select(driver.waitClickable(ctl00 + "lbProduct")) startPrdIdx = 0 print("catIdx,famIdx,prdIdx=%d, %d, %d" % (catIdx, famIdx, prdIdx)) prdTxt = prdSel.options[prdIdx].text uprint('cat,fam,prd="%s","%s","%s"' % (catTxt, famTxt, prdTxt)) # noqa ignore=E501 prdWaiting = driver.waitElem( ctl00 + "upProgProductLoader > div > img") # noqa ignore=E501 prdSel.select_by_index(prdIdx) try: WebDriverWait(driver, 1, 0.5).\ until(lambda x: prdWaiting.is_displayed() is True) except TimeoutException: pass try: WebDriverWait(driver, 5, 0.5).\ until(lambda x: prdWaiting.is_displayed() is False) except TimeoutException as ex: pass numResults = driver.waitText( ctl00 + "lvwAllDownload_lblAllDownloadResult", 3, 0.5) # noqa ignore=E501 if numResults is None: continue numResults = int(re.search(r"\d+", numResults).group(0)) print('numResults=', numResults) if numResults > 10: driver.waitClickable("#lnkAllDownloadMore", 3).click() try: erItems = driver.getElems( 'a.register-product.navlistsearch', 3, 0.5) # noqa except TimeoutException: erItems = driver.getElems( 'div#LargeFirmware > ul > li > div > p > a.navlistsearch', 3) # noqa ignore=E501 if len(erItems) != numResults: print('Error, numResults=%d, but len(erItems)=%d' % (numResults, len(erItems))) for itemIdx, erItem in enumerate(erItems): if not erItem.is_displayed(): print('itemIdx=%d is not displayed()' % itemIdx) continue erItem.getItemText = getItemText desc = erItem.getElemText(erItem) uprint('desc="%s"' % desc) if 'firmware' not in desc.lower(): continue fw_url = erItem.get_attribute('data-durl') if not fw_url: fw_url = erItem.get_attribute('fw_url') print('fw_url=', fw_url) if not fw_url: continue if not fw_url.startswith('http'): 
print('Error: fw_url=', fw_url) continue executor.submit(download_file, prdTxt, desc, fw_url) except BaseException as ex: traceback.print_exc() import pdb pdb.set_trace() driver.save_screenshot("netgear_crawler2") finally: driver.quit() executor.shutdown(True)
# Copy it to a local file; mode 'wb' means write in binary.
path = Path('./pic')
path.mkdir(exist_ok=True)
path = path / url_info[1]
with open(path, 'wb') as f:
    f.write(img)

driver = PhantomJS()  # create the browser driver object
driver.get('http://zxgk.court.gov.cn/shixin/')  # open the target page with a GET request
i = 1
j = 10000
while j >= 0:
    a = driver.find_element_by_id("captchaImg")
    url = (a.get_attribute('src'))
    pic_name = f"{i}.png"
    try:
        download_img([url, pic_name])
    except Exception as e:
        print(e)
        continue
    print(f"{pic_name} downloaded successfully; {i} captcha images fetched so far")
    i += 1
    j -= 1
    ActionChains(driver).move_to_element(a).click().perform()
    time.sleep(2)  # throttle requests to avoid getting the IP banned
driver.quit()  # remember to close the browser, otherwise the driver process stays in memory
class PagesCrawler(BaseSpider): name = 'pages' link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[]) ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS]) def __init__(self, **kw): args = DEFAULT_INPUT.copy() args.update(kw) self.args = args self.start_urls = to_list(args['start_urls']) self.maxdepth = int(args['maxdepth']) self.follow_prefixes = to_list(args['follow_prefixes']) self.nofollow_prefixes = to_list(args['nofollow_prefixes']) self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']] self.resolved_links = {} self.user_agent = args['user_agent'] self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false" if self.phantom: self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT'])) self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT'])) self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT'])) self.errors = 0 dispatcher.connect(self.closed, spider_closed) dispatcher.connect(self.crashed, spider_error) def start_requests(self): self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO) self.log("ARGUMENTS : "+str(self.args), log.INFO) if self.phantom: self.init_phantom() for url in self.start_urls: yield self._request(url) def init_phantom(self): self.prefixfiles = os.path.join( scrapyd_config().get('logs_dir'), HYPHE_PROJECT, self.name, self.crawler.settings['JOBID'] ) self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO) phantom_args = [] if PROXY and not PROXY.startswith(':'): phantom_args.append('--proxy=%s' % PROXY) phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles) phantom_args.append('--ignore-ssl-errors=true') phantom_args.append('--load-images=false') self.capabilities = dict(DesiredCapabilities.PHANTOMJS) self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent self.capabilities['takesScreenshot'] = False self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False self.phantom = PhantomJS( executable_path=PHANTOM['PATH'], service_args=phantom_args, desired_capabilities=self.capabilities, service_log_path="%s-phantomjs.log" % self.prefixfiles ) self.phantom.implicitly_wait(10) self.phantom.set_page_load_timeout(60) self.phantom.set_script_timeout(self.ph_timeout + 15) def crashed(self, spider): self.errors += 1 self.closed("CRASH") def closed(self, reason): if self.errors: self.log("%s error%s encountered during the crawl." 
% (self.errors, 's' if self.errors > 1 else ''), log.ERROR) if self.phantom: self.phantom.quit() if not self.errors: for f in ["phantomjs-cookie.txt", "phantomjs.log"]: fi = "%s-%s" % (self.prefixfiles, f) if os.path.exists(fi) and not self.errors: os.remove(fi) def handle_response(self, response): lru = url_to_lru_clean(response.url) if self.phantom: self.phantom.get(response.url) # Collect whole DOM of the webpage including embedded iframes with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js: get_bod_w_iframes = js.read() bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Try to scroll and unfold page self.log("Start PhantomJS scrolling and unfolding", log.INFO) with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js: try: signal.signal(signal.SIGALRM, timeout_alarm) signal.alarm(self.ph_timeout + 30) timedout = self.phantom.execute_async_script( js.read(), self.ph_timeout, self.ph_idle_timeout, self.ph_ajax_timeout) signal.alarm(0) if timedout: raise SeleniumTimeout self.log("Scrolling/Unfolding finished", log.INFO) except SeleniumTimeout: self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING) self.errors += 1 except WebDriverException as e: err = json.loads(e.msg)['errorMessage'] self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR) self.errors += 1 except Exception as e: self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR) self.errors += 1 return self._make_raw_page(response, lru) bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses if response.status == 200 and not isinstance(response, HtmlResponse): try: flags = response.flags if "partial" in flags: flags.remove("partial") flags.append("cleaned") response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request) self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction") except: pass if 300 < response.status < 400 or isinstance(response, HtmlResponse): return self.parse_html(response, lru) else: return self._make_raw_page(response, lru) def handle_error(self, failure, response=None): if response: p = self._make_raw_page(response, failure.request.url) p['error'] = error_name(failure.value) return p elif not "://www" in failure.request.url: return self._request(failure.request.url.replace('://', '://www.')) self.log("ERROR : %s" % failure.getErrorMessage(), log.ERROR) self.errors += 1 return def parse_html(self, response, lru): lrulinks = [] # handle redirects realdepth = response.meta['depth'] if 300 < response.status < 400: redir_url = response.headers['Location'] if redir_url.startswith('/'): redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url) elif redir_url.startswith('./') or not redir_url.startswith('http'): redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:]) links = [{'url': redir_url}] response.meta['depth'] -= 1 else: try: links = self.link_extractor.extract_links(response) except Exception as e: self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR) links = [] self.errors += 1 for link in links: try: url = link.url except AttributeError: url = link['url'] try: lrulink = url_to_lru_clean(url) except ValueError, e: self.log("Error converting 
URL %s to LRU: %s" % (url, e), log.ERROR) continue lrulinks.append((url, lrulink)) if self._should_follow(response.meta['depth'], lru, lrulink) and \ not url_has_any_extension(url, self.ignored_exts): yield self._request(url) response.meta['depth'] = realdepth yield self._make_html_page(response, lru, lrulinks)
class PagesCrawler(Spider): name = 'pages' link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[]) ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS]) def __init__(self, **kwargs): mongo = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][MONGO_JOBS_COL] job = mongo.find_one({"_id": kwargs["job_id"]}) args = job["crawl_arguments"] self.args = args self.start_urls = to_list(args['start_urls']) self.maxdepth = int(args['max_depth']) self.follow_prefixes = to_list(args['follow_prefixes']) self.nofollow_prefixes = to_list(args['nofollow_prefixes']) self.prefixes_trie = LRUTrie() for p in self.follow_prefixes: self.prefixes_trie.set_lru(p, True) for p in self.nofollow_prefixes: self.prefixes_trie.set_lru(p, False) self.discover_prefixes = [ url_to_lru_clean( "http%s://%s" % (https, u.replace('http://', '').replace('https://', '')), TLDS_TREE) for u in to_list(args['discover_prefixes']) for https in ['', 's'] ] self.resolved_links = {} self.user_agent = args['user_agent'] self.phantom = 'phantom' in args and args[ 'phantom'] and args['phantom'].lower() != "false" self.cookies = None if 'cookies' in args and args["cookies"]: self.cookies = dict( cookie.split('=', 1) for cookie in re.split(r'\s*;\s*', args['cookies']) if '=' in cookie) if self.phantom: self.ph_timeout = int( args.get('phantom_timeout', PHANTOM['TIMEOUT'])) self.ph_idle_timeout = int( args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT'])) self.ph_ajax_timeout = int( args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT'])) self.errors = 0 @classmethod def from_crawler(cls, crawler, *args, **kwargs): spider = super(PagesCrawler, cls).from_crawler(crawler, *args, **kwargs) crawler.signals.connect(spider.spider_closed, signal=spider_closed) crawler.signals.connect(spider.spider_crashed, signal=spider_error) return spider def start_requests(self): self.log( "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], logging.INFO) self.log("ARGUMENTS : " + str(self.args), logging.INFO) if self.phantom: self.init_phantom() for url in self.start_urls: yield self._request(url) def init_phantom(self): self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'), HYPHE_PROJECT, self.name, self.crawler.settings['JOBID']) self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, logging.INFO) phantom_args = [] if PROXY and not PROXY.startswith(':'): phantom_args.append('--proxy=%s' % PROXY) phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles) phantom_args.append('--ignore-ssl-errors=true') phantom_args.append('--load-images=false') self.capabilities = dict(DesiredCapabilities.PHANTOMJS) self.capabilities[ 'phantomjs.page.settings.userAgent'] = self.user_agent self.capabilities['takesScreenshot'] = False self.capabilities[ 'phantomjs.page.settings.javascriptCanCloseWindows'] = False self.capabilities[ 'phantomjs.page.settings.javascriptCanOpenWindows'] = False self.phantom = PhantomJS(executable_path=PHANTOM['PATH'], service_args=phantom_args, desired_capabilities=self.capabilities, service_log_path="%s-phantomjs.log" % self.prefixfiles) self.phantom.implicitly_wait(10) self.phantom.set_page_load_timeout(60) self.phantom.set_script_timeout(self.ph_timeout + 15) def spider_crashed(self, spider): self.errors += 1 self.spider_closed(spider, reason="CRASH") def spider_closed(self, spider, reason=""): if self.errors: self.log( "%s error%s encountered during the crawl (%s)." 
% (self.errors, 's' if self.errors > 1 else '', reason), logging.ERROR) if self.phantom: self.phantom.quit() if not self.errors: for f in ["phantomjs-cookie.txt", "phantomjs.log"]: fi = "%s-%s" % (self.prefixfiles, f) if os.path.exists(fi) and not self.errors: os.remove(fi) def handle_response(self, response): lru = url_to_lru_clean(response.url, TLDS_TREE) if self.phantom: self.phantom.get(response.url) # Collect whole DOM of the webpage including embedded iframes with open( os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js: get_bod_w_iframes = js.read() bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Try to scroll and unfold page self.log("Start PhantomJS scrolling and unfolding", logging.INFO) with open( os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js: try: signal.signal(signal.SIGALRM, timeout_alarm) signal.alarm(self.ph_timeout + 30) timedout = self.phantom.execute_async_script( js.read(), self.ph_timeout, self.ph_idle_timeout, self.ph_ajax_timeout) signal.alarm(0) if timedout: raise SeleniumTimeout self.log("Scrolling/Unfolding finished", logging.INFO) except SeleniumTimeout: self.log( "Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, logging.WARNING) self.errors += 1 except WebDriverException as e: err = json.loads(e.msg)['errorMessage'] self.log("Scrolling/Unfolding crashed: %s" % err, logging.ERROR) self.errors += 1 except Exception as e: self.log( "Scrolling/Unfolding crashed: %s %s" % (type(e), e), logging.ERROR) self.errors += 1 return self._make_raw_page(response, lru) bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes) response._set_body(bod_w_iframes.encode('utf-8')) # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses if response.status == 200 and not isinstance(response, HtmlResponse): try: flags = response.flags if "partial" in flags: flags.remove("partial") flags.append("cleaned") response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images( response.body), flags=flags, request=response.request) self.log( "WARNING: page with base64 embedded images was cleaned-up for links extraction" ) except: pass if 300 < response.status < 400 or isinstance(response, HtmlResponse): return self.parse_html(response, lru) else: return self._make_raw_page(response, lru) def handle_error(self, failure, response=None): if response: p = self._make_raw_page(response, failure.request.url) p['error'] = error_name(failure.value) return p elif not "://www" in failure.request.url: return self._request(failure.request.url.replace('://', '://www.')) error = failure.getErrorMessage() self.log("ERROR : %s" % error, logging.ERROR) if PROXY and not PROXY.startswith( ':') and "OpenSSL.SSL.Error" in error: return self._request(failure.request.url, noproxy=True) self.errors += 1 return def parse_html(self, response, lru): lrulinks = [] # handle redirects realdepth = response.meta['depth'] if 300 < response.status < 400: redir_url = response.headers['Location'] if redir_url.startswith('/'): redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url) elif redir_url.startswith('../'): lrustart = lru[:lru.rfind('|p:')] while redir_url.startswith('../'): lrustart = lrustart[:lrustart.rfind('|p:')] redir_url = redir_url[3:] redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url) elif redir_url.startswith( './') or not redir_url.startswith('http'): redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), 
redir_url[1:]) links = [{'url': redir_url}] response.meta['depth'] -= 1 else: try: links = self.link_extractor.extract_links(response) except Exception as e: self.log( "ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), logging.ERROR) links = [] self.errors += 1 for link in links: try: url = link.url except AttributeError: url = link['url'] try: lrulink = url_to_lru_clean(url, TLDS_TREE) except (ValueError, IndexError) as e: self.log("Error converting URL %s to LRU: %s" % (url, e), logging.ERROR) continue lrulinks.append((url, lrulink)) if self._should_follow(response.meta['depth'], lrulink) and \ not url_has_any_extension(url, self.ignored_exts): yield self._request(url) response.meta['depth'] = realdepth yield self._make_html_page(response, lru, lrulinks) def _make_html_page(self, response, lru, lrulinks): p = self._make_raw_page(response, lru) if STORE_HTML: p['body'] = Binary(response.body.encode('zip')) p['lrulinks'] = lrulinks return p def _make_raw_page(self, response, lru): p = self._new_page(response.url, lru) p['status'] = response.status p['size'] = len(response.body) if isinstance(response, HtmlResponse): p['encoding'] = response.encoding if response.meta.get('depth'): p['depth'] = response.meta['depth'] if response.headers.get('content-type'): p['content_type'] = response.headers.get('content-type').partition( ';')[0] p['error'] = None return p def _new_page(self, url, lru=None): if lru is None: lru = url_to_lru_clean(url, TLDS_TREE) p = Page() p['url'] = url p['lru'] = lru p['depth'] = 0 p['timestamp'] = int(time.time() * 1000) return p def _should_follow(self, depth, tolru): c1 = depth < self.maxdepth c2 = self.prefixes_trie.match_lru(tolru) return c1 and c2 def _request(self, url, noproxy=False, **kw): kw['meta'] = {'handle_httpstatus_all': True, 'noproxy': noproxy} kw['callback'] = self.handle_response kw['errback'] = self.handle_error if self.cookies: kw['cookies'] = self.cookies if self.phantom: kw['method'] = 'HEAD' return Request(url, **kw)
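# A standalone sketch of the PhantomJS setup used by PagesCrawler.init_phantom
# above (capabilities copied from DesiredCapabilities.PHANTOMJS, a user-agent
# override, proxy/cookie/image service args, generous timeouts). It assumes
# selenium < 4, which still ships the PhantomJS driver; the executable path,
# proxy and log file names here are placeholders, not the project's values.
from selenium.webdriver import PhantomJS, DesiredCapabilities

def build_phantom(executable_path='phantomjs', user_agent='Mozilla/5.0', proxy=None,
                  cookies_file='phantomjs-cookie.txt', log_path='phantomjs.log'):
    service_args = ['--ignore-ssl-errors=true', '--load-images=false',
                    '--cookies-file=%s' % cookies_file]
    if proxy:
        service_args.append('--proxy=%s' % proxy)
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities['phantomjs.page.settings.userAgent'] = user_agent
    capabilities['takesScreenshot'] = False
    driver = PhantomJS(executable_path=executable_path,
                       service_args=service_args,
                       desired_capabilities=capabilities,
                       service_log_path=log_path)
    driver.implicitly_wait(10)
    driver.set_page_load_timeout(60)
    return driver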
def get_applications_in_page(self, scroll_script): applications = [] driver = None try: desired_capabilities = dict(DesiredCapabilities.PHANTOMJS) desired_capabilities["phantomjs.page.settings.userAgent"] = useragent.get_random_agent(google_prop.user_agent_list_url) service_args = ['--load-images=no', '--proxy=%s' % (proxy.get_random_proxy(google_prop.proxy_list_url))] driver = PhantomJS(desired_capabilities=desired_capabilities, service_args=service_args) # driver = Firefox(firefox_profile=self.fp, proxy=self.proxy) if self.proxy_test: driver.get('http://curlmyip.com/') ip = driver.find_element_by_xpath('//body//pre').text print('ip : [ ' + ip + ' ]') pass else: driver.get(self.url) driver.execute_script(scroll_script) acknowledge = 0 done = False while not done: scroll_finished = driver.execute_script("return scraperLoadCompleted") if scroll_finished: if acknowledge == self.acknowledgements: done = driver.execute_script("return scraperLoadCompleted") pass else: acknowledge += 1 pass pass else: acknowledge = 0 pass time.sleep(5) # Wait before retry pass product_matrix = driver.find_elements_by_class_name("card") for application in product_matrix: extracted_application = self.extract_application_data(application) # if extracted_application['app_price'] != -1: applications.append(extracted_application) #pass pass pass driver.quit() pass except Exception as e: if driver is not None: driver.quit() pass if self.attempt < self.retries: self.attempt += 1 time.sleep(10) print('retry : url [ ' + self.url + ' ] | attempt [ ' + str(self.attempt) + ' ] | error [ ' + str(e) + ' ]') applications = self.get_applications_in_page(scroll_script) pass else: print('fail : url [ ' + self.url + ' ] | error [ ' + str(e) + ' ]') pass pass return applications pass
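# A minimal sketch of the scroll-polling pattern in get_applications_in_page
# above: the injected scroll script is expected to set a global
# `scraperLoadCompleted` flag, and the scraper re-reads it a few times
# ("acknowledgements") before trusting it. The flag name comes from the
# snippet; the function name and defaults here are illustrative.
import time

def wait_for_lazy_load(driver, acknowledgements=3, poll_interval=5):
    acknowledged = 0
    while True:
        if driver.execute_script("return scraperLoadCompleted"):
            if acknowledged >= acknowledgements:
                return
            acknowledged += 1
        else:
            acknowledged = 0
        time.sleep(poll_interval)  # wait before polling the flag again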
class CamaraCGCrawler(object): """ Camara CG Ementa Crawler """ def __init__(self, starting_year): self.base_url = "http://187.115.174.90:8080/ScanLexWeb" self.starting_year = starting_year self.browser = None @staticmethod def get_ementa_id(published_date, ementa_type, ementa_doc_number, ementa_situation): """ Return the Ementa Unique Id """ return "%s#%s#%s#%s" % (datetime.strftime( published_date, "%Y-%m-%d"), ementa_type, ementa_doc_number, ementa_situation) def get_all_ementas_summary(self): """ Yield the next ementa information row """ browser_table = self.browser.find_element_by_id( "frmMenu:tabEmentas_data") bs_ementa_table = BeautifulSoup( browser_table.get_attribute("innerHTML")) for row in bs_ementa_table.find_all("tr"): cols = row.find_all("td") if len(cols) == 6: published_date = datetime.strptime( cols[0].span.text.encode("utf-8"), "%d/%m/%Y") doc_number = int(cols[1].span.text.encode("utf-8")) title = cols[2].span.text.encode("utf-8") ementa_type = cols[3].span.text.encode("utf-8") ementa_situation = cols[4].span.text.encode("utf-8") details_js = cols[5].a['onclick'].encode("utf-8") if published_date > datetime.now(): continue yield published_date, doc_number, title, ementa_type, ementa_situation, details_js def get_ementa_details(self, ementa_details_js): """ Crawl the second ementa page """ # Waiting... _ = WebDriverWait(self.browser, 30).until( EC.visibility_of_element_located( (By.ID, "frmfuncao:j_idt13_content"))) _ = WebDriverWait(self.browser, 30).until( EC.visibility_of_element_located( (By.ID, "frmfuncao:tabProponentes"))) # Get Ementail Details bs_ementa_details = BeautifulSoup(self.browser \ .find_element_by_id("frmfuncao:j_idt13_content").get_attribute("innerHTML")) rows = bs_ementa_details.find_all("tr") source = rows[3].td.text main_theme = rows[7].td.text sys_enter_date = datetime.strptime(rows[9].td.text, "%d/%m/%Y") approval_date = datetime.strptime(rows[11].td.text, "%d/%m/%Y") process_number = int(rows[15].td.text or "-1") autograph_number = int(rows[19].td.text or "-1") process_year = int(rows[21].td.text or "-1") has_image = rows[23].td.text == "Sim" # Get Proponent names bs_proponent = BeautifulSoup( self.browser.find_element_by_id( "frmfuncao:tabProponentes").get_attribute("innerHTML")) proponents = ",".join( [col.text for col in bs_proponent.find_all("td")]) return source, proponents, main_theme, sys_enter_date, approval_date, process_number, \ autograph_number, process_year, has_image def next_ementa(self, select_curs): """ Iterate in the years onwards and collect all the ementas """ try: LOGGER.info("Opening Browser") self.browser = PhantomJS() LOGGER.info("GET [%s]", self.base_url) self.browser.maximize_window() cur_year = int(datetime.now().year) # Define the initial collection year select_curs.execute( "SELECT EXTRACT (YEAR FROM MAX(published_date)) FROM ementas;") last_exec_year = select_curs.fetchone() if last_exec_year: collection_year = max(self.starting_year, last_exec_year[0]) else: collection_year = self.starting_year all_proponents = [ "ANDERSON MAIA", "Afonso Alexandre Régis", "Alcides Cavalcante", "Alcindor Villarim", "Aldo Cabral", "Alexandre do Sindicato", "Antonio Pereira", "Antônio Alves Pimentel Filho", "Aragão Júnior", "Bruno Cunha Lima Branco", "Bruno Gaudêncio", "Buchada", "Cassiano Pascoal", "Cozete Babosa", "Cássio Murilo Galdino de Araujo", "Daniella Ribeiro", "Dr. 
Nunes", "Executivo", "Fabrinni Brito", "Fernando carvalho", "Francisco Dantas Lira", "Galego do Leite", "Inacio Falcao", "Ivan Batista", "Ivonete Ludgerio", "Joao Dantas", "Josimar Henrique da Silva", "José Marcos Raia ", "José Ribamar", "João Dantas", "Jóia Germano", "Laelson Patricio", "Lafite", "Lindaci Medeiros Nápolis", "Lourdes Costa", "Lula Cabral", "Marcos Marinho", "Maria Lopes Barbosa", "Marinaldo Cardoso", "Metuselá Agra", "Miguel Rodrigues da Silva", "Miguel da Construção", "Napoleão Maracajá", "Nelson Gomes Filho", "Olimpio Oliveira", "Orlandino Farias", "Paulo Muniz", "Paulo de Tarso", "Peron Ribeiro Japiassú", "Renato Feliciano", "Rodolfo Rodrigues", "Rodrigo Ramos Victor", "Romero Rodrigues", "Rostand Paraíba", "Rômulo Gouveia", "Saulo Germano", "Saulo Noronha", "Tia Mila", "Tovar Correia Lima", "Vaninho Aragão", "Veneziano Vital do rego", "Walter Brito Neto", "Todos" ] while collection_year <= cur_year: for i_prop in range(len(all_proponents)): ementa_prop = all_proponents[i_prop].decode("utf-8") self.browser.get(self.base_url) # Waiting... WebDriverWait(self.browser, 30).until( EC.element_to_be_clickable((By.ID, "frmMenu:button1"))) LOGGER.info("Collecting Ementas from [%d][%s - %d/%d]", collection_year, ementa_prop, i_prop + 1, len(all_proponents)) # Set Year year_field = self.browser.find_element_by_id("frmMenu:ano") year_field.send_keys(collection_year) # Set Proponent proponent_field = self.browser.find_element_by_id( "frmMenu:autoridade") proponent_field.send_keys(ementa_prop) # Submit the form self.browser.find_element_by_id("frmMenu:button1").click() # Waiting... # _ = WebDriverWait(self.browser, 60).until(EC.visibility_of_element_located((By.ID, "frmMenu:tabEmentas_data"))) time.sleep(3) for published_date, document_number, title, ementa_type, ementa_situation, ementa_details_js in self.get_all_ementas_summary( ): ementa_id = self.get_ementa_id(published_date, ementa_type, document_number, ementa_situation) select_curs.execute(""" SELECT ementa_id FROM ementas WHERE ementa_id = '%s'; """ % ementa_id) if not select_curs.fetchone(): # Run the details script self.browser.execute_script(ementa_details_js) ementa_source, proponents, main_theme, sys_enter_date, approval_date, \ process_number, autograph_number, process_year, has_image = self.get_ementa_details(ementa_details_js) # Come back to the table page self.browser.back() # Waiting... _ = WebDriverWait(self.browser, 60).until( EC.visibility_of_element_located( (By.ID, "frmMenu:tabEmentas_data"))) yield ementa_id, published_date, ementa_type, document_number, title, \ ementa_source, proponents, ementa_situation, main_theme, sys_enter_date, \ approval_date, process_number, autograph_number, process_year, has_image LOGGER.info("DONE [%d]", collection_year) self.browser.back() collection_year += 1 finally: if self.browser: self.browser.quit()
class Client(object): """HTTP client for functional testing of Strass. Adapter around the Selenium driver, with an interface inspired by Nightwatch.js and a few Strass-specific settings.""" def __init__(self): self.driver = PhantomJS() self.driver.set_window_size(1120, 550) def __del__(self): self.driver.quit() def get(self, query=None): server = os.environ.get('STRASS_TEST_SERVER', 'http://localhost:8000') url = server + (query or '/') self.driver.get(url) return self def find(self, selector): return self.driver.find_element_by_css_selector(selector) def click(self, selector): self.find(selector).click() return self def fill(self, selector, value): if isinstance(value, datetime.date): self.fill(selector + ' input.day', str(value.day)) self.fill(selector + ' input.month', str(value.month)) self.fill(selector + ' input.year', str(value.year)) else: control = self.find(selector) try: control.clear() except selexc.InvalidElementStateException: # Probably trying to clear an input[type=file]; skip it. pass control.send_keys(value) return self def select(self, selector, value): Select(self.find(selector)).select_by_value(value) return self def submit(self, selector='#document button[type=submit]'): return self.click(selector) def close(self): self.driver.close() if self.driver.window_handles: self.driver.switch_to.window(self.driver.window_handles[0]) self.driver.set_window_size(1120, 550) return self def screenshot(self, filename): self.driver.get_screenshot_as_file(filename) sys.stderr.write("Screenshot saved to %r\n" % (filename,)) return self def save(self, filename): with open(filename, 'w') as fo: fo.write(self.driver.page_source) sys.stderr.write("HTML saved to %r\n" % (filename,)) return self def __getattr__(self, name): return getattr(self.driver, name)
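# Example of how the chained Client interface above might be used in a
# functional test. The route and CSS selectors are illustrative; the target
# server comes from STRASS_TEST_SERVER (default http://localhost:8000), and
# date values are expanded into the day/month/year sub-inputs by fill().
import datetime

client = Client()
(client.get('/membres/nouveau')
       .fill('#id_nom', 'Dupont')
       .fill('#id_naissance', datetime.date(1990, 1, 31))
       .submit()
       .screenshot('nouveau_membre.png'))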
class RequestUtil: __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0' def __init__(self): self.cookies = '' self._lock = threading.RLock() def http_get_request(self, url, referer, timeout=''): self._lock.acquire() cookie = cookielib.CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie), SmartRedirectHandler()) urllib2.install_opener(opener) headers = { 'User-Agent': self.__browserAgent, 'Referer': referer, 'Cache-Control': 'max-age=0', 'Accept': '*/*', 'Connection': 'Keep-Alive', 'Accept-encoding': 'gzip' } req = urllib2.Request(url=url, headers=headers) if timeout == '': open = urllib2.urlopen(req) else: open = urllib2.urlopen(req, timeout=timeout) if self.cookies == '': for item in cookie: self.cookies = self.cookies + item.name + '=' + item.value + ';' self.cookies = self.cookies[:-1] if url != open.url: req = urllib2.Request(url=open.url, headers=headers) self._lock.release() return (open, req) def http_post_request(self, url, datas, referer, timeout=''): self._lock.acquire() postdata = urllib.urlencode(datas) headers = { 'User-Agent': self.__browserAgent, 'Referer': referer, 'Content-Type': 'application/x-www-form-urlencoded', 'Cache-Control': 'no-cache', 'Accept': '*/*', 'Connection': 'Keep-Alive', 'Accept-encoding': 'gzip', 'Cookie': self.cookies } req = urllib2.Request(url=url, data=postdata, headers=headers) req.get_host() if timeout == '': open = urllib2.urlopen(req) else: open = urllib2.urlopen(req, timeout=timeout) if url != open.url: req = urllib2.Request(url=open.url, headers=headers) self._lock.release() return (open, req) def http_get(self, url, refer='https://www.baidu.com'): return self.http_get_request(url, refer, 60) def http_post(self, url, datas, refer='https://www.baidu.com'): return self.http_post_request(url, datas, refer, 60) def http_post_request2(self, url, datas, timeout=''): if timeout == '': open = urllib2.urlopen(url, datas) else: open = urllib2.urlopen(url, datas, timeout=timeout) data = open.read() return data def http_post2(self, url, datas): return self.http_post_request2(url, datas, 300) def create_phandomjs(self, service_args, caps, timeout=30): self.driver = PhantomJS(desired_capabilities=caps, service_args=service_args) self.driver.set_page_load_timeout(timeout) self.driver.set_script_timeout(timeout) self.driver.implicitly_wait(timeout) def close_phandomjs(self): try: self.driver.quit() except: pass def http_get_phandomjs(self, url, refer='https://www.baidu.com', timeout=1000): caps = dict(DesiredCapabilities.PHANTOMJS) caps['browserName'] = 'chrome' caps["phantomjs.page.settings.resourceTimeout"] = timeout caps["phantomjs.page.settings.loadImages"] = False caps["phantomjs.page.settings.userAgent"] = (self.__browserAgent) caps["phantomjs.page.customHeaders.Referer"] = (refer) service_args = [] service_args.append('--load-images=no') service_args.append('--disk-cache=yes') service_args.append('--cookies-file=') self.create_phandomjs(timeout=timeout, service_args=service_args, caps=caps) self.driver.get(url) return self.driver.page_source
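# Minimal usage sketch for RequestUtil above: a PhantomJS-rendered fetch
# followed by driver teardown. The URL is illustrative; the timeout argument
# is forwarded both to the phantomjs.page.settings.resourceTimeout capability
# and to the selenium page-load/script/implicit waits, so the default is kept.
util = RequestUtil()
html = util.http_get_phandomjs('https://www.baidu.com')
print(len(html))
util.close_phandomjs()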
class RouteStatistic(object): def __init__(self, url, phantomjs=None, resolution=None, ya_class=None, screen_path=None, screen_pattern=None, csv_path=None): self.url = url self.phantomjs = phantomjs or DEFAULT_PHANTOMJS assert os.path.isfile(self.phantomjs), "phantomjs not found" resolution = resolution or FULLHD assert isinstance(resolution, (list, tuple)) assert len(resolution) == 2 self.ya_class = ya_class or DEFAULT_YA_CLASS self.screen_path = screen_path or PATH self.screen_pattern = screen_pattern or '%s.png' assert '%s' in self.screen_pattern self.csv_path = csv_path or os_join(PATH, 'statistic.csv') self.driver = PhantomJS(self.phantomjs) self.driver.set_window_size(*resolution) def track(self): self.driver.get(self.url) WebDriverWait(self.driver, 5).until(is_class_exist(self.ya_class)) time = self.driver.find_element_by_class_name(self.ya_class).text now = datetime.now() self._save_screenshot(now) self._update_file(now, *[t.strip() for t in time.split(',')]) def _save_screenshot(self, now): if '%s' in self.screen_pattern: file_name = self.screen_pattern % (now, ) else: file_name = self.screen_pattern file_name = os_join(self.screen_path, file_name) self.driver.save_screenshot(file_name) def _update_file(self, now, time, distance): with open(self.csv_path, 'a') as csvfile: writer = csv.writer(csvfile, delimiter=str('\t')) writer.writerow([ now, time, distance, ]) def __call__(self): return self.track() def __del__(self): if hasattr(self, 'driver') and self.driver: self.driver.service.process.send_signal(signal.SIGTERM) self.driver.quit()
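# Hypothetical invocation of RouteStatistic above: each call renders the page,
# waits for the Yandex route-summary element, saves a screenshot and appends a
# (timestamp, time, distance) row to the CSV. The URL and screenshot pattern
# are placeholders; the phantomjs binary, window size, element class and
# output paths fall back to the module-level defaults the class relies on.
route = RouteStatistic('https://yandex.ru/maps/?rtext=55.75,37.62~55.80,37.50',
                       screen_pattern='route-%s.png')
route()  # equivalent to route.track()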