def generate_image(structure):
    """Render ``structure`` through ``okc.html`` in PhantomJS and return a PNG.

    The page's ``setText`` function is called with ``structure`` serialised
    as JSON.  The form is then hidden on roughly 60% of calls, unchecked on a
    further 20%, and left untouched otherwise.  The window is resized to
    whatever the page's ``getSize`` returns before the screenshot is taken,
    and the resulting PNG is compressed in place with ``optipng``.

    Args:
        structure: JSON-serialisable value passed to the page's ``setText``.

    Returns:
        str: Path of ``okc.png`` inside a freshly created temp directory.

    Raises:
        subprocess.CalledProcessError: If ``optipng`` exits with an error.
    """
    image_path = os.path.join(mkdtemp(), 'okc.png')
    html_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'okc.html',
    )
    url = 'file://{}'.format(html_path)

    # mkstemp() returns an *open* descriptor along with the path.  Only the
    # path is handed to PhantomJS, so close the descriptor instead of leaking
    # one per call.
    log_fd, log_path = mkstemp()
    os.close(log_fd)

    driver = PhantomJS(service_log_path=log_path)
    try:
        driver.set_window_size(2000, 500)
        driver.get(url)
        driver.execute_script('setText({});'.format(json.dumps(structure)))
        if random() > 0.4:
            driver.execute_script('hideForm();')
        elif random() > 0.5:
            driver.execute_script('uncheckForm();')
        driver.set_window_size(*driver.execute_script('return getSize();'))
        driver.save_screenshot(image_path)
    finally:
        # Always stop the PhantomJS process, even if rendering failed;
        # otherwise every call leaves a browser process behind.
        driver.quit()

    # twitter's gonna make our beautiful screenshot a jpeg unless we make it
    # think that we're using transparency for a reason, so,,
    img = Image.open(image_path)
    origin = img.getpixel((0, 0))
    new_origin = origin[:3] + (254,)
    img.putpixel((0, 0), new_origin)
    img.save(image_path)

    subprocess.check_call(['optipng', '-quiet', image_path])
    return image_path
def export(plot, filename, width=800, height=600):
    """
    Export plot to file.

    The export template is copied to a hidden, uniquely named HTML file in
    ``__cwd__`` with the plot and dimensions substituted in, loaded in a
    shared PhantomJS instance, and captured as a screenshot.  The temporary
    HTML file is removed afterwards, even on failure.

    Args:
        plot (quorra.Plot): Quorra plot object to export.
        filename (str): Filename to export to.  A trailing ``.png`` is
            optional; the screenshot is always written with that extension.
        width (int): Width for plot (pixels).
        height (int): Height for plot (pixels).
    """
    global _phantom, __templates__, __cwd__
    if _phantom is None:
        # Imported lazily so selenium is only required when exporting.
        from selenium.webdriver import PhantomJS
        _phantom = PhantomJS(service_log_path=os.path.devnull)

    tmpl = os.path.join(__templates__, 'export.html')
    exp = os.path.join(__cwd__, '.' + str(uuid.uuid1()) + '.html')

    # Only a *trailing* '.png' is the extension.  A blanket str.replace()
    # would also eat '.png' inside directory names or the middle of the
    # file name and write the screenshot somewhere unexpected.
    if filename.endswith('.png'):
        target = filename
    else:
        target = filename + '.png'

    try:
        with open(tmpl, 'r') as fi, open(exp, 'w') as fo:
            dat = fi.read()
            dat = dat.replace('var plot = undefined;',
                              'var plot = {};'.format(str(plot)))
            dat = dat.replace('width: 800px;', 'width: {}px;'.format(width))
            # NOTE(review): the placeholder is 500px while the default
            # ``height`` is 600 -- presumably the template's own default;
            # confirm against export.html.
            dat = dat.replace('height: 500px;',
                              'height: {}px;'.format(height))
            fo.write(dat)
        # The file is closed (and flushed) before PhantomJS reads it.
        _phantom.get('file://' + exp)
        _phantom.save_screenshot(target)
    finally:
        if os.path.exists(exp):
            os.remove(exp)
    return
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date,
                  invoice_amount):
    """Look up the XML and PDF download URLs of an invoice on a retailer site.

    Opens the retailer's invoice page in PhantomJS, selects the document
    type, reads the reCAPTCHA image and solves it through ``VisionApi``,
    fills in and submits the form, then reads the two download links from
    the result table.

    Args:
        retail: Key into ``RETAIL_INVOICE_URL`` and
            ``RETAIL_INVOICE_DOC_TYPES``.
        invoice_doc_type: Key into ``RETAIL_INVOICE_DOC_TYPES[retail]``.
        invoice_id: Value written into the ``txtFolio`` field.
        invoice_date: Value written into the ``txtFechaEmision`` field.
        invoice_amount: Value written into the ``txtMontoTotal`` field.

    Returns:
        tuple: ``(xml_url, pdf_url)``.  When any step fails an error line is
        printed, a screenshot is saved to ``screen.png`` and ``('', '')`` is
        returned.
    """
    retail_invoice_url = RETAIL_INVOICE_URL[retail]
    driver = PhantomJS()

    def fail(message):
        # Shared failure path: report, keep a screenshot for debugging.
        print(message)
        driver.save_screenshot('screen.png')
        return '', ''

    try:
        driver.get(retail_invoice_url)

        # 1 Set doc_type 'select'
        try:
            select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
            value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
            select_doc_type.select_by_value(value)
        except Exception:
            return fail('ERROR: set doc_type select as Boleta')
        time.sleep(5)

        # 2 Get recaptcha img url
        try:
            recaptcha_img = driver.find_element_by_id(
                'recaptcha_challenge_image')
            recaptcha_img_url = recaptcha_img.get_attribute('src')
        except Exception:
            return fail('ERROR: get recaptcha image url')

        # 3 Solve recaptcha
        v = VisionApi()
        recaptcha_value = v.detect_text_from_url(recaptcha_img_url)
        if recaptcha_value is None:
            return fail('ERROR: solving recaptcha image')

        # 4 Fill form
        # The values are passed as script arguments rather than spliced into
        # the JavaScript source: a quote or backslash in any of them (the
        # OCR'd captcha text in particular) would otherwise break the script
        # or inject code.  They are rendered to text first, exactly as the
        # former str.format() call did.
        script = u"""
            document.getElementsByName('txtFolio')[0].value = arguments[0];
            document.getElementsByName('txtFechaEmision')[0].value = arguments[1];
            document.getElementsByName('txtMontoTotal')[0].value = arguments[2];
            document.getElementsByName('recaptcha_response_field')[0].value = arguments[3];
        """
        form_values = [
            u'{}'.format(field)
            for field in (invoice_id, invoice_date, invoice_amount,
                          recaptcha_value)
        ]
        driver.execute_script(script, *form_values)

        # 5 Submit form
        try:
            driver.find_element_by_name('frmDatos').submit()
        except Exception:
            return fail('ERROR: submitting form')

        # 6 Get url files
        try:
            xml_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
            pdf_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')
            xml_url = xml_a_tag.get_attribute('href')
            pdf_url = pdf_a_tag.get_attribute('href')
        except Exception:
            return fail('ERROR: getting url files')

        return xml_url, pdf_url
    finally:
        # 8 Delete driver session -- on every exit path, so failed lookups
        # no longer leave a PhantomJS process running.
        driver.quit()
def main():
    """Crawl the Netgear download center and queue firmware downloads.

    Walks every product category / family / product combination of the
    advanced-search page, and for each visible result whose text mentions
    "firmware" submits ``download_file(product, description, url)`` to a
    thread pool.  ``netgear_filelist.csv`` is (re)created with only the
    header row before crawling starts.

    Up to three optional command-line arguments give the category, family
    and product index to resume from (``sys.argv[1]`` .. ``sys.argv[3]``).
    The family and product offsets apply only to the first category/family
    visited; later ones start from index 0.

    On any error the traceback is printed, a pdb session is opened and a
    screenshot is saved.  The browser and the thread pool are always shut
    down on exit.
    """
    os.makedirs(dlDir, exist_ok=True)
    startCatIdx = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    startFamIdx = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    startPrdIdx = int(sys.argv[3]) if len(sys.argv) > 3 else 0

    # newline='' is what the csv module asks for; without it some platforms
    # emit an extra blank line after every row.
    with open('netgear_filelist.csv', 'w', newline='') as fout:
        cw = csv.writer(fout)
        cw.writerow([
            'model', 'fw_ver', 'fileName', 'fw_url', 'fw_date', 'fileSize',
            'sha1', 'md5'
        ])

    executor = ThreadPoolExecutor()
    PhantomJS.waitClickable = waitClickable
    driver = PhantomJS()
    # harvest_utils.driver = driver
    ctl00 = "#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_adsPanel_"  # noqa ignore=E501

    try:
        # The initial navigation lives inside the try block so the browser
        # is still shut down when the site cannot be reached.
        driver.get('http://downloadcenter.netgear.com/')
        # click DrillDown
        driver.waitClickable(
            '#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_BasicSearchPanel_btnAdvancedSearch'  # noqa
        ).click()

        # wait Page2
        catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory"))
        numCat = len(catSel.options)
        for catIdx in range(startCatIdx, numCat):
            # Re-locate the <select> on every pass; selecting an option
            # reloads parts of the page.
            catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory"))
            print('catIdx=', catIdx)
            catTxt = catSel.options[catIdx].text
            uprint('catTxt= ' + catTxt)
            oldText = driver.getText(ctl00 + "lbProductFamily")
            catSel.select_by_index(catIdx)
            driver.waitTextChanged(ctl00 + "lbProductFamily", oldText)
            famSel = Select(driver.waitClickable(ctl00 + "lbProductFamily"))
            numFam = len(famSel.options)
            for famIdx in range(startFamIdx, numFam):
                famSel = Select(
                    driver.waitClickable(ctl00 + "lbProductFamily"))  # noqa
                print('famIdx=', famIdx)
                famTxt = famSel.options[famIdx].text
                uprint('famTxt= ' + famTxt)
                oldText = driver.getText(ctl00 + "lbProduct")
                famSel.select_by_index(famIdx)
                driver.waitTextChanged(ctl00 + "lbProduct", oldText)
                prdSel = Select(driver.waitClickable(ctl00 + "lbProduct"))
                numPrd = len(prdSel.options)
                for prdIdx in range(startPrdIdx, numPrd):
                    prdSel = Select(driver.waitClickable(ctl00 + "lbProduct"))
                    print("catIdx,famIdx,prdIdx=%d, %d, %d" %
                          (catIdx, famIdx, prdIdx))
                    prdTxt = prdSel.options[prdIdx].text
                    uprint('cat,fam,prd="%s","%s","%s"' % (catTxt, famTxt, prdTxt))  # noqa ignore=E501
                    prdWaiting = driver.waitElem(
                        ctl00 + "upProgProductLoader > div > img")  # noqa ignore=E501
                    prdSel.select_by_index(prdIdx)
                    # Best effort: wait for the loading spinner to show up
                    # and then to disappear; either may legitimately time out.
                    try:
                        WebDriverWait(driver, 1, 0.5).\
                            until(lambda x: prdWaiting.is_displayed() is True)
                    except TimeoutException:
                        pass
                    try:
                        WebDriverWait(driver, 5, 0.5).\
                            until(lambda x: prdWaiting.is_displayed() is False)
                    except TimeoutException:
                        pass
                    numResults = driver.waitText(
                        ctl00 + "lvwAllDownload_lblAllDownloadResult", 3, 0.5)  # noqa ignore=E501
                    if numResults is None:
                        continue
                    numResults = int(re.search(r"\d+", numResults).group(0))
                    print('numResults=', numResults)
                    if numResults > 10:
                        driver.waitClickable("#lnkAllDownloadMore", 3).click()
                    try:
                        erItems = driver.getElems(
                            'a.register-product.navlistsearch', 3, 0.5)  # noqa
                    except TimeoutException:
                        erItems = driver.getElems(
                            'div#LargeFirmware > ul > li > div > p > a.navlistsearch', 3)  # noqa ignore=E501
                    if len(erItems) != numResults:
                        print('Error, numResults=%d, but len(erItems)=%d'
                              % (numResults, len(erItems)))
                    for itemIdx, erItem in enumerate(erItems):
                        if not erItem.is_displayed():
                            print('itemIdx=%d is not displayed()' % itemIdx)
                            continue
                        # NOTE(review): the attribute assigned here is
                        # ``getItemText`` but the one called on the next line
                        # is ``getElemText``.  One of the two names is
                        # probably a typo -- confirm which helper exists.
                        erItem.getItemText = getItemText
                        desc = erItem.getElemText(erItem)
                        uprint('desc="%s"' % desc)
                        if 'firmware' not in desc.lower():
                            continue
                        fw_url = erItem.get_attribute('data-durl')
                        if not fw_url:
                            fw_url = erItem.get_attribute('fw_url')
                        print('fw_url=', fw_url)
                        if not fw_url:
                            continue
                        if not fw_url.startswith('http'):
                            print('Error: fw_url=', fw_url)
                            continue
                        executor.submit(download_file, prdTxt, desc, fw_url)
                # The product offset only applies to the first family
                # visited.  Resetting it after the loop (rather than inside
                # it) also covers a family whose resumed range is empty.
                startPrdIdx = 0
            # Same for the family offset: first category only.
            startFamIdx = 0
    except BaseException:
        # Deliberately broad: drop into the debugger on anything, including
        # Ctrl-C, so the crawl state can be inspected.
        traceback.print_exc()
        import pdb
        pdb.set_trace()
        driver.save_screenshot("netgear_crawler2")
    finally:
        driver.quit()
        executor.shutdown(True)
class RouteStatistic(object):
    """Periodically sample a page element and log it with a screenshot.

    Each call to :meth:`track` loads ``url`` in PhantomJS, waits for an
    element with class ``ya_class``, splits its text on commas and appends
    the pieces to a tab-separated file together with the current timestamp.
    A screenshot named after the timestamp is stored alongside.
    """

    def __init__(self, url, phantomjs=None, resolution=None, ya_class=None,
                 screen_path=None, screen_pattern=None, csv_path=None):
        """Validate the settings and start the PhantomJS driver.

        Every optional argument falls back to a module-level default when it
        is omitted or falsy.  ``resolution`` must be a 2-item list or tuple
        and ``screen_pattern`` must contain ``%s``.
        """
        self.url = url

        self.phantomjs = phantomjs if phantomjs else DEFAULT_PHANTOMJS
        assert os.path.isfile(self.phantomjs), "phantomjs не найден"

        if not resolution:
            resolution = FULLHD
        assert isinstance(resolution, (list, tuple))
        assert len(resolution) == 2

        self.ya_class = ya_class if ya_class else DEFAULT_YA_CLASS
        self.screen_path = screen_path if screen_path else PATH

        self.screen_pattern = screen_pattern if screen_pattern else '%s.png'
        assert '%s' in self.screen_pattern

        if csv_path:
            self.csv_path = csv_path
        else:
            self.csv_path = os_join(PATH, 'statistic.csv')

        self.driver = PhantomJS(self.phantomjs)
        width, height = resolution
        self.driver.set_window_size(width, height)

    def track(self):
        """Load the page once, then record a screenshot and a CSV row."""
        browser = self.driver
        browser.get(self.url)
        WebDriverWait(browser, 5).until(is_class_exist(self.ya_class))
        element_text = browser.find_element_by_class_name(self.ya_class).text
        timestamp = datetime.now()
        self._save_screenshot(timestamp)
        # The element text is comma separated; each piece becomes a column.
        columns = [piece.strip() for piece in element_text.split(',')]
        self._update_file(timestamp, *columns)

    def _save_screenshot(self, now):
        """Save a screenshot whose name is ``screen_pattern`` filled with now."""
        pattern = self.screen_pattern
        file_name = pattern % (now, ) if '%s' in pattern else pattern
        self.driver.save_screenshot(os_join(self.screen_path, file_name))

    def _update_file(self, now, time, distance):
        """Append one tab-separated ``now, time, distance`` row to the CSV."""
        row = [now, time, distance]
        with open(self.csv_path, 'a') as csvfile:
            csv.writer(csvfile, delimiter=str('\t')).writerow(row)

    def __call__(self):
        """Calling the instance is shorthand for :meth:`track`."""
        return self.track()

    def __del__(self):
        """Terminate the PhantomJS service process and quit the driver."""
        driver = getattr(self, 'driver', None)
        if not driver:
            return
        driver.service.process.send_signal(signal.SIGTERM)
        driver.quit()
class RequestUtil:
    """HTTP helper built on ``urllib2`` with an optional PhantomJS fallback.

    ``http_get_request`` / ``http_post_request`` are serialised through a
    re-entrant lock.  The first GET that yields cookies stores them as a
    ``name=value;...`` string in ``self.cookies``, which later POSTs send
    back in a ``Cookie`` header.
    """

    __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'

    def __init__(self):
        self.cookies = ''
        self._lock = threading.RLock()

    def http_get_request(self, url, referer, timeout=''):
        """Issue a GET request and return ``(response, request)``.

        Args:
            url: Address to fetch.
            referer: Value of the ``Referer`` header.
            timeout: Seconds passed to ``urlopen``; ``''`` means "use the
                library default".

        Returns:
            tuple: The open response and the request object.  If the request
            was redirected, the request is rebuilt for the final URL.
        """
        # ``with`` guarantees the lock is released even when urlopen raises;
        # a manual acquire()/release() pair would leave it held forever and
        # block every other thread.
        with self._lock:
            cookie = cookielib.CookieJar()
            opener = urllib2.build_opener(
                urllib2.HTTPCookieProcessor(cookie), SmartRedirectHandler())
            urllib2.install_opener(opener)
            headers = {
                'User-Agent': self.__browserAgent,
                'Referer': referer,
                'Cache-Control': 'max-age=0',
                'Accept': '*/*',
                'Connection': 'Keep-Alive',
                'Accept-encoding': 'gzip',
            }
            req = urllib2.Request(url=url, headers=headers)
            if timeout == '':
                response = urllib2.urlopen(req)
            else:
                response = urllib2.urlopen(req, timeout=timeout)
            # Cookies are captured only once, from the first response.
            if self.cookies == '':
                self.cookies = ';'.join(
                    item.name + '=' + item.value for item in cookie)
            if url != response.url:
                req = urllib2.Request(url=response.url, headers=headers)
            return (response, req)

    def http_post_request(self, url, datas, referer, timeout=''):
        """Issue a form-encoded POST and return ``(response, request)``.

        Args:
            url: Address to post to.
            datas: Mapping or sequence of pairs to url-encode as the body.
            referer: Value of the ``Referer`` header.
            timeout: Seconds passed to ``urlopen``; ``''`` means "use the
                library default".

        Returns:
            tuple: The open response and the request object.  If the request
            was redirected, the request is rebuilt for the final URL.
        """
        with self._lock:
            postdata = urllib.urlencode(datas)
            headers = {
                'User-Agent': self.__browserAgent,
                'Referer': referer,
                'Content-Type': 'application/x-www-form-urlencoded',
                'Cache-Control': 'no-cache',
                'Accept': '*/*',
                'Connection': 'Keep-Alive',
                'Accept-encoding': 'gzip',
                'Cookie': self.cookies,
            }
            req = urllib2.Request(url=url, data=postdata, headers=headers)
            req.get_host()
            if timeout == '':
                response = urllib2.urlopen(req)
            else:
                response = urllib2.urlopen(req, timeout=timeout)
            if url != response.url:
                req = urllib2.Request(url=response.url, headers=headers)
            return (response, req)

    def http_get(self, url, refer='https://www.baidu.com'):
        """GET ``url`` with a 60 second timeout."""
        return self.http_get_request(url, refer, 60)

    def http_post(self, url, datas, refer='https://www.baidu.com'):
        """POST ``datas`` to ``url`` with a 60 second timeout."""
        return self.http_post_request(url, datas, refer, 60)

    def http_post_request2(self, url, datas, timeout=''):
        """POST pre-encoded ``datas`` without custom headers; return the body.

        Unlike ``http_post_request`` this neither takes the lock nor sends
        the stored cookies.
        """
        if timeout == '':
            response = urllib2.urlopen(url, datas)
        else:
            response = urllib2.urlopen(url, datas, timeout=timeout)
        data = response.read()
        return data

    def http_post2(self, url, datas):
        """``http_post_request2`` with a 300 second timeout."""
        return self.http_post_request2(url, datas, 300)

    def create_phandomjs(self, service_args, caps, timeout=30):
        """Start PhantomJS as ``self.driver`` with ``timeout`` for all waits."""
        self.driver = PhantomJS(desired_capabilities=caps,
                                service_args=service_args)
        self.driver.set_page_load_timeout(timeout)
        self.driver.set_script_timeout(timeout)
        self.driver.implicitly_wait(timeout)

    def close_phandomjs(self):
        """Quit ``self.driver``; failures (including no driver) are ignored."""
        try:
            self.driver.quit()
        except Exception:
            # Best-effort shutdown: there may be no driver, or it may already
            # be gone.  Unlike a bare ``except:`` this lets KeyboardInterrupt
            # and SystemExit through.
            pass

    def http_get_phandomjs(self, url, refer='https://www.baidu.com',
                           timeout=1000):
        """Load ``url`` in a new PhantomJS instance and return the page source.

        A screenshot is written to ``hainiu.png``.  The driver is left
        running as ``self.driver``; call ``close_phandomjs`` when done.
        """
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps['browserName'] = 'chrome'
        caps["phantomjs.page.settings.resourceTimeout"] = timeout
        caps["phantomjs.page.settings.loadImages"] = False
        caps["phantomjs.page.settings.userAgent"] = (self.__browserAgent)
        caps["phantomjs.page.customHeaders.Referer"] = (refer)

        service_args = [
            '--load-images=no',
            '--disk-cache=yes',
            '--cookies-file=',
        ]

        self.create_phandomjs(timeout=timeout, service_args=service_args,
                              caps=caps)
        self.driver.get(url)
        self.driver.save_screenshot('hainiu.png')
        return self.driver.page_source
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date,
                  invoice_amount):
    """Look up the XML and PDF download URLs of an invoice on a retailer site.

    Opens the retailer's invoice page in PhantomJS, selects the document
    type, reads the reCAPTCHA image and solves it through ``VisionApi``,
    fills in and submits the form, then reads the two download links from
    the result table.

    Args:
        retail: Key into ``RETAIL_INVOICE_URL`` and
            ``RETAIL_INVOICE_DOC_TYPES``.
        invoice_doc_type: Key into ``RETAIL_INVOICE_DOC_TYPES[retail]``.
        invoice_id: Value written into the ``txtFolio`` field.
        invoice_date: Value written into the ``txtFechaEmision`` field.
        invoice_amount: Value written into the ``txtMontoTotal`` field.

    Returns:
        tuple: ``(xml_url, pdf_url)``.  When any step fails an error line is
        printed, a screenshot is saved to ``screen.png`` and ``('', '')`` is
        returned.
    """
    retail_invoice_url = RETAIL_INVOICE_URL[retail]
    driver = PhantomJS()

    def fail(message):
        # Shared failure path: report, keep a screenshot for debugging.
        print(message)
        driver.save_screenshot('screen.png')
        return '', ''

    try:
        driver.get(retail_invoice_url)

        # 1 Set doc_type 'select'
        try:
            select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
            value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
            select_doc_type.select_by_value(value)
        except Exception:
            return fail('ERROR: set doc_type select as Boleta')
        time.sleep(5)

        # 2 Get recaptcha img url
        try:
            recaptcha_img = driver.find_element_by_id(
                'recaptcha_challenge_image')
            recaptcha_img_url = recaptcha_img.get_attribute('src')
        except Exception:
            return fail('ERROR: get recaptcha image url')

        # 3 Solve recaptcha
        v = VisionApi()
        recaptcha_value = v.detect_text_from_url(recaptcha_img_url)
        if recaptcha_value is None:
            return fail('ERROR: solving recaptcha image')

        # 4 Fill form
        # The values are passed as script arguments rather than spliced into
        # the JavaScript source: a quote or backslash in any of them (the
        # OCR'd captcha text in particular) would otherwise break the script
        # or inject code.  They are rendered to text first, exactly as the
        # former str.format() call did.
        script = u"""
            document.getElementsByName('txtFolio')[0].value = arguments[0];
            document.getElementsByName('txtFechaEmision')[0].value = arguments[1];
            document.getElementsByName('txtMontoTotal')[0].value = arguments[2];
            document.getElementsByName('recaptcha_response_field')[0].value = arguments[3];
        """
        form_values = [
            u'{}'.format(field)
            for field in (invoice_id, invoice_date, invoice_amount,
                          recaptcha_value)
        ]
        driver.execute_script(script, *form_values)

        # 5 Submit form
        try:
            driver.find_element_by_name('frmDatos').submit()
        except Exception:
            return fail('ERROR: submitting form')

        # 6 Get url files
        try:
            xml_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
            pdf_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')
            xml_url = xml_a_tag.get_attribute('href')
            pdf_url = pdf_a_tag.get_attribute('href')
        except Exception:
            return fail('ERROR: getting url files')

        return xml_url, pdf_url
    finally:
        # 8 Delete driver session -- on every exit path, so failed lookups
        # no longer leave a PhantomJS process running.
        driver.quit()
class HeadlessBrowser(object):
    """Thin wrapper around a headless Selenium driver (Chrome or PhantomJS).

    Call :meth:`chrome` or :meth:`phantomjs` to start a backend; starting a
    new one first shuts down any driver that is already running.  Every
    other public method is skipped with a warning (and returns ``None``)
    while no driver is initialized.  The driver is quit automatically at
    interpreter exit.
    """

    def __init__(self):
        self.backend = ['chrome', 'phantomjs']
        self.driver = None
        atexit.register(self.cleanup)

    def __getattribute__(self, item):
        """Guard public methods so they need (or reset) a live driver."""
        attr = object.__getattribute__(self, item)
        # Leave non-callables, private/dunder attributes and cleanup() alone:
        #  * wrapping dunders made e.g. ``obj.__class__`` return a wrapper
        #    function instead of the class;
        #  * cleanup() already copes with a missing driver, and guarding it
        #    logged a spurious warning from the atexit hook.
        if item.startswith('_') or item == 'cleanup' or not callable(attr):
            return attr

        func_name = getattr(attr, '__name__', item)
        if func_name in self.backend:
            def wrap_func(*args, **kwargs):
                # Starting a backend replaces whatever driver is running.
                if self.driver is not None:
                    self.cleanup()
                return attr(*args, **kwargs)
        else:
            def wrap_func(*args, **kwargs):
                if self.driver is None:
                    logger.warning(
                        'Driver is NOT initialized, skip %s', func_name)
                    return None
                return attr(*args, **kwargs)
        return wrap_func

    def cleanup(self):
        """Quit the current driver, if any, and forget it."""
        if self.driver is not None:
            logger.info('CLEAN driver: %s', self.driver)
            self.driver.quit()
            self.driver = None

    def chrome(self, chromedriver_path=None, disable_log=True,
               strip_ua4headless=True):
        """
        Better to place chromedriver and chrome/chromium binaries in the PATH,
            in this case, parameter chromedriver_path could be omitted and set as None
        Otherwise place them under the same directory and set parameter chromedriver_path
        ---------------------------------------------------------------------------------
        If chromedriver and chrome/chromium are in different path,
            beyond chromedriver_path setting, chrome/chromium path should be set as:
            options.binary_location = '/path'

        Args:
            chromedriver_path: Path of the chromedriver binary, or None to
                rely on PATH.
            disable_log: Pass the switches that silence Chrome logging.
            strip_ua4headless: Remove "headless" (case-insensitive) from the
                user agent the browser reports.
        """
        options = ChromeOptions()
        options.add_argument('headless')
        options.add_argument('no-sandbox')
        if disable_log:
            options.add_argument('log-level=3')
            options.add_experimental_option('excludeSwitches',
                                            ['enable-logging'])
        try:
            if chromedriver_path:
                self.driver = Chrome(options=options,
                                     executable_path=chromedriver_path)
            else:
                self.driver = Chrome(options=options)
        except WebDriverException as e:
            logger.error(e.msg)
            self.driver = None
            return

        # self.driver.set_page_load_timeout(20)
        if strip_ua4headless:
            import re
            ua = re.sub('(?i)headless', '', self.ua())
            self.driver.execute_cdp_cmd('Network.setUserAgentOverride',
                                        {"userAgent": ua})

    def phantomjs(self, exe_path=None, disable_log=True,
                  log_path='logs/ghostdriver.log'):
        """Start a PhantomJS driver.

        Args:
            exe_path: Path of the phantomjs binary, or None to rely on PATH.
            disable_log: Pass ``--webdriver-loglevel=NONE`` to the service.
            log_path: File the ghostdriver service log is written to.
        """
        service_args = []
        if disable_log:
            service_args.append('--webdriver-loglevel=NONE')

        driver_kwargs = {
            'service_args': service_args,
            'service_log_path': log_path,
        }
        if exe_path:
            driver_kwargs['executable_path'] = exe_path

        # I know phantomjs is deprecated, but I DO NOT LIKE the warnings...
        # catch_warnings() restores the previous filters on exit, unlike
        # swapping out ``warnings.warn`` for the whole process.
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                self.driver = PhantomJS(**driver_kwargs)
            except WebDriverException as e:
                logger.error(e.msg)
                self.driver = None

    def get(self, url, report_html=False):
        """Open ``url`` (``http://`` is assumed when no scheme is given).

        Returns:
            The page source when ``report_html`` is true, otherwise None.
        """
        if not urlparse(url).scheme:
            url = 'http://%s' % url
        self.driver.get(url)
        return self.driver.page_source if report_html else None

    def ua(self):
        """Return the user agent string reported by the browser."""
        return str(self.driver.execute_script("return navigator.userAgent"))

    def zoom(self, level=1):
        """Set the page zoom; non-numeric levels are ignored."""
        if isinstance(level, (int, float)):
            self.driver.execute_script(
                "document.body.style.zoom = '%s'" % level)

    def capture(self, url, png_name=None, zoom_level=1):
        """Save a full-page screenshot of ``url``.

        Args:
            url: Page to capture.
            png_name: Output file.  When missing or not ending in ``.png``
                the file is named ``<netloc>.png`` after the URL's host.
            zoom_level: Zoom applied before measuring the page.
        """
        self.get(url)
        self.zoom(zoom_level)
        if png_name is None or not str(png_name).endswith('.png'):
            result = urlparse(url)
            if not result.scheme:
                result = urlparse('http://%s' % url)
            png_name = '%s.png' % result.netloc

        # Measure the full document so the screenshot is not cropped to the
        # initial viewport.
        width = self.driver.execute_script(
            "return Math.max(document.body.scrollWidth, "
            "document.body.offsetWidth, "
            "document.documentElement.clientWidth, "
            "document.documentElement.scrollWidth, "
            "document.documentElement.offsetWidth);")
        height = self.driver.execute_script(
            "return Math.max(document.body.scrollHeight, "
            "document.body.offsetHeight, "
            "document.documentElement.clientHeight, "
            "document.documentElement.scrollHeight, "
            "document.documentElement.offsetHeight);")
        # resize
        self.driver.set_window_size(width, height)
        self.driver.save_screenshot(png_name)