Пример #1
0
def generate_image(structure):
    """Render ``structure`` to a PNG screenshot and return the file path.

    The page ``okc.html`` located next to this module is loaded in
    PhantomJS, ``structure`` is handed to the page's ``setText`` function as
    JSON, and the rendered page is captured into a fresh temporary
    directory. The PNG is then post-processed with Pillow and ``optipng``.

    Args:
        structure: JSON-serialisable data passed to the page's ``setText``.

    Returns:
        Path of the generated ``okc.png`` file.

    Raises:
        subprocess.CalledProcessError: if ``optipng`` exits with an error.
    """
    image_path = os.path.join(mkdtemp(), 'okc.png')
    html_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'okc.html',
    )
    url = 'file://{}'.format(html_path)

    # mkstemp() returns an *open* descriptor plus the path; only the path is
    # needed for the service log, so close the descriptor instead of leaking
    # it.
    log_fd, log_path = mkstemp()
    os.close(log_fd)

    driver = PhantomJS(service_log_path=log_path)
    try:
        driver.set_window_size(2000, 500)
        driver.get(url)
        driver.execute_script('setText({});'.format(json.dumps(structure)))

        # Two independent draws: the form is hidden when the first exceeds
        # 0.4; otherwise it is unchecked when the second exceeds 0.5;
        # otherwise the page is left untouched.
        if random() > 0.4:
            driver.execute_script('hideForm();')
        elif random() > 0.5:
            driver.execute_script('uncheckForm();')

        # Shrink the window to the size the page reports for its content.
        driver.set_window_size(*driver.execute_script('return getSize();'))
        driver.save_screenshot(image_path)
    finally:
        # Always terminate the PhantomJS process, even when rendering fails.
        driver.quit()

    # twitter's gonna make our beautiful screenshot a jpeg unless we make it
    # think that we're using transparency for a reason, so,,
    img = Image.open(image_path)
    origin = img.getpixel((0, 0))
    new_origin = origin[:3] + (254,)
    img.putpixel((0, 0), new_origin)
    img.save(image_path)

    subprocess.check_call(['optipng', '-quiet', image_path])

    return image_path
Пример #2
0
def export(plot, filename, width=800, height=600):
    """
    Export plot to a PNG file.

    The ``export.html`` template is copied to a hidden, uniquely named file
    in ``__cwd__`` with the plot definition and dimensions substituted in,
    rendered with a lazily created, module-wide PhantomJS driver, and
    captured as a screenshot. The temporary HTML file is always removed.

    Args:
        plot (quorra.Plot): Quorra plot object to export.
        filename (str): Filename to export to. ``.png`` is appended unless
            the name already ends with it.
        width (int): Width for plot (pixels).
        height (int): Height for plot (pixels).
    """
    global _phantom, __templates__, __cwd__
    if _phantom is None:
        from selenium.webdriver import PhantomJS
        _phantom = PhantomJS(service_log_path=os.path.devnull)
    tmpl = os.path.join(__templates__, 'export.html')
    exp = os.path.join(__cwd__, '.' + str(uuid.uuid1()) + '.html')
    # Only a trailing '.png' is an extension; a blanket str.replace() would
    # also mangle '.png' occurring elsewhere in the path (e.g. a directory).
    target = filename if filename.endswith('.png') else filename + '.png'
    try:
        with open(tmpl, 'r') as fi, open(exp, 'w') as fo:
            dat = fi.read()
            dat = dat.replace('var plot = undefined;', 'var plot = {};'.format(str(plot)))
            dat = dat.replace('width: 800px;', 'width: {}px;'.format(width))
            dat = dat.replace('height: 500px;', 'height: {}px;'.format(height))
            fo.write(dat)
        _phantom.get('file://' + exp)
        _phantom.save_screenshot(target)
    finally:
        if os.path.exists(exp):
            os.remove(exp)
Пример #3
0
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date,
                  invoice_amount):
    """Look up an invoice on a retailer's site and return its file URLs.

    The retailer's invoice page is opened in PhantomJS, the document type is
    selected, the captcha image is sent to ``VisionApi`` for text detection,
    the form is filled in and submitted, and the XML/PDF links are read from
    the result table.

    Args:
        retail: Key into ``RETAIL_INVOICE_URL`` and
            ``RETAIL_INVOICE_DOC_TYPES``.
        invoice_doc_type: Key into ``RETAIL_INVOICE_DOC_TYPES[retail]``.
        invoice_id: Value written to the ``txtFolio`` field.
        invoice_date: Value written to the ``txtFechaEmision`` field.
        invoice_amount: Value written to the ``txtMontoTotal`` field.

    Returns:
        ``(xml_url, pdf_url)`` on success, or ``('', '')`` when any step
        fails (a screenshot is saved to ``screen.png`` in that case).
    """
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    # try/finally guarantees the PhantomJS process is terminated on every
    # exit path, including the early error returns below.
    try:
        driver.get(retail_invoice_url)

        # 1 Set doc_type 'select'
        try:
            select_doc_type = Select(
                driver.find_element_by_name('txtTipoDte'))
            value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
            select_doc_type.select_by_value(value)
        except Exception:
            print('ERROR: set doc_type select as Boleta')
            driver.save_screenshot('screen.png')
            return '', ''

        time.sleep(5)

        # 2 Get recaptcha img url
        try:
            recaptcha_img = driver.find_element_by_id(
                'recaptcha_challenge_image')
            recaptcha_img_url = recaptcha_img.get_attribute('src')
        except Exception:
            print('ERROR: get recaptcha image url')
            driver.save_screenshot('screen.png')
            return '', ''

        # 3 Solve recaptcha
        v = VisionApi()
        recaptcha_value = v.detect_text_from_url(recaptcha_img_url)

        if recaptcha_value is None:
            print('ERROR: solving recaptcha image')
            driver.save_screenshot('screen.png')
            return '', ''

        # 4 Fill form
        # Values are handed over as script arguments instead of being
        # spliced into the JavaScript source, so a quote or backslash in any
        # of them (e.g. in the OCR result) cannot break or inject script.
        script = u"""
            document.getElementsByName('txtFolio')[0].value = arguments[0];
            document.getElementsByName('txtFechaEmision')[0].value = arguments[1];
            document.getElementsByName('txtMontoTotal')[0].value = arguments[2];
            document.getElementsByName('recaptcha_response_field')[0].value = arguments[3];
        """
        field_values = [
            u'{0}'.format(field_value)
            for field_value in (invoice_id, invoice_date, invoice_amount,
                                recaptcha_value)
        ]
        driver.execute_script(script, *field_values)

        # 5 Submit form
        try:
            driver.find_element_by_name('frmDatos').submit()
        except Exception:
            print('ERROR: submitting form')
            driver.save_screenshot('screen.png')
            return '', ''

        # 6 Get url files
        try:
            xml_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
            pdf_a_tag = driver.find_element_by_xpath(
                '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')

            xml_url = xml_a_tag.get_attribute('href')
            pdf_url = pdf_a_tag.get_attribute('href')
        except Exception:
            print('ERROR: getting url files')
            driver.save_screenshot('screen.png')
            return '', ''

        return xml_url, pdf_url
    finally:
        # 7 Delete driver session
        driver.quit()
Пример #4
0
def _submit_firmware_downloads(executor, prdTxt, erItems):
    """Queue a download for every visible firmware link in ``erItems``."""
    for itemIdx, erItem in enumerate(erItems):
        if not erItem.is_displayed():
            print('itemIdx=%d is not displayed()' % itemIdx)
            continue
        # The original attached ``getItemText`` to the element but then
        # called a differently named attribute; call the function directly.
        desc = getItemText(erItem)
        uprint('desc="%s"' % desc)
        if 'firmware' not in desc.lower():
            continue
        fw_url = erItem.get_attribute('data-durl')
        if not fw_url:
            fw_url = erItem.get_attribute('fw_url')
        print('fw_url=', fw_url)
        if not fw_url:
            continue
        if not fw_url.startswith('http'):
            print('Error: fw_url=', fw_url)
            continue
        executor.submit(download_file, prdTxt, desc, fw_url)


def main():
    """Crawl the Netgear download center and queue firmware downloads.

    Walks the category / family / product list boxes of the advanced search
    page and submits one ``download_file`` job per firmware link found.

    Optional command-line arguments give the category, family and product
    indexes to resume from. The family and product offsets only apply to the
    first category / family visited.
    """
    os.makedirs(dlDir, exist_ok=True)
    startCatIdx = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    startFamIdx = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    startPrdIdx = int(sys.argv[3]) if len(sys.argv) > 3 else 0
    executor = ThreadPoolExecutor()
    PhantomJS.waitClickable = waitClickable
    driver = PhantomJS()
    # Start a fresh CSV containing only the header row.
    with open('netgear_filelist.csv', 'w') as fout:
        cw = csv.writer(fout)
        cw.writerow([
            'model', 'fw_ver', 'fileName', 'fw_url', 'fw_date', 'fileSize',
            'sha1', 'md5'
        ])
    driver.get('http://downloadcenter.netgear.com/')
    # click DrillDown
    driver.waitClickable(
        '#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_BasicSearchPanel_btnAdvancedSearch'
    ).click()  # noqa
    ctl00 = "#ctl00_ctl00_ctl00_mainContent_localizedContent_bodyCenter_adsPanel_"  # noqa ignore=E501
    #
    # wait Page2
    try:
        catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory"))
        numCat = len(catSel.options)
        for catIdx in range(startCatIdx, numCat):
            # Re-locate the <select> on every iteration: the element is
            # looked up again after each selection, as in the other loops.
            catSel = Select(driver.waitClickable(ctl00 + "lbProductCategory"))
            print('catIdx=', catIdx)
            catTxt = catSel.options[catIdx].text
            uprint('catTxt= ' + catTxt)
            oldText = driver.getText(ctl00 + "lbProductFamily")
            catSel.select_by_index(catIdx)
            driver.waitTextChanged(ctl00 + "lbProductFamily", oldText)
            famSel = Select(driver.waitClickable(ctl00 + "lbProductFamily"))
            numFam = len(famSel.options)
            for famIdx in range(startFamIdx, numFam):
                famSel = Select(
                    driver.waitClickable(ctl00 + "lbProductFamily"))  # noqa
                print('famIdx=', famIdx)
                famTxt = famSel.options[famIdx].text
                uprint('famTxt= ' + famTxt)
                oldText = driver.getText(ctl00 + "lbProduct")
                famSel.select_by_index(famIdx)
                driver.waitTextChanged(ctl00 + "lbProduct", oldText)
                prdSel = Select(driver.waitClickable(ctl00 + "lbProduct"))
                numPrd = len(prdSel.options)
                for prdIdx in range(startPrdIdx, numPrd):
                    prdSel = Select(driver.waitClickable(ctl00 + "lbProduct"))
                    print("catIdx,famIdx,prdIdx=%d, %d, %d" %
                          (catIdx, famIdx, prdIdx))
                    prdTxt = prdSel.options[prdIdx].text
                    uprint('cat,fam,prd="%s","%s","%s"' %
                           (catTxt, famTxt, prdTxt))  # noqa ignore=E501
                    prdWaiting = driver.waitElem(
                        ctl00 +
                        "upProgProductLoader > div > img")  # noqa ignore=E501
                    prdSel.select_by_index(prdIdx)
                    # Best effort: wait for the loader image to appear and
                    # then disappear; a timeout on either is not an error.
                    try:
                        WebDriverWait(driver, 1, 0.5).\
                            until(lambda x: prdWaiting.is_displayed() is True)
                    except TimeoutException:
                        pass
                    try:
                        WebDriverWait(driver, 5, 0.5).\
                            until(lambda x: prdWaiting.is_displayed() is False)
                    except TimeoutException:
                        pass
                    numResults = driver.waitText(
                        ctl00 + "lvwAllDownload_lblAllDownloadResult", 3,
                        0.5)  # noqa ignore=E501
                    if numResults is None:
                        continue
                    countMatch = re.search(r"\d+", numResults)
                    if countMatch is None:
                        # No number in the result label: skip this product
                        # instead of crashing on ``None.group``.
                        print('Error: cannot parse numResults=', numResults)
                        continue
                    numResults = int(countMatch.group(0))
                    print('numResults=', numResults)
                    if numResults > 10:
                        driver.waitClickable("#lnkAllDownloadMore", 3).click()
                    try:
                        erItems = driver.getElems(
                            'a.register-product.navlistsearch', 3, 0.5)  # noqa
                    except TimeoutException:
                        erItems = driver.getElems(
                            'div#LargeFirmware > ul > li > div > p > a.navlistsearch',
                            3)  # noqa ignore=E501

                    if len(erItems) != numResults:
                        print('Error, numResults=%d, but len(erItems)=%d' %
                              (numResults, len(erItems)))
                    _submit_firmware_downloads(executor, prdTxt, erItems)
                # The product resume offset only applies to the first family
                # visited; reset it even if that family had no products in
                # range.
                startPrdIdx = 0
            # Likewise the family (and product) offsets only apply to the
            # first category visited.
            startFamIdx = 0
            startPrdIdx = 0
    except BaseException:
        # NOTE(review): drops into an interactive debugger on any failure,
        # including Ctrl-C; kept as-is because it is the existing workflow.
        traceback.print_exc()
        import pdb
        pdb.set_trace()
        driver.save_screenshot("netgear_crawler2")
    finally:
        driver.quit()
        executor.shutdown(True)
Пример #5
0
class RouteStatistic(object):
    """Record route information shown on a web page.

    Each call to :meth:`track` loads ``url`` in PhantomJS, reads the text of
    the element with CSS class ``ya_class``, saves a screenshot and appends
    a tab-separated row to a CSV file.
    """

    def __init__(self,
                 url,
                 phantomjs=None,
                 resolution=None,
                 ya_class=None,
                 screen_path=None,
                 screen_pattern=None,
                 csv_path=None):
        """Validate the settings and start the PhantomJS driver.

        Every optional argument falls back to a module-level default when it
        is omitted or falsy. ``resolution`` must be a two-item list or tuple
        and ``screen_pattern`` must contain ``%s``.
        """
        self.url = url

        self.phantomjs = phantomjs if phantomjs else DEFAULT_PHANTOMJS
        assert os.path.isfile(self.phantomjs), "phantomjs не найден"

        window_size = resolution if resolution else FULLHD
        assert isinstance(window_size, (list, tuple))
        assert len(window_size) == 2

        self.ya_class = ya_class if ya_class else DEFAULT_YA_CLASS
        self.screen_path = screen_path if screen_path else PATH

        self.screen_pattern = screen_pattern if screen_pattern else '%s.png'
        assert '%s' in self.screen_pattern

        if csv_path:
            self.csv_path = csv_path
        else:
            self.csv_path = os_join(PATH, 'statistic.csv')

        self.driver = PhantomJS(self.phantomjs)
        self.driver.set_window_size(*window_size)

    def track(self):
        """Load the page, then store a screenshot and a CSV row for it."""
        browser = self.driver
        browser.get(self.url)
        WebDriverWait(browser, 5).until(is_class_exist(self.ya_class))
        raw_text = browser.find_element_by_class_name(self.ya_class).text
        stamp = datetime.now()
        self._save_screenshot(stamp)
        # The element text is split on commas; the pieces become the
        # remaining positional arguments of _update_file.
        pieces = [piece.strip() for piece in raw_text.split(',')]
        self._update_file(stamp, *pieces)

    def _save_screenshot(self, now):
        """Save a screenshot named from ``screen_pattern`` and ``now``."""
        pattern = self.screen_pattern
        file_name = pattern % (now, ) if '%s' in pattern else pattern
        self.driver.save_screenshot(os_join(self.screen_path, file_name))

    def _update_file(self, now, time, distance):
        """Append one tab-separated ``now, time, distance`` row to the CSV."""
        row = [now, time, distance]
        with open(self.csv_path, 'a') as csvfile:
            csv.writer(csvfile, delimiter=str('\t')).writerow(row)

    def __call__(self):
        """Calling the instance is the same as calling :meth:`track`."""
        return self.track()

    def __del__(self):
        """Send SIGTERM to the driver's service process and quit the driver."""
        browser = getattr(self, 'driver', None)
        if not browser:
            # __init__ failed before the driver was created.
            return
        browser.service.process.send_signal(signal.SIGTERM)
        browser.quit()
Пример #6
0
class RequestUtil:
    """HTTP helper built on ``urllib2`` plus an optional PhantomJS driver.

    GET/POST requests made through :meth:`http_get_request` and
    :meth:`http_post_request` are serialised with a re-entrant lock. Cookies
    received by the first GET are remembered in ``self.cookies`` and sent
    with subsequent POSTs.
    """

    __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'

    def __init__(self):
        self.cookies=''
        self._lock = threading.RLock()

    @staticmethod
    def _open(target, data=None, timeout=''):
        """Call ``urllib2.urlopen``; ``timeout == ''`` means "no timeout"."""
        if timeout == '':
            return urllib2.urlopen(target, data)
        return urllib2.urlopen(target, data, timeout=timeout)

    def http_get_request(self, url, referer, timeout=''):
        """Perform a GET and return ``(response, request)``.

        If the response URL differs from ``url``, the returned request is a
        new one built for the response URL.
        """
        # ``with`` releases the lock even when the request raises; a manual
        # acquire()/release() pair would leave it held forever on error.
        with self._lock:
            cookie = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),SmartRedirectHandler())
            urllib2.install_opener(opener)
            headers = {'User-Agent': self.__browserAgent,
                       'Referer': referer,
                       'Cache-Control': 'max-age=0',
                       'Accept': '*/*',
                       'Connection': 'Keep-Alive',
                       'Accept-encoding':'gzip'}
            req = urllib2.Request(url=url,headers=headers)
            response = self._open(req, timeout=timeout)
            if self.cookies == '':
                # Remember the cookies as a "name=value;name=value" string.
                self.cookies = ';'.join(
                    item.name + '=' + item.value for item in cookie)
            if url != response.url:
                req = urllib2.Request(url=response.url,headers=headers)
            return (response,req)

    def http_post_request(self, url, datas, referer, timeout=''):
        """POST url-encoded ``datas`` and return ``(response, request)``."""
        with self._lock:
            postdata = urllib.urlencode(datas)
            headers={'User-Agent': self.__browserAgent,
                     'Referer': referer,
                     'Content-Type': 'application/x-www-form-urlencoded',
                     'Cache-Control': 'no-cache',
                     'Accept': '*/*',
                     'Connection': 'Keep-Alive',
                     'Accept-encoding':'gzip',
                     'Cookie':self.cookies}
            req = urllib2.Request(url=url,data=postdata,headers=headers)
            req.get_host()
            response = self._open(req, timeout=timeout)
            if url != response.url:
                req = urllib2.Request(url=response.url,headers=headers)
            return (response,req)

    def http_get(self, url ,refer='https://www.baidu.com'):
        """GET ``url`` with a 60 second timeout."""
        return self.http_get_request(url, refer, 60)

    def http_post(self, url, datas ,refer='https://www.baidu.com'):
        """POST ``datas`` to ``url`` with a 60 second timeout."""
        return self.http_post_request(url, datas, refer, 60)


    def http_post_request2(self, url, datas, timeout=''):
        """POST raw ``datas`` (no headers, no lock) and return the body."""
        response = self._open(url, datas, timeout=timeout)
        return response.read()


    def http_post2(self,url,datas):
        """POST raw ``datas`` to ``url`` with a 300 second timeout."""
        return self.http_post_request2(url,datas,300)


    def create_phandomjs(self, service_args, caps, timeout=30):
        """Start a PhantomJS driver and store it in ``self.driver``.

        A driver left over from a previous call is quit first so its
        process is not orphaned.
        """
        self.close_phandomjs()
        self.driver = PhantomJS(desired_capabilities=caps,service_args = service_args)
        self.driver.set_page_load_timeout(timeout)
        self.driver.set_script_timeout(timeout)
        self.driver.implicitly_wait(timeout)

    def close_phandomjs(self):
        """Quit the current driver, if any; errors while quitting are ignored."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        try:
            driver.quit()
        except Exception:
            # Best effort: the browser process may already be gone.
            pass
        finally:
            self.driver = None

    def http_get_phandomjs(self, url, refer='https://www.baidu.com', timeout=1000):
        """Load ``url`` in PhantomJS and return the page source.

        A screenshot is written to ``hainiu.png``. The driver stays open in
        ``self.driver``; call :meth:`close_phandomjs` when done.
        """
        # NOTE(review): ``timeout`` is used both as the PhantomJS
        # resourceTimeout setting and as the Selenium page-load/script/
        # implicit wait value - confirm the two expect the same unit.
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps['browserName'] = 'chrome'
        caps["phantomjs.page.settings.resourceTimeout"] = timeout
        caps["phantomjs.page.settings.loadImages"] = False
        caps["phantomjs.page.settings.userAgent"] = (self.__browserAgent)
        caps["phantomjs.page.customHeaders.Referer"] = (refer)

        service_args = [
            '--load-images=no',
            '--disk-cache=yes',
            '--cookies-file=',
        ]

        self.create_phandomjs(timeout = timeout,service_args = service_args, caps = caps)
        self.driver.get(url)
        self.driver.save_screenshot('hainiu.png')
        return self.driver.page_source
Пример #7
0
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount):
    """Look up an invoice on a retailer's site and return its file URLs.

    The retailer's invoice page is opened in PhantomJS, the document type is
    selected, the captcha image is sent to ``VisionApi`` for text detection,
    the form is filled in and submitted, and the XML/PDF links are read from
    the result table.

    Args:
        retail: Key into ``RETAIL_INVOICE_URL`` and
            ``RETAIL_INVOICE_DOC_TYPES``.
        invoice_doc_type: Key into ``RETAIL_INVOICE_DOC_TYPES[retail]``.
        invoice_id: Value written to the ``txtFolio`` field.
        invoice_date: Value written to the ``txtFechaEmision`` field.
        invoice_amount: Value written to the ``txtMontoTotal`` field.

    Returns:
        ``(xml_url, pdf_url)`` on success, or ``('', '')`` when any step
        fails (a screenshot is saved to ``screen.png`` in that case).
    """
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    # try/finally guarantees the PhantomJS process is terminated on every
    # exit path, including the early error returns below.
    try:
        driver.get(retail_invoice_url)

        # 1 Set doc_type 'select'
        try:
            select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
            value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
            select_doc_type.select_by_value(value)
        except Exception:
            print('ERROR: set doc_type select as Boleta')
            driver.save_screenshot('screen.png')
            return '', ''

        time.sleep(5)

        # 2 Get recaptcha img url
        try:
            recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
            recaptcha_img_url = recaptcha_img.get_attribute('src')
        except Exception:
            print('ERROR: get recaptcha image url')
            driver.save_screenshot('screen.png')
            return '', ''

        # 3 Solve recaptcha
        v = VisionApi()
        recaptcha_value = v.detect_text_from_url(recaptcha_img_url)

        if recaptcha_value is None:
            print('ERROR: solving recaptcha image')
            driver.save_screenshot('screen.png')
            return '', ''

        # 4 Fill form
        # Values are handed over as script arguments instead of being
        # spliced into the JavaScript source, so a quote or backslash in any
        # of them (e.g. in the OCR result) cannot break or inject script.
        script = u"""
            document.getElementsByName('txtFolio')[0].value = arguments[0];
            document.getElementsByName('txtFechaEmision')[0].value = arguments[1];
            document.getElementsByName('txtMontoTotal')[0].value = arguments[2];
            document.getElementsByName('recaptcha_response_field')[0].value = arguments[3];
        """
        field_values = [
            u'{0}'.format(field_value)
            for field_value in (invoice_id, invoice_date, invoice_amount, recaptcha_value)
        ]
        driver.execute_script(script, *field_values)

        # 5 Submit form
        try:
            driver.find_element_by_name('frmDatos').submit()
        except Exception:
            print('ERROR: submitting form')
            driver.save_screenshot('screen.png')
            return '', ''

        # 6 Get url files
        try:
            xml_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
            pdf_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')

            xml_url = xml_a_tag.get_attribute('href')
            pdf_url = pdf_a_tag.get_attribute('href')
        except Exception:
            print('ERROR: getting url files')
            driver.save_screenshot('screen.png')
            return '', ''

        return xml_url, pdf_url
    finally:
        # 7 Delete driver session
        driver.quit()
Пример #8
0
class HeadlessBrowser(object):
    """Small wrapper around a headless Selenium driver.

    Call :meth:`chrome` or :meth:`phantomjs` to start a backend; the other
    methods operate on the driver started last. Method access is intercepted
    by :meth:`__getattribute__` so that non-backend methods are skipped with
    a warning while no driver is running, and starting a backend first quits
    the one already running.
    """

    def __init__(self):
        # Names of the methods that (re)create ``self.driver``.
        self.backend = ['chrome', 'phantomjs']
        self.driver = None
        atexit.register(self.cleanup)

    def __getattribute__(self, item):
        """Return attributes unchanged, but wrap every callable one.

        Backend methods quit an existing driver before running. All other
        callables log a warning and return ``None`` while no driver exists.
        """
        attr = object.__getattribute__(self, item)

        if hasattr(attr, '__call__'):
            func_name = attr.__name__

            if func_name in self.backend:
                def wrap_func(*args, **kwargs):
                    if self.driver is not None:
                        self.cleanup()

                    result = attr(*args, **kwargs)
                    return result
            else:
                def wrap_func(*args, **kwargs):
                    if self.driver is None:
                        logger.warning('Driver is NOT initialized, skip %s' % func_name)
                        return

                    result = attr(*args, **kwargs)
                    return result

            return wrap_func
        else:
            return attr

    def cleanup(self):
        """Quit the running driver, if any, and forget it."""
        if self.driver is not None:
            logger.info('CLEAN driver: %s' % self.driver)
            self.driver.quit()
            self.driver = None

    def chrome(self, chromedriver_path=None, disable_log=True, strip_ua4headless=True):
        """
        Start headless Chrome as the active driver.

        Better to place chromedriver and chrome/chromium binaries in the PATH,
            in this case, parameter chromedriver_path could be omitted and set as None
        Otherwise place them under the same directory and set parameter chromedriver_path
        ---------------------------------------------------------------------------------
        If chromedriver and chrome/chromium are in different path,
            beyond chromedriver_path setting, chrome/chromium path should be set as:
            options.binary_location = '/path'

        On ``WebDriverException`` the error is logged and ``self.driver`` is
        left as ``None``. With ``strip_ua4headless`` the word "headless" is
        removed (case-insensitively) from the user agent.
        """
        options = ChromeOptions()
        options.add_argument('headless')
        options.add_argument('no-sandbox')

        if disable_log:
            options.add_argument('log-level=3')
            options.add_experimental_option('excludeSwitches', ['enable-logging'])

        try:
            if chromedriver_path:
                self.driver = Chrome(options=options,
                                     executable_path=chromedriver_path)
            else:
                self.driver = Chrome(options=options)
        except WebDriverException as e:
            logger.error(e.msg)
            self.driver = None
            return

        # self.driver.set_page_load_timeout(20)
        if strip_ua4headless:
            import re
            ua = re.sub('(?i)headless', '', self.ua())
            self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": ua})

    def phantomjs(self, exe_path=None, disable_log=True, log_path='logs/ghostdriver.log'):
        """Start PhantomJS as the active driver.

        On ``WebDriverException`` the error is logged and ``self.driver`` is
        left as ``None``.
        """
        service_args = []
        if disable_log:
            service_args.append('--webdriver-loglevel=NONE')

        # I know phantomjs is deprecated, but I DO NOT LIKE the warnings...
        # Silence them only while the driver is constructed; the previous
        # warning filters are restored when the ``with`` block exits.
        import warnings

        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                if exe_path:
                    self.driver = PhantomJS(executable_path=exe_path,
                                            service_args=service_args,
                                            service_log_path=log_path)
                else:
                    self.driver = PhantomJS(service_args=service_args,
                                            service_log_path=log_path)
        except WebDriverException as e:
            logger.error(e.msg)
            self.driver = None
            return

    def get(self, url, report_html=False):
        """Open ``url`` (``http://`` is assumed when no scheme is given).

        Returns the page source when ``report_html`` is true, else ``None``.
        """
        if not urlparse(url).scheme:
            url = 'http://%s' % url

        self.driver.get(url)
        return self.driver.page_source if report_html else None

    def ua(self):
        """Return the browser's ``navigator.userAgent`` as a string."""
        return str(self.driver.execute_script("return navigator.userAgent"))

    def zoom(self, level=1):
        """Set ``document.body.style.zoom``; non-numeric levels are ignored."""
        if isinstance(level, (int, float)):
            self.driver.execute_script("document.body.style.zoom = '%s'" % level)

    def capture(self, url, png_name=None, zoom_level=1):
        """Open ``url`` and save a screenshot sized to the whole document.

        When ``png_name`` is missing or does not end with ``.png`` the file
        is named ``<host>.png`` after the URL's network location.
        """
        self.get(url)
        self.zoom(zoom_level)

        if png_name is None or not str(png_name).endswith('.png'):
            result = urlparse(url)
            if not result.scheme:
                result = urlparse('http://%s' % url)
            # Must run for URLs with *and* without a scheme; previously it
            # was nested under the branch above, leaving png_name as None
            # for fully qualified URLs.
            png_name = '%s.png' % result.netloc

        width = self.driver.execute_script(
            "return Math.max(document.body.scrollWidth, \
                             document.body.offsetWidth, \
                             document.documentElement.clientWidth, \
                             document.documentElement.scrollWidth, \
                             document.documentElement.offsetWidth);")

        height = self.driver.execute_script(
            "return Math.max(document.body.scrollHeight, \
                             document.body.offsetHeight, \
                             document.documentElement.clientHeight, \
                             document.documentElement.scrollHeight, \
                             document.documentElement.offsetHeight);")

        # Resize the window to the document so the screenshot covers it all.
        self.driver.set_window_size(width, height)
        self.driver.save_screenshot(png_name)