예제 #1
0
 def __init__(self):
     """Boot a crawler node: per-run logging, crawler identity, Redis
     registration, site account lookup, local heartbeat thread, and a
     Chrome WebDriver session (behind Xvfb on Linux).
     """
     # Timestamped log file per run; stdout logging disabled.
     log.start(logfile=time.strftime("log/%Y%m%d%H%M%S")+".log",logstdout=False)
     log.msg("initiating crawler...",level=log.INFO)
     # Unique id for this crawler instance (presumably allocated via the
     # shared Redis -- see get_crawler_id; TODO confirm).
     self.crawler_id = self.get_crawler_id()
     log.msg("crawler id is %s" % self.crawler_id,level=log.INFO)
     # Register this node's network endpoints in the shared Redis store.
     self.r.set('crawler:ip:%s' % self.crawler_id,utils.get_external_ip())
     self.r.set('crawler:port:%s' % self.crawler_id,settings.REDIS_LOCAL_PORT)
     self.r.set('crawler:mapping_port:%s' % self.crawler_id,settings.REDIS_LOCAL_MAPPING_PORT)
     log.msg("crawler ip is %s, port is %d" % (utils.get_external_ip(),settings.REDIS_LOCAL_PORT),level=log.INFO)
     # Credentials for the crawled site: (username, password) pair.
     account = self.get_account()
     self.username = account[0]
     self.password = account[1]
     log.msg("crawler account got",level=log.INFO)
     # Mark this crawler healthy in the *local* Redis and stamp the time
     # (strftime("%s") is a platform-specific epoch-seconds format).
     self.r_local.set('crawler:status:%s' % self.crawler_id, 'good')
     self.r_local.set('crawler:update_time:%s' % self.crawler_id, datetime.datetime.utcnow().strftime("%s"))
     log.msg("local crawler status set",level=log.INFO)
     # Background thread keeps the heartbeat fresh for liveness checks.
     heartbeat_thread = threading.Thread(target=self.maintain_local_heartbeat)
     heartbeat_thread.start()
     log.msg("local crawler heartbeat started",level=log.INFO)
     if platform.system() == "Linux":
         # On Linux, use a virtual display (Xvfb) so Chrome can run headless.
         vdisplay = Xvfb()
         vdisplay.start()
     co = ChromeOptions()
     #TODO: Disable image after log in
     #TODO: optimize memory usage
     # Allow popups (value 1 = allow) on the crawled site.
     co.add_experimental_option("prefs",{"profile.default_content_settings":{"popups":1}})
     #co.add_experimental_option("prefs",{"profile.default_content_settings":{"popups":1,"images":2,"media":2}})
     self.driver = webdriver.Chrome(chrome_options=co)
     self.driver.set_window_size(640,960)
 def get_chrome(additional_options: ChromeOptions=None) -> webdriver.Chrome:
     """Create a Chrome WebDriver whose download preferences point at the
     project download directory.

     :param additional_options: extra ChromeOptions passed through to the
         driver; when omitted, the download-pref options are reused.
     :return: a started ``webdriver.Chrome`` instance.
     """
     options = ChromeOptions()
     # PEP 8: compare against None with `is`, not `==`.
     if additional_options is None:
         additional_options = options
     download_option = {'download.default_directory': get_download_file_path(),
                        'download.directory_upgrade': 'true',
                        'download.extensions_to_open': '',
                        }
     options.add_experimental_option('prefs', download_option)
     # Download prefs travel via desired_capabilities; caller-supplied
     # options are applied as chrome_options.
     return webdriver.Chrome(get_chrome_exe_path(), desired_capabilities=options.to_capabilities(),
         chrome_options=additional_options)
예제 #3
0
def get_cookie(username, password, proxy):
    """Spin up a Chrome session (optionally behind *proxy*), sign in with
    the given credentials, and return the resulting cookies.  The browser
    is always shut down, even when sign-in fails."""
    opts = ChromeOptions()
    if proxy:
        opts.add_argument('--proxy-server=%s' % proxy)

    driver = Chrome(chrome_options=opts)
    try:
        cookies = signin(driver, username, password)
    finally:
        driver.quit()
    return cookies
예제 #4
0
파일: browsers.py 프로젝트: DramaFever/sst
    def setup_for_test(self, test):
        """Build Chrome capabilities for *test*: disable the password
        manager, whitelist the Flash plugin, and wire in the test's proxy
        server when requested."""
        opts = ChromeOptions()
        for flag in ("test-type", "disable-infobars"):
            opts.add_argument(flag)
        prefs = {
            'credentials_enable_service': False,
            'profile.password_manager_enabled': False,
            'profile.default_content_setting_values.plugins': 1,
            'profile.content_settings.plugin_whitelist.adobe-flash-player': 1,
            'profile.content_settings.exceptions.plugins.*,*.per_resource.adobe-flash-player': 1
        }
        opts.add_experimental_option('prefs', prefs)

        if test.use_proxy:
            opts.add_argument("--proxy-server={0}".format(test.proxy_address))
        self.capabilities = opts.to_capabilities()
        logger.debug("Chrome capabilities: {}".format(self.capabilities))
예제 #5
0
 def setUpClass(cls):
     """Start the browser engine once for the whole test case: Firefox on
     Windows (Chrome hangs there), headless Chrome everywhere else."""
     super(SeleniumTestCase, cls).setUpClass()
     print('Initializing browser engine...')
     if sys.platform == 'win32':
         # Chrome hangs up on Windows
         caps = DesiredCapabilities.FIREFOX
         caps['loggingPrefs'] = {'browser': 'ALL'}
         cls.browser = Firefox(capabilities=caps)
     else:
         caps = DesiredCapabilities.CHROME
         caps['loggingPrefs'] = {'browser': 'ALL'}
         opts = ChromeOptions()
         for flag in ('headless', 'disable-gpu'):
             opts.add_argument(flag)
         cls.browser = Chrome(chrome_options=opts,
                              desired_capabilities=caps)
     print('Browser engine initialized.')
예제 #6
0
파일: setup.py 프로젝트: dropbox/grouper
def selenium_browser():
    # type: () -> Chrome
    """Return a headless, sandbox-free Chrome sized to 1920x1080."""
    opts = ChromeOptions()
    for flag in ("headless", "no-sandbox", "window-size=1920,1080"):
        opts.add_argument(flag)
    return Chrome(options=opts)
예제 #7
0
    def setUp(self, browser):
        """Create a WebDriver session for *browser* ('firefox' or 'chrome')
        and point ``self.home_page`` at the Home page object."""
        self.browser = browser

        if "firefox" in self.browser:
            profile = FirefoxProfile()
            # profile.set_preference("plugin.state.silverlight", 2)
            # profile.set_preference("browser.download.folderList", 1)
            # profile.set_preference("pdfjs.disabled", False);
            # profile.set_preference("pdfjs.firstRun", True);
            self.driver = Firefox(profile)  # fresh Firefox session

        if "chrome" in self.browser:
            driver_path = "/usr/local/bin/chromedriver"
            opts = ChromeOptions()
            opts.add_experimental_option('excludeSwitches', ['disable-component-update'])
            opts.add_argument("--user-data-dir=./browser_resources/chrome_data_dir/")
            os.environ["webdriver.chrome.driver"] = driver_path
            self.driver = Chrome(executable_path=driver_path, chrome_options=opts)

        self.home_page = home.Home(self.driver)
def login(account, passwd, url):
    """Log into Sina Weibo with *account*/*passwd* via a headless Chrome
    and return the session cookies.  The chromedriver path is given
    explicitly because it is not on PATH (flow verified 2017-04-11)."""
    opts = ChromeOptions()
    opts.add_argument('--headless')
    opts.add_argument('--no-sandbox')
    driver = webdriver.Chrome('/root/qk_python/python/data/collect/weibo_spider/priv/chromedriver',
                              chrome_options=opts)
    driver.maximize_window()
    driver.set_page_load_timeout(30)
    driver.set_window_size(1124, 850)
    driver.get(url)
    print('开始登陆')
    # Fill in the credentials.
    username_input = driver.find_element_by_id('loginname')
    username_input.clear()
    username_input.send_keys(account)
    pwd_input = driver.find_element_by_class_name('password').find_element_by_name('password')
    pwd_input.clear()
    pwd_input.send_keys(passwd)

    login_btn = driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a/span')
    # A double-click is used to trigger the submit button.
    ActionChains(driver).double_click(login_btn).perform()
    time.sleep(5)
    # Wait for the logged-in homepage container to appear.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'WB_miniblog')))

    html = driver.page_source
    if is_login(html):
        print('登录成功')

    cookies = driver.get_cookies()
    driver.quit()
    return cookies
예제 #9
0
def setup_browser():
    """Initialise ``world.browser`` with the configured driver (Firefox,
    PhantomJS, or Chrome — headless or maximised) plus shared test
    settings (download path, default wait).
    """
    if use_firefox:
        world.browser = MyFirefox()
        world.browser.set_window_size(450, 1200)
        world.browser.set_window_position(0, 0)
    elif use_phantomjs:
        world.browser = MyPhantomJS()
    else:
        # Both Chrome variants share the bundled chromedriver binary;
        # only the options differ.  (Stray C-style semicolons removed.)
        options = ChromeOptions()
        if use_headless_chrome:
            options.add_argument("--window-size=1005,9999")
            options.add_argument("--headless")
        else:
            options.add_argument("--start-maximized")
        world.browser = MyChrome(executable_path=os.path.join('..', '..', 'chromedriver'), chrome_options=options)
    world.da_path = default_path
    world.wait_seconds = default_wait_seconds
    def create_download_dir_capabilities_for_chrome(path_to_download, **extensions_files):
        """
        Build Chrome desired capabilities that download files into
        *path_to_download* (validated/created first) and load any extension
        files passed as keyword arguments (name=path_to_crx).

        Example use
        | ${capabilities} |	create_download_dir_capabilities_for_chrome	| Artifacts |
        | Open Browser Extension | https://support.spatialkey.com/spatialkey-sample-csv-data/ |	gc | desired_capabilities=${capabilities} |
        | Click Element	 | //a[contains(@href,'sample.csv.zip')] |
        """

        path_to_download_check = validate_create_artifacts_dir(path_to_download)

        chrome_options = ChromeOptions()
        prefs = {"download.default_directory": path_to_download_check, "directory_upgrade": "true"}

        chrome_options.add_experimental_option("prefs", prefs)
        chrome_options.add_argument("--disable-web-security")
        # BUG FIX: iterate the kwarg *values* (extension file paths);
        # iterating the dict directly yielded the argument names instead.
        for extension_path in extensions_files.values():
            chrome_options.add_extension(extension_path)

        logger.info("Chrome Capabilities set download dir '" + path_to_download_check + "'")
        return chrome_options.to_capabilities()
        def set_chrome_options(self):
            """Assemble ChromeOptions from the module-level ``T`` config
            mapping: boolean flags from T['true_opts']/T['false_opts'] plus
            a fixed whitelist of value-carrying switches.

            :return: a populated ChromeOptions instance.
            """
            from selenium.webdriver import ChromeOptions
            opts = ChromeOptions()

            # Boolean arguments.  BUG FIX: dict.has_key() was removed in
            # Python 3 — use the `in` operator instead.
            if 'true_opts' in T:
                for it in T['true_opts']:
                    opts.add_argument('%s=1' % it)
            if 'false_opts' in T:
                for it in T['false_opts']:
                    opts.add_argument('%s=0' % it)

            # Switches that take a value when present in T.
            value_opts = [
                'profile-directory',
                'log-level',               # 0 to 3: INFO = 0, WARNING = 1, LOG_ERROR = 2, LOG_FATAL = 3
                'net-log-capture-mode',    # "Default" "IncludeCookiesAndCredentials" "IncludeSocketBytes"
                'register-font-files',     # might be windows only
                'remote-debugging-port',
                'user-agent',
                'user-data-dir',           # don't use b/c it negates no-extension options
            ]

            # Add value arguments.
            for it in value_opts:
                if it in T:
                    opts.add_argument('%s=%s' % (it, T[it]))

            ### OTHER CHROME OPTIONS NOT YET FULLY CONFIGURED
            # -extensions        list str
            # -localState        dict
            # -prefs             dict
            # set_profile()
            # -detach            bool
            # -debuggerAddress   str
            # -excludeSwitches   list str
            # -minidumpPath      str
            # -mobileEmulation   dict
            # -perfLoggingPrefs  OBJECT (dict) / set_performance_logging()

            return opts
예제 #12
0
    def set_spider_option(self, use_proxy=False) -> Chrome:
        """
        ChromeDriver settings.

        @param use_proxy: route traffic through a proxy <disabled in the
            current version>: some providers block mainland-China IPs.
        @return: a configured Chrome instance.
        """
        options = ChromeOptions()

        # Run with highest privileges (needed in containers/root).
        options.add_argument('--no-sandbox')

        # Incognito mode.
        options.add_argument('-incognito')

        # Disable the disk cache.
        # NOTE(review): '--disk-cache-' looks truncated (perhaps
        # '--disk-cache-size=0' was intended) — kept as-is to preserve
        # behaviour; confirm the original intent.
        options.add_argument('--disk-cache-')

        # Chinese locale.
        options.add_argument('lang=zh_CN.UTF-8')

        # Hide the "controlled by automated software" infobar.
        options.add_experimental_option('excludeSwitches',
                                        ['enable-automation'])

        # Randomised user agent.
        options.add_argument(f'user-agent={get_header()}')

        if use_proxy:
            proxy_ip = get_proxy(True)
            if proxy_ip:
                options.add_argument(f'proxy-server={proxy_ip}')

        # Headless start.
        if self.silence is True:
            options.add_argument('--headless')

        # Target has no anti-bot defences: high-performance start — skip
        # images and JS rendering to speed up Selenium page switching.
        def load_fast_module():
            chrome_pref = {
                "profile.default_content_settings": {
                    "Images": 2,
                    'javascript': 2
                },
                "profile.managed_default_content_settings": {
                    "Images": 2
                }
            }
            # FIX: use the public API instead of mutating the private
            # options.experimental_options dict.  (The redundant second
            # excludeSwitches call, which re-set the same value, is gone.)
            options.add_experimental_option('prefs', chrome_pref)
            d_c = DesiredCapabilities.CHROME
            d_c['pageLoadStrategy'] = 'none'
            return Chrome(options=options,
                          executable_path=CHROMEDRIVER_PATH,
                          desired_capabilities=d_c)

        if self.anti is False:
            return load_fast_module()
        else:
            # Anti-bot defences present (or default): normal start.
            return Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
예제 #13
0
from selenium.webdriver import Chrome, ChromeOptions
import time

EMAIL_ID = "<email id here>"


def slow_typing(element, text):
    """Send *text* to *element* one keystroke at a time, pausing 0.3 s
    between characters to mimic human typing."""
    for ch in text:
        element.send_keys(ch)
        time.sleep(0.3)


# Visit chrome://version/ and copy profile path in place of '<chrome user profile>'
# BUG FIX: add_argument() returns None, so chaining it off the constructor
# left `options` set to None; build the options object first, then mutate it.
options = ChromeOptions()
options.add_argument("--user-data-dir=<chrome user profile>")

browser = Chrome(chrome_options=options)
browser.get('https://www.browserstack.com')

time.sleep(2)

# Accept the cookie notification so that it doesn't interfere.
cookie_cta = browser.find_element_by_id('accept-cookie-notification')
cookie_cta.click()

# Navigate to Signup Page
button = browser.find_element_by_id('signupModalButton')
button.click()

time.sleep(2)

# Fill user's full name
예제 #14
0
 def create(**kwargs: Any):
     """Build a Selenium driver (headless per ``driver_is_headless``),
     register it for teardown, and return it."""
     opts = ChromeOptions()
     opts.headless = driver_is_headless
     new_driver = create_simple_selenium_web_driver(driver_options=opts, **kwargs)
     drivers.append(new_driver)
     return new_driver
    # NOTE(review): fragment — the enclosing function's signature is outside
    # this view; database_ip/database_port/database_name come from that scope.
    client = pymongo.MongoClient(database_ip, database_port)
    db = client[database_name]

    update = UpdateCrawler()
    new_data = []
    # All existing hotspot documents that need refreshing.
    data2update = list(db['hotspot'].find())

    # Pool of user-agent switches; one is picked at random per run.
    header_list = [
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36"',
        'user-agent="Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"',
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"'
    ]

    # Toutiao-section settings: block images/stylesheets and hide the
    # automation infobar, then start a headless Chrome.
    option = ChromeOptions()
    prefs = {
        "profile.managed_default_content_settings.images": 2,
        'permissions.default.stylesheet': 2
    }
    option.add_experimental_option("prefs", prefs)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    header = random.choice(header_list)
    option.add_argument(header)
    option.add_argument('--headless')
    option.add_argument('--disable-gpu')  # headless browser
    driver = Chrome(options=option)
    num = 0

    loop = asyncio.get_event_loop()
    tasks = []
 def __init__(self):
     """Start the browser headless; everything else uses parent defaults."""
     chrome_options = ChromeOptions()
     # FIX: Options.set_headless() is deprecated (removed in Selenium 4);
     # assign the equivalent `headless` property instead.
     chrome_options.headless = True
     super().__init__(chrome_options=chrome_options)
예제 #17
0
def create_driver():
    """Create a headless Chrome driver with preferences loaded from
    ``chrome_prefs_path`` and the automation fingerprint suppressed.

    :return: a started Chrome instance.
    """
    # FIX: `with` guarantees the preferences file handle is closed
    # (the original opened it and never closed it).
    with open(chrome_prefs_path, 'r') as chrome_pref_file:
        prefs = json.load(chrome_pref_file)

    options = ChromeOptions()
    options.headless = True
    options.add_experimental_option('prefs', prefs)
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    # Hide the "controlled by automated test software" banner/extension.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    return Chrome(options=options, executable_path=chrome_driver_path)
예제 #18
0
class CrackTouClick():
    """Sign in to cnblogs and crack the geetest captcha (slider or
    click-the-characters variant) with the Chaojiying recognition service."""

    def __init__(self):
        self.url = 'https://passport.cnblogs.com/user/signin'
        self.option = ChromeOptions()
        # Hide the "controlled by automated software" infobar.
        self.option.add_experimental_option('excludeSwitches',
                                            ['enable-automation'])
        self.browser = Chrome(options=self.option)
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD
        self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD,
                                     CHAOJIYING_SOFT_ID)

    def open(self):
        """Open the sign-in page, type the credentials, click the login and
        geetest buttons, then solve whichever captcha variant appears."""
        self.browser.get(self.url)
        email = self.wait.until(
            EC.presence_of_element_located(
                (By.ID, 'input1'))).send_keys(self.email)
        password = self.wait.until(
            EC.presence_of_element_located(
                (By.ID, 'input2'))).send_keys(self.password)
        button = self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'button'))).click()
        button2 = self.wait.until(
            EC.element_to_be_clickable(
                (By.CLASS_NAME, 'geetest_btn'))).click()

        try:
            # Presence of the slider button means a slider captcha; any
            # failure falls through to the character-click variant below.
            slider = self.wait.until(
                EC.element_to_be_clickable(
                    (By.CLASS_NAME, 'geetest_slider_button')))
            CHAOJIYING_KIND = 9202  # captcha type code
            print('这是滑块验证码')
            image = self.get_touclick_image()  # grab the captcha image
            # BUG FIX: the original referenced `bytes_array` here without
            # ever assigning it, so this branch always raised NameError and
            # was silently swallowed by the except-branch.  Serialise the
            # image into a buffer before posting it.
            bytes_array = BytesIO()
            image.save(bytes_array, format='PNG')
            # Recognise the captcha.
            result = self.chaojiying.post_pic(bytes_array.getvalue(),
                                              CHAOJIYING_KIND)
            print('验证码位置', result['pic_str'])
            print(result)
            locations = self.get_points(result)
            self.touch_click_words(locations)
            print('正在检测错误,此处延迟3秒,以便等待页面加载')
            time.sleep(3)
            self.img_error(result)
        except Exception as e:
            print('这是字体验证码', e)
            CHAOJIYING_KIND = 9103  # captcha type code
            # Grab the captcha image and keep a PNG copy for reference.
            image = self.get_touclick_image()
            bytes_array = BytesIO()
            image.save(bytes_array, format='PNG')

            # Recognise the captcha.
            result = self.chaojiying.post_pic(bytes_array.getvalue(),
                                              CHAOJIYING_KIND)
            print(result)

            locations = self.get_points(result)
            self.touch_click_words2(locations)
            print('正在检测错误,此处延迟3秒,以便等待页面加载')
            time.sleep(3)
            self.img_error(result)

    def touch_click_verify(self):
        """Return the slider button element once it is clickable.
        :return: the slider WebElement
        """
        slider = self.wait.until(
            EC.element_to_be_clickable(
                (By.CLASS_NAME, 'geetest_slider_button')))
        return slider

    def get_touclick_element(self):
        """Return the captcha image element.
        :return: the canvas-slice WebElement
        """
        element = self.wait.until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'geetest_canvas_slice')))
        return element

    def get_position(self):
        """Return the captcha's bounding box.
        :return: (top, bottom, left, right) tuple in page coordinates
        """
        element = self.get_touclick_element()
        time.sleep(2)
        location = element.location
        size = element.size
        top, bottom, left, right = location['y'], location['y'] + size[
            'height'], location['x'], location['x'] + size['width']
        return (top, bottom, left, right)

    def get_screenshot(self):
        """Return a full-page screenshot as a PIL Image.
        :return: screenshot Image object
        """
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        return screenshot

    def get_touclick_image(self, name='captcha.png'):
        """Crop the captcha out of a page screenshot, save it to *name*,
        and return the cropped PIL Image.
        :return: captcha Image object
        """
        top, bottom, left, right = self.get_position()
        screenshot = self.get_screenshot()
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(name)
        return captcha

    def get_points(self, captcha_result):
        """Parse the recogniser output.
        :param captcha_result: dict whose 'pic_str' is "x1,y1|x2,y2|..."
        :return: list of [x, y] integer pairs
        """
        groups = captcha_result.get('pic_str').split('|')
        locations = [[int(number) for number in group.split(',')]
                     for group in groups]
        return locations

    def touch_click_words(self, locations):
        """Drag the slider by each (x, y) offset in *locations*.
        :return: None
        """
        for location in locations:
            print(location)
            ActionChains(self.browser).drag_and_drop_by_offset(
                self.touch_click_verify(), location[0], location[1]).perform()
            time.sleep(1)

    def touch_click_words2(self, locations):
        """Click each (x, y) offset in *locations* on the captcha image.
        :return: None
        """
        for location in locations:
            print(location)
            ActionChains(self.browser).move_to_element_with_offset(
                self.get_touclick_element(), location[0],
                location[1]).click().perform()
            time.sleep(1)

    def img_error(self, result):
        """Check whether login succeeded; on failure, report the bad
        captcha id to Chaojiying and retry the whole flow.

        Success is inferred by scraping the post-login marker element
        from the page source (direct detection kept raising stack errors;
        the slider coordinate recognition rate is also very low).
        """
        test = etree.HTML(self.browser.page_source)
        title = test.xpath('//*[@id="app_ing"]/text()')
        print('爬取登陆前后的数据变化', title)
        if title == []:
            img_id = result['pic_id']
            self.chaojiying.report_error(img_id)
            print('登录失败,已发送错误验证码')
            self.open()

        else:
            print('登录成功')
예제 #19
0
    def set_spider_option(self, header=None) -> Chrome:
        """
        Build a hardened Chrome instance.

        :param header: optional user-agent string; a random one is used
            when omitted.
        :return: a configured Chrome driver.
        """
        # Instantiate the optional Chrome parameters.
        options = ChromeOptions()
        # Run with highest privileges (needed in containers/root).
        options.add_argument('--no-sandbox')
        # Incognito mode.
        options.add_argument('-incognito')
        # Disable the disk cache.
        # NOTE(review): '--disk-cache-' looks truncated (perhaps
        # '--disk-cache-size=0' was intended) — kept as-is; confirm.
        options.add_argument('--disk-cache-')
        # Chinese locale.
        options.add_argument('lang=zh_CN.UTF-8')
        # Silence DevTools listening output.
        options.add_argument('--log-level=3')
        # User-agent override.
        if header:
            options.add_argument(f"user-agent={header}")
        else:
            options.add_argument(f'user-agent={get_header()}')
        # Headless start.
        if self.silence is True:
            options.add_argument('--headless')
            options.add_argument('--disable-gpu')
            options.add_argument("--disable-software-rasterizer")
        # Suppress automation fingerprints.
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option('useAutomationExtension', False)
        # BUG FIX: two separate add_experimental_option('excludeSwitches', ...)
        # calls overwrite each other (options store them under one dict key),
        # so the earlier 'enable-logging' exclusion was silently lost; pass
        # both switches in a single call.
        options.add_experimental_option('excludeSwitches',
                                        ['enable-logging', 'enable-automation'])
        # Turbo mode: skip images/JS rendering to speed up page switching.
        if self.assault:
            chrome_pref = {"profile.default_content_settings": {"Images": 2, 'javascript': 2},
                           "profile.managed_default_content_settings": {"Images": 2}}
            # FIX: use the public API rather than mutating the private
            # options.experimental_options dict.
            options.add_experimental_option('prefs', chrome_pref)
            d_c = DesiredCapabilities.CHROME
            d_c['pageLoadStrategy'] = 'none'
            _api = Chrome(
                options=options,
                executable_path=CHROMEDRIVER_PATH,
                desired_capabilities=d_c
            )
        else:
            _api = Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
        # Further scrub the navigator.webdriver flag for stealth.
        _api.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
          """
        })
        return _api
예제 #20
0
def set_driver(isHeadless=False, isManager=False, isSecret=False, isExtension=False, extension_path='', profile_path=''):
    """Build a Chrome driver with a randomised user agent and hardening
    flags.

    :param isHeadless: run headless (single-process).
    :param isManager: let webdriver-manager download the matching driver.
    :param isSecret: incognito mode (profile settings are then unavailable).
    :param isExtension: load the extension at *extension_path*; otherwise
        extensions are disabled.
    :param extension_path: path of the extension to load.
    :param profile_path: Chrome user-data dir reused across runs.
    :return: a Chrome instance, or None when startup failed.
    """
    options = ChromeOptions()

    user_agent = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    ]

    if os.name == 'nt':  # Windows
        driver_path = 'chromedriver.exe'
    else:  # Mac / Linux (os.name == 'posix')
        # FIX: was `elif os.name == 'posix'`, leaving driver_path unbound
        # (NameError) on any other platform.
        driver_path = 'chromedriver'

    if isHeadless:
        options.add_argument('--headless')
        options.add_argument('--single-process')

    if isExtension:
        if extension_path:
            options.add_extension(extension_path)
    else:
        options.add_argument('--disable-extensions')

    if isSecret:
        options.add_argument('--incognito')  # incognito mode
    else:
        # Reusing a profile lets logins / manually-added extensions from a
        # first manual run be reused on later runs.  Profiles cannot be
        # combined with incognito mode; headless mode cannot use profiles
        # or Chrome extensions.  If an enabled extension raises
        # "failed to wait for extension background page to load", add it
        # manually via the profile, then run headless with it enabled.
        if (not isHeadless) or (not isExtension):
            options.add_argument('--user-data-dir=' + profile_path)

    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('log-level=3')
    options.add_argument('--ignore-ssl-errors')
    options.add_argument(f'--user-agent={user_agent[random.randrange(0, len(user_agent), 1)]}')
    options.add_argument('--start-maximized')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--allow-running-insecure-content')
    options.add_argument('--disable-web-security')
    options.add_argument('--disable-desktop-notifications')
    options.add_argument('--disable-application-cache')
    options.add_argument('--lang=ja')

    if isManager:  # auto-download the matching chromedriver
        try:
            driver = Chrome(ChromeDriverManager().install(), options=options)
        except InvalidArgumentException as err:
            logger.error(err)
            logger.error('既存のブラウザを閉じで実行してください。')
            return None
        except Exception as err:
            logger.error(err)
            # BUG FIX: the original fell through to `return driver` with
            # `driver` unbound, raising UnboundLocalError; fail cleanly.
            return None

    else:  # use the local chromedriver binary
        try:
            path = os.getcwd() + '/' + driver_path
            driver = Chrome(executable_path=path, options=options)
        except InvalidArgumentException as err:
            logger.error(err)
            logger.error('既存のブラウザを閉じで実行してください。')
            return None
        except WebDriverException as err:
            logger.error(err)
            logger.error('Chromeと同じバージョンのChrome Driverをダウンロードしてください。')
            return None

    return driver
            # NOTE(review): fragment — the enclosing function/loop/try begins
            # outside this view; `paperco`, `paperinfo`, the list accumulators
            # and the returned names all come from that scope.
            # Join all co-author names into one comma-separated string.
            paperinfo['corppersons'] = ''
            for j in paperco:
                if j != paperco[-1]:
                    j_text = j.text + ', '
                else:
                    j_text = j.text
                paperinfo['corppersons'] += j_text
            print(paperinfo)
            paperinfolist.append(paperinfo)
            mainplist.append('')
        except:
            # Best-effort scrape: skip any paper entry that fails to parse.
            pass

    # Positional result record; empty strings are placeholder columns.
    return [
        info_field, info_cited, info_achi, info_h, info_g, '', '', '',
        per_json, namelist, paperinfolist, mainplist, ''
    ]


if __name__ == '__main__':
    # Demo of passing ChromeOptions.
    option = ChromeOptions()  # create an options instance
    option.add_argument('--headless')  # headless mode, run in background
    # Create the browser.
    # NOTE(review): `option` is built but deliberately NOT passed below
    # (the kwarg is commented out), so this run is not actually headless.
    browser = Chrome()  # (options=option)
    Url = 'https://www.researchgate.net/profile/Jodie_Abbatangelo-Gray2'
    print('开始爬取')
    info = spid(Url, browser)
    print('爬取完毕:\ninfo:\n', info)
    # browser.quit()
예제 #22
0
def _open_browser(factory, browser_name):
    """Call *factory* to start a browser; on failure, mail the error,
    log the traceback, and re-raise."""
    try:
        return factory()
    except Exception as e:
        title = "{} {} headless浏览器打开失败".format(
            datetime.datetime.now(), browser_name)
        content = "错误原因是:{}".format(e)
        send_mail(title=title, content=content)  # custom mail helper
        logger.exception(e)
        raise e


def get_browser(headless: bool = True, browser_class: int = 1) -> Firefox:
    """
    获取一个浏览器 (get a browser).

    :param headless: run without a visible window.
    :param browser_class: 0 = Chrome, 1 = Firefox; servers cannot use Chrome.
    :return: a started WebDriver.

    Headless browsers follow the OS language, so the Firefox profile pins
    ``intl.accept_languages`` to zh-cn to keep scraped data in the right
    language.  Install the regular browser before the headless driver:
    geckodriver (https://github.com/mozilla/geckodriver/releases) goes in
    /usr/local/bin with exec permission (sudo chmod +x); chromedriver
    (https://chromedriver.storage.googleapis.com/index.html?path=2.35/)
    goes in the Chrome directory, typically /opt/google/chrome/ on Ubuntu.
    Chrome's headless browser reportedly misbehaves when run as root,
    while Firefox's does not.
    """
    if browser_class == 1:
        profile = FirefoxProfile()
        profile.set_preference("intl.accept_languages", "zh-cn")
        options = FirefoxOptions()
        options.add_argument("--headless")
        # The duplicated try/except/mail/raise blocks are factored into
        # _open_browser; behaviour and messages are unchanged.
        if headless:
            browser = _open_browser(
                lambda: Firefox(firefox_profile=profile,
                                executable_path=firefox_driver,
                                firefox_options=options),
                "Firefox")
        else:
            browser = _open_browser(
                lambda: Firefox(firefox_profile=profile,
                                executable_path=firefox_driver),
                "Firefox")
    else:
        options = ChromeOptions()
        options.add_experimental_option("excludeSwitches",
                                        ["ignore-certificate-errors"])
        if headless:
            options.add_argument("--headless")
        # Both Chrome branches used identical launch code; one call suffices.
        browser = _open_browser(
            lambda: Chrome(executable_path=chrome_driver,
                           chrome_options=options),
            "Chrome")
    return browser
예제 #23
0
class Profiler:
    """Profile a Binance-Smart-Chain token address by scraping public sites.

    Drives one headless Chrome instance against:

    * poocoin.app      -- LP links, BNB liquidity, market cap, sell txs
    * bscscan.com      -- tx counts, holder counts, token age, LP holders
    * tokensniffer.com -- a coarse "SCAM" / "404" / "OKAY" verdict

    Intended to be used as a context manager so the browser window is
    closed when the code exits.
    """

    def __init__(self):
        # Headless Chrome at a fixed window size so the hard-coded XPaths
        # below resolve against a consistent page layout.
        self.chrome_options = ChromeOptions()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--window-size=1920x1080")
        self.chrome_options.add_argument("--log-level=3")

        # Use the chrome driver in the same directory as this file, regardless
        # of what the current working directory is.
        filepath = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
        self.chrome_driver = filepath + "/chromedriver_win.exe"

        self.driver = webdriver.Chrome(options=self.chrome_options,
                                       executable_path=self.chrome_driver)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Dispose of the driver window correctly when code exits."""
        # NOTE(review): close() only closes the window; quit() would also end
        # the chromedriver process -- confirm which is intended.
        self.driver.close()

    def query_token_sniffer(self, address):
        """Return 'SCAM', '404' or 'OKAY' for *address* from tokensniffer.com."""
        # TODO: Refactor this to a simple HTTP request
        url = "https://tokensniffer.com/token/" + address
        self.driver.get(url)
        sleep(1)

        if "WARNING" in self.driver.page_source:
            return "SCAM"
        if "This page could not be found" in self.driver.page_source:
            return "404"
        return "OKAY"

    def query_poocoin(self, address):
        """Scrape poocoin.app for LP addresses, BNB liquidity and market cap.

        Returns a dict with keys: sell_exists, v1_lp_address, v2_lp_address,
        v1_bnb_holdings, v2_bnb_holdings, market_cap.  A zeroed dict is
        returned on page-load timeout.
        """
        # Direct driver to Poocoin URL
        url = 'https://poocoin.app/tokens/' + address
        self.driver.get(url)
        # Await page load by querying a specific element
        max_delay = 10
        try:
            WebDriverWait(self.driver, max_delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'px-3')))
        except TimeoutException:
            print("Loading took too much time!")
            return {
                "sell_exists": False,
                "v1_lp_address": "",
                "v2_lp_address": "",
                "v1_bnb_holdings": 0,
                "v2_bnb_holdings": 0,
                "market_cap": "$0"
            }
        sleep(1)

        # Links to BSCScan for the V1 and V2 Liquidity Providers
        v1_lp_address = self.driver.find_element_by_xpath(
            "//*[@id='root']/div/div[1]/div[2]/div/div[1]/div[2]/small/a[2]"
        ).get_attribute('href')

        v2_lp_address = self.driver.find_element_by_xpath(
            "//*[@id='root']/div/div[1]/div[2]/div/div[1]/div[2]/small/a[4]"
        ).get_attribute('href')

        # Parse the BNB holdings out of the "V1 ... BNB ... V2: ... BNB" text.
        bnb_lp_values = self.driver.find_element_by_xpath("//*[@id='root']/div/div[1]/div[2]/div/div[1]/div[2]/small").text
        values = bnb_lp_values.split('BNB')
        v1_bnb = float(re.sub("[^0-9.]", "", values[0].replace("V1", "")))
        v2_bnb = float(re.sub("[^0-9.]", "", values[1].replace("V2", "").split(":")[1]))

        market_cap = self.driver.find_element_by_xpath("//*[@id='root']/div/div[1]/div[2]/div/div[1]/div[2]/span[1]").text

        # Determine if any Sell transactions have taken place; the table may
        # be absent for brand-new tokens, in which case assume no sells.
        try:
            tx_table = self.driver.find_element_by_xpath(
                "//*[@id='root']/div/div[1]/div[2]/div/div[2]/div[2]/div/div[3]/div[1]/div/div[2]")
            sell_txs = bool(tx_table.text.count("Sell"))
        except Exception:
            sell_txs = False

        return {
            "sell_exists": sell_txs,
            "v1_lp_address": v1_lp_address,
            "v2_lp_address": v2_lp_address,
            "v1_bnb_holdings": v1_bnb,
            "v2_bnb_holdings": v2_bnb,
            "market_cap": market_cap
        }

    def query_bscscan_token(self, address):
        """Scrape bscscan.com for tx count, holder count, age and tx table.

        Returns a dict with keys: num_transactions, num_holders, age
        (datetime of the earliest visible transaction) and tx_df (DataFrame
        of the last page of transactions).  A zeroed dict is returned on
        page-load timeout.
        """
        # Direct driver to given token URL
        url = 'https://bscscan.com/token/' + address
        self.driver.get(url)
        # Await page load by querying a specific element
        max_delay = 25
        try:
            WebDriverWait(self.driver, max_delay).until(
                EC.presence_of_element_located((By.ID, 'totaltxns')))
        except TimeoutException:
            print("FAIL - Loading took too much time!")
            # NOTE(review): other snippets in this module call
            # datetime.datetime.now(); this assumes
            # `from datetime import datetime` -- confirm the import style.
            return {
                "num_transactions": 0,
                "num_holders": 0,
                "age": datetime.now(),
                "tx_df": pd.DataFrame(),
            }
        sleep(0.5)

        # Extract total number of transactions
        transactions = self.driver.find_element_by_id("totaltxns").text
        num_transactions = int(re.sub("[^0-9]", "", transactions))

        # Extract number of token holders
        holders = self.driver.find_element_by_class_name("mr-3").text
        num_holders = int(re.sub("[^0-9]", "", holders))

        # Focus the transactions iframe
        WebDriverWait(self.driver, 15).until(EC.frame_to_be_available_and_switch_to_it(
            (By.XPATH, "//*[@id='tokentxnsiframe']")))

        # Toggle the Age column to absolute DateTime format if needed
        age_col = self.driver.find_element_by_xpath("//*[@id='lnkTokenTxnsAgeDateTime']").text
        if age_col == "Age":
            WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@id='lnkTokenTxnsAgeDateTime']"))).click()

        # Select Last Page of TXs (may not exist if 1 page only)
        try:
            WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@id='maindiv']/div[1]/nav/ul/li[5]/a/span[1]"))).click()
        except Exception:
            pass

        # Parse raw HTML with BeautifulSoup and scrape the HTML table
        soup = BeautifulSoup(self.driver.page_source, features="html.parser")
        table_data = soup.find(
            "table", {"class": "table table-md-text-normal table-hover mb-4"})
        tx_df = pd.read_html(str(table_data))[0]
        tx_df.dropna(axis=1, how='all', inplace=True)

        # Age of the token = datetime of the earliest visible transaction
        tx_df["Date Time (UTC)"] = pd.to_datetime(tx_df["Date Time (UTC)"])
        earliest_tx = tx_df["Date Time (UTC)"].min()

        # TODO: Hunt for whales -- a stashed holders-table scrape used to
        # live here (see version control history).

        return {
            "num_transactions": num_transactions,
            "num_holders": num_holders,
            "age": earliest_tx,
            "tx_df": tx_df,
        }

    def query_bscscan_liquidity_providers(self, url):
        """Scrape an LP holders table from a bscscan.com holders *url*.

        Returns a DataFrame of holders with added columns
        ``is_contract_address`` and ``num_lp_holders``, or an empty
        DataFrame on page-load timeout.
        """
        # Direct driver to given LP URL
        self.driver.get(url)
        # Await page load by querying a specific element
        max_delay = 25
        try:
            WebDriverWait(self.driver, max_delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'mr-3')))
        except TimeoutException:
            # Fixed: previously returned the pd.DataFrame *class* (missing
            # parentheses), which broke callers checking `.empty`.
            return pd.DataFrame()

        sleep(1)

        # Extract number of token holders
        holders = self.driver.find_element_by_class_name("mr-3").text
        num_lp_holders = int(re.sub("[^0-9]", "", holders))

        # Focus the holders-table iframe
        WebDriverWait(self.driver, 15).until(EC.frame_to_be_available_and_switch_to_it(
            (By.XPATH, "//*[@id='tokeholdersiframe']")))
        sleep(1)

        # Identify contract addresses by their file icon: walk from each
        # <i> icon up three levels (<span> -> <td> -> <tr>) and read the
        # leading <td> that carries the row number.
        icons = self.driver.find_elements_by_class_name("fa-file-alt")
        icons = [i.find_element_by_xpath('..').find_element_by_xpath('..').find_element_by_xpath('..')
                 .get_attribute('innerHTML')[:10] for i in icons]
        # Then parse out the <td></td> HTML tags to get the 0-based row number
        contract_rows = [int(re.sub("[^0-9]", "", i)) - 1 for i in icons]

        # Parse raw HTML with BeautifulSoup and scrape the HTML table
        soup = BeautifulSoup(self.driver.page_source, features="html.parser")
        table_data = soup.find(
            "table", {"class": "table table-md-text-normal table-hover"})
        df = pd.read_html(str(table_data))[0]
        df.dropna(axis=1, how='all', inplace=True)

        # Boolean for IsContractAddress, indicated by the icon on BSCscan
        df["is_contract_address"] = False
        df.loc[contract_rows, "is_contract_address"] = True

        # Attach the total holder count to every row for convenience
        df["num_lp_holders"] = num_lp_holders

        return df

    def profile_token(self, address):
        """Build a full risk-profile dict for *address*.

        Combines Poocoin, BSCScan and TokenSniffer data and computes the
        amount of locked liquidity (LP tokens held by the burn address).
        """
        # Start by querying Poocoin to get BSCScan LP links
        poocoin_stats = self.query_poocoin(address)
        # Exit early when there is (almost) no liquidity in either pool
        if poocoin_stats["v1_bnb_holdings"] < 1 and poocoin_stats["v2_bnb_holdings"] < 1:
            poocoin_stats["locked_liquidity"] = 0
            poocoin_stats["tx_df"] = pd.DataFrame()
            poocoin_stats["stats"] = {"age": pd.Timestamp.now()}
            poocoin_stats["token_sniffer"] = "404"

            return poocoin_stats

        # Query token on BSCScan
        bscscan_stats = self.query_bscscan_token(address)

        # Query Liquidity Provider holders on BSCScan
        # [Rank, Address, Quantity, Percentage, is_contract_address]
        v1_lp_holders = self.query_bscscan_liquidity_providers(poocoin_stats["v1_lp_address"])
        v2_lp_holders = self.query_bscscan_liquidity_providers(poocoin_stats["v2_lp_address"])

        def check_locked_liquidty(df, liquidity_value):
            """Return the BNB value of LP tokens held by the burn address."""
            if "There are no matching entries" == df["Percentage"].iloc[0]:
                return 0
            # Convert "12.34%" strings to fractions, drop bogus >100% rows,
            # and scale by the pool's BNB value.
            df["percent_float"] = df["Percentage"].apply(lambda x: float(''.join(i for i in x if i not in '%,')) / 100)
            df = df[df["percent_float"] <= 100]
            df["bnb_value"] = df["percent_float"] * liquidity_value

            total_locked = 0
            # Liquidity counts as locked when LP tokens sit at the burn address.
            dead_address = "0x000000000000000000000000000000000000dead"
            # Fixed: `in df["Address"]` tested the Series *index*, not its
            # values, so burned liquidity was never counted.
            if dead_address in df["Address"].values:
                total_locked += sum(df[df["Address"] == dead_address]["bnb_value"])

            return total_locked

        # Calculate locked liquidity across both pools
        total_locked = 0
        if not v1_lp_holders.empty:
            total_locked += check_locked_liquidty(v1_lp_holders, poocoin_stats["v1_bnb_holdings"])
        if not v2_lp_holders.empty:
            total_locked += check_locked_liquidty(v2_lp_holders, poocoin_stats["v2_bnb_holdings"])

        # Return full dictionary
        profile = poocoin_stats
        profile['v1_lp_holders'] = v1_lp_holders
        profile['v2_lp_holders'] = v2_lp_holders
        profile['stats'] = bscscan_stats
        profile['token_sniffer'] = self.query_token_sniffer(address)
        profile['locked_liquidity'] = total_locked
        return profile
예제 #24
0
import random
from selenium.webdriver.chrome.options import Options

import requests

chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Avoid being detected / blocked as an automated browser
from selenium.webdriver import ChromeOptions

option = ChromeOptions()
# Hide the "Chrome is being controlled by automated software" infobar
option.add_experimental_option('excludeSwitches', ['enable-automation'])

# Do not load Chrome's automation extension
option.add_experimental_option('useAutomationExtension', False)
# NOTE(review): a bare UA string (without a "user-agent=" prefix) is not a
# valid Chrome switch -- confirm this has the intended effect.
option.add_argument(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
)

# NOTE(review): `option` above is never passed to the driver here, and
# `webdriver` is not imported in this snippet -- presumably truncated.
bro = webdriver.Chrome()
# Inject stealth.min.js on every new document to mask webdriver fingerprints
with open('C:\\Users\Administrator\Desktop/stealth.min.js') as f:
    js = f.read()

bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})

bro.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument", {
예제 #25
0
 def _pwd_login(self) -> bool:
     """Log in to Aliyun mail with the task's account/password via headless Chrome.

     Collects the session cookies from the browser, hands them to the
     managed-cookie store and finishes with a cookie login.

     Returns True when the follow-up cookie login succeeds, False on any
     failure (the error is logged and written back).
     """
     res = False
     try:
         if platform == 'linux' or platform == 'linux2':
             driver_path = webdriver_path_debian
         elif platform == 'win32':
             driver_path = webdriver_path_win
         else:
             # Fixed: on any other platform `driver_path` was unbound and a
             # confusing NameError was raised; fail with a clear message
             # instead (still caught and logged below).
             raise RuntimeError('unsupported platform: {}'.format(platform))
         chrome_options = ChromeOptions()
         chrome_options.add_argument('--headless')
         chrome_options.add_argument('--disable-gpu')
         chrome_options.add_argument('blink-settings=imagesEnabled=false')
         chrome_options.add_argument('--no-sandbox')
         # Fixed: '--disable-gpu' was added twice and the shm flag carried a
         # garbage suffix ('...usagenmsbsohu123') that made Chrome ignore it.
         chrome_options.add_argument('--disable-dev-shm-usage')
         driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)
         try:
             driver.get('https://mail.aliyun.com/alimail/auth/login')
             wait = WebDriverWait(driver, 10)
             wait.until(lambda d: driver.find_element_by_xpath('//iframe[@id="alibaba-login-box"]'),
                        message='load login page fail!')
             driver.switch_to.frame('alibaba-login-box')
             account = self.task.account.split('@')[0]
             driver.find_element_by_xpath('//input[@id="fm-login-id"]').send_keys(account)
             driver.find_element_by_xpath('//input[@id="fm-login-password"]').send_keys(self.task.password)
             driver.find_element_by_xpath('//input[@id="fm-login-submit"]').click()
             wait.until(lambda d: driver.find_element_by_xpath("//*[text()='我的邮箱']"),
                        message='enter homepage fail!')
             # Serialize the session cookies into one header-style string.
             cookies = ''
             for cookie in driver.get_cookies():
                 cookies = cookies + cookie['name'] + '=' + cookie['value'] + ';'
         finally:
             # Fixed: always release the headless Chrome process; the
             # original leaked it whenever the login flow raised.
             driver.quit()
         self._ha._managedCookie.add_cookies('aliyun.com', cookies)
         res = self._cookie_login()
     except Exception as ex:
         self._logger.error("Pwd login error, err: {}".format(ex))
         self._write_log_back("账密登录失败: {}".format(ex.args))
     return res
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import math
import time


def calc(x):
    """Return log(|12 * sin(x)|) as a string, with x parsed as an int."""
    value = math.log(abs(12 * math.sin(int(x))))
    return str(value)


try:
    link = "http://suninjuly.github.io/explicit_wait2.html"
    options = ChromeOptions()
    options.add_argument("--start-maximized")
    browser = webdriver.Chrome(options=options)
    browser.get(link)

    # говорим Selenium проверять в течение 5 секунд, пока кнопка не станет кликабельной
    price = WebDriverWait(browser, 12).until(
        EC.text_to_be_present_in_element((By.XPATH, "//*[@id='price']"), "100")
    )

    button = browser.find_element_by_xpath("//*[@id='book']")
    button.click()

    x_element = browser.find_element_by_xpath("//*[@id='input_value']")
    x = x_element.text
예제 #27
0
    def _start_browser(self):
        """Launch a Chrome instance with a randomized fingerprint.

        Each optional flag is enabled with probability 1/3; the flags are
        considered in a fixed order so the sequence of randint() draws is
        deterministic relative to the RNG state.
        """
        assert self.browser is None, "Browser must not exist in order to call _start_browser!"

        # Reuse the profile from a normal (non-automated) Chrome install.
        user_profile = "C:\\Users\\Alex Thiel\\AppData\\Local\\Google\\Chrome\\User Data\\Default"

        options = Options()
        options.add_argument("user-data-dir={}".format(user_profile))
        options.add_experimental_option("excludeSwitches", [
            "ignore-certificate-errors",
            "safebrowsing-disable-download-protection",
            "safebrowsing-disable-auto-update",
            "disable-client-side-phishing-detection"
        ])
        os.environ["webdriver.chrome.driver"] = self.driver_path

        # Coin-flip each simple flag (order fixes the randint() sequence).
        maybe_flags = (
            ("--incognito", "Option: Incognito"),
            ("--disable-extensions", "Option: Disabling Extensions"),
            ("--disable-plugins-discovery", "Option: Disabling plugins discovery"),
            ("--no-referrers", "Option: No Referrers"),
            ("--disable-web-security", "Option: Disabled web security"),
            ("--allow-running-insecure-content", "Option: Allowing running insecure content"),
        )
        for flag, note in maybe_flags:
            if randint(0, 2) == 1:
                options.add_argument(flag)
                print(note)

        # Coin-flip the password-manager prefs blob.
        if randint(0, 2) == 1:
            options.add_experimental_option(
                'prefs', {
                    'credentials_enable_service': False,
                    'profile': {
                        'password_manager_enabled': False
                    }
                })
            print("Options: Disabled Password Manager")

        # Randomize the reported user agent as well.
        agent = UserAgent().random
        options.add_argument("user-agent=" + agent)
        self.current_agent = agent
        print("Option: Agent:", agent)

        # Launch the browser with the assembled options.
        self.browser = Driver(executable_path=self.driver_path,
                              chrome_options=options)
        self.browser.set_page_load_timeout(self.cfg.browser_timeout)
        self.browser.delete_all_cookies()

        # Randomize the window geometry: maximized, or a random size/position.
        if randint(0, 2) == 1:
            print("Option: Start Maximized")
            self.browser.maximize_window()
        else:
            self.browser.set_window_size(randint(700, 1080),
                                         randint(700, 1080))
            self.browser.set_window_position(randint(0, 300), randint(0, 300))
예제 #28
0
 def get_driver(self, name='chrome', type='headless'):
     """Return a (cached) Selenium WebDriver.

     name: 'phantomjs', 'chrome' (default) or 'firefox'.
     type: 'headless' enables headless mode for chrome/firefox.
     Returns None for any other *name*.
     """
     # todo: memory-leak handling; per-browser configuration management
     # Fixed: the original acquired the lock and, on a cache hit, returned
     # while still holding it -- deadlocking every subsequent call.  The
     # `with` statement releases the lock on every exit path.
     with self._instance_lock:
         if name in self._driver.keys():
             return self._driver[name]
     deploy_home = ConfigInit().get_conf().get('DEFAULT', 'deploy_home')
     if name == 'phantomjs':
         # PhantomJS: random user agent, images disabled to save bandwidth.
         dcap = dict(DesiredCapabilities.PHANTOMJS)
         dcap["phantomjs.page.settings.userAgent"] = (random.choice(
             consts.USER_AGENTS))
         dcap["phantomjs.page.settings.loadImages"] = False
         driver_phantomjs = webdriver.PhantomJS(
             desired_capabilities=dcap,
             executable_path=deploy_home + '/src/config/phantomjs')
         self._driver[name] = driver_phantomjs
         return driver_phantomjs
     elif name == 'chrome':
         opts = ChromeOptions()
         opts.add_argument('--no-sandbox')
         opts.add_argument('--disable-dev-shm-usage')
         dcap = dict(DesiredCapabilities.CHROME)
         dcap["chrome.page.settings.loadImages"] = False
         if type == 'headless':
             opts.add_argument("--headless")
         chrome_driver = webdriver.Chrome(
             desired_capabilities=dcap,
             executable_path=deploy_home +
             ConfigInit().get_config_by_option('chrome_path'),
             chrome_options=opts)
         self._driver[name] = chrome_driver
         return chrome_driver
     elif name == 'firefox':
         opts = FirefoxOptions()
         if type == 'headless':
             opts.add_argument("--headless")
         firefox_driver = webdriver.Firefox(executable_path=deploy_home +
                                            '/src/config/geckodriver_mac',
                                            firefox_options=opts)
         self._driver[name] = firefox_driver
         return firefox_driver
예제 #29
0
from selenium.webdriver import Chrome, ChromeOptions
import time
import pymysql
import sys
import spider_foreign_sl
from time import *

option = ChromeOptions()  # create the browser options instance
option.add_argument('--headless')  # run the browser in the background
option.add_argument('--no-sandbox')
option.add_argument('--disable-dev-shm-usage')
option.add_argument('blink-settings=imagesEnabled=false')
option.add_argument('--disable-gpu')

# Create the browser
schoolbrowser = Chrome(executable_path="/home/baize/Chrome/chromedriver",
                       options=option)  # (options=option)
# URL of the school to be crawled
schoolUrl = 'https://www.researchgate.net/institution/University_of_Chicago/departments'
schoolbrowser.get(schoolUrl)  # open the target page
sleep(2)
# Connect to the database
# NOTE(review): credentials appear scrubbed ("******") -- fill in real ones.
conn = pymysql.connect(host="39.106.96.175",
                       port=3306,
                       db="scholar_info",
                       user="******",
                       password="******",
                       charset="utf8")
cls = conn.cursor()
# Derive the table name from the school's URL segment
schoolget = schoolUrl.split("/")[-2].replace('%20', '_')
# Create a table per school
예제 #30
0
import time
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

# Demo: drive the online interest calculator and exercise <select> drop-downs.
url = "http://www.calculator.net/interest-calculator.html"
chrome_path = '/home/xuananh/data/Downloads/chromedriver_linux64/chromedriver'
# chrome driver download from link: https://chromedriver.storage.googleapis.com/index.html?path=2.44/

chrome_option = ChromeOptions()
driver = Chrome(executable_path=chrome_path, chrome_options=chrome_option)
driver.maximize_window()
driver.get(url)

# selecting an item from Drop Down list Box
drop_down = Select(driver.find_element_by_id("ccompound"))
drop_down.select_by_visible_text("monthly")

# alternative way to select an item: click the select, then the option
# driver.find_element_by_id("ccompound").click()
# driver.find_element_by_xpath('//option[@value="monthly"]').click()

# you can also use the following Select methods:
# drop_down.select_by_index(1)
# drop_down.select_by_value("continuously")

print('Is Selected:  ', driver.find_element_by_id('ccompound').is_selected())
print('Is Enabled:   ', driver.find_element_by_id('ccompound').is_enabled())
print('Is Displayed: ', driver.find_element_by_id('ccompound').is_displayed())
print('text:         ', drop_down.first_selected_option.text)
예제 #31
0
def get_amzn_driver(email, password, headless=False, session_path=None):
    """Return a Chrome WebDriver logged into Amazon for *email*.

    Downloads a matching chromedriver into the current working directory if
    one is not already present, drives the account-switcher login flow, and
    waits up to 5 minutes (allowing manual 2FA) for the order-report page.

    headless: run Chrome without a visible window.
    session_path: optional Chrome user-data dir to persist the session.
    Exits the process (status 1) if login cannot be completed in time.
    """
    executable_path = os.path.join(os.getcwd(), 'chromedriver')
    if _platform in ['win32', 'win64']:
        executable_path += '.exe'

    zip_type = CHROME_ZIP_TYPES.get(_platform)

    if not os.path.exists(executable_path):
        if zip_type is None:
            # Fixed: without this guard an unsupported platform built a
            # nonsense download URL containing "None".
            raise RuntimeError(
                'No chromedriver download known for platform {}'.format(
                    _platform))
        zip_file_url = CHROME_DRIVER_BASE_URL.format(
            CHROME_DRIVER_VERSION, zip_type)
        request = requests.get(zip_file_url)

        if request.status_code != 200:
            raise RuntimeError(
                'Error finding chromedriver at {}, status = {}'.format(
                    zip_file_url, request.status_code))

        zip_file = zipfile.ZipFile(io.BytesIO(request.content))
        zip_file.extractall()
        # chromedriver must be marked executable on POSIX systems.
        os.chmod(executable_path, 0o755)

    chrome_options = ChromeOptions()
    if headless:
        chrome_options.add_argument('headless')
        chrome_options.add_argument('no-sandbox')
        chrome_options.add_argument('disable-dev-shm-usage')
        chrome_options.add_argument('disable-gpu')
        # chrome_options.add_argument("--window-size=1920x1080")
    if session_path is not None:
        chrome_options.add_argument("user-data-dir=" + session_path)

    logger.info('Logging into Amazon.com')

    driver = Chrome(chrome_options=chrome_options,
                    executable_path=executable_path)

    driver.get(ORDER_HISTORY_URL_VIA_SWITCH_ACCOUNT_LOGIN)

    driver.implicitly_wait(2)

    def get_element_by_id(driver, elem_id):
        """Return the element with *elem_id*, or None when absent."""
        try:
            return driver.find_element_by_id(elem_id)
        except NoSuchElementException:
            return None

    def get_element_by_xpath(driver, xpath):
        """Return the first element matching *xpath*, or None when absent."""
        try:
            return driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return None

    # Go straight to the account switcher, and look for the given email.
    # If present, click on it! Otherwise, click on "Add account".
    desired_account_element = get_element_by_xpath(
        driver,
        "//div[contains(text(), '{}')]".format(email))
    if desired_account_element:
        desired_account_element.click()
        driver.implicitly_wait(2)

        # It's possible this account has already authed recently. If so, the
        # next block will be skipped and the login is complete!
        if not get_element_by_id(driver, 'report-confirm'):
            driver.find_element_by_id('ap_password').send_keys(
                get_password(password))
            driver.find_element_by_name('rememberMe').click()
            driver.find_element_by_id('signInSubmit').submit()
    else:
        # Cannot find the desired account in the switch. Log in via Add Account
        driver.find_element_by_xpath(
            '//div[text()="Add account"]').click()
        driver.implicitly_wait(2)

        driver.find_element_by_id('ap_email').send_keys(email)

        # Login flow sometimes asks just for the email, then a
        # continue button, then password.
        if get_element_by_id(driver, 'continue'):
            driver.find_element_by_id('continue').click()
            driver.implicitly_wait(2)

        driver.find_element_by_id('ap_password').send_keys(
            get_password(password))
        driver.find_element_by_name('rememberMe').click()
        driver.find_element_by_id('signInSubmit').submit()

    driver.implicitly_wait(2)

    if not get_element_by_id(driver, 'report-confirm'):
        logger.warning('Having trouble logging into Amazon. Please see the '
                       'browser and complete login within the next 5 minutes. '
                       'This script will continue automatically on success. '
                       'You may need to manually navigate to: {}'.format(
                           ORDER_HISTORY_REPORT_URL))
        if get_element_by_id(driver, 'auth-mfa-otpcode'):
            logger.warning('Hint: Looks like an auth challenge! Maybe check '
                           'your email')
    try:
        # Block until the report page is reached (allows manual 2FA).
        wait_cond = EC.presence_of_element_located((By.ID, 'report-confirm'))
        WebDriverWait(driver, 60 * 5).until(wait_cond)
    except TimeoutException:
        # NOTE(review): exiting the process from a library-style function is
        # harsh; kept for backward compatibility with existing callers.
        logger.critical('Cannot complete login!')
        exit(1)

    return driver
예제 #32
0
def spiderweibo(keywords, timepara, province, city, district, locpara,
                resultfile):
    """Crawl Weibo search results for *keywords* and append rows to a CSV.

    Drives a local Chrome (reusing a profile that is already logged in
    to Weibo) against s.weibo.com, walks every result page, extracts
    per-post fields (author, text, geotag, timestamp, engagement
    counts, and the forwarded original post when the card is a repost)
    and appends one CSV row per post to *resultfile*.

    Args:
        keywords:   search query string.
        timepara:   value for Weibo's ``timescope=custom:`` filter.
        province, city, district: location labels written to the CSV.
        locpara:    extra URL query fragment restricting the location.
        resultfile: path of the CSV file rows are appended to.

    Returns:
        "" when the crawl ran while logged in, "N" when Weibo showed a
        login prompt (in that case nothing is crawled).
    """
    # Configure Chrome; the user-data-dir below holds the Weibo login.
    option = ChromeOptions()
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2,
            'javascript': 1
        }
    }  # block images (2) but keep JavaScript enabled (1)
    option.add_experimental_option('prefs', prefs)
    #option.add_argument('--headless')  # run without a visible browser window
    #options.headless = True
    #option.add_argument('--disable-gpu') #for windows not for macos
    option.add_argument(
        r"user-data-dir=/Users/Yokimsu/Library/Caches/Google/Chrome/User Data/Profile 1"
    )  # Chrome profile dir that keeps the Weibo login cookies
    browser = Chrome("/Users/YokimSu/chromedriver", 0, options=option)

    url0 = "https://s.weibo.com/weibo?q=" + keywords + locpara + "&typeall=1&suball=1&timescope=custom:" + timepara

    #browser.minimize_window()  # minimize the browser window
    browser.get(url0)
    time.sleep(3)

    soup = BeautifulSoup(browser.page_source, 'lxml')
    loginYN = ""
    try:
        # A "login" link inside the hint box means we are NOT logged
        # in; flag it ("N") and skip crawling.
        loginYN = soup.find('div', {
            'class': 'm-hint'
        }).find('a', {'action-type': 'login'})
        loginYN = "N"
        print("未登录,程序终止,请重新登录后在操作!")
    except:
        pass

    try:
        # Page count = number of <li> entries in the pager widget.
        pagenum = len(soup.find('ul', {'class': 's-scroll'}).find_all('li'))
        #print(pagenum)
        print("-------------------------")
        print("总页数%d" % pagenum)
        # Weibo caps search results at 50 pages, so results may be
        # truncated; log these queries so they can be re-run later with
        # a finer filter (e.g. by province).
        if pagenum == 50:
            with open('超过50页.csv', 'a', encoding='utf8', newline='') as f:
                current = time.time()
                current = time.localtime(current)
                timestr = "" + str(current.tm_year) + "/" + str(
                    current.tm_mon) + "/" + str(current.tm_mday) + " " + str(
                        current.tm_hour) + ":" + str(
                            current.tm_min) + ":" + str(current.tm_sec)
                writer = csv.writer(f)
                writer.writerow((timestr, pagenum, url0, keywords, timepara,
                                 province, city, district, locpara))
    except:  # raised when there is only one page (no pager element)
        pagenum = 1
        print("-------------------------")
        print("总页数%d" % pagenum)

    # Check whether the search returned any results at all.
    yn = ""
    try:
        yn = soup.find('div', {
            'class': 'card card-no-result s-pt20b40'
        }).find('p').get_text().strip()
        pagenum = 1
        # The block below would also record no-result queries in the
        # result CSV, to make empty searches reviewable later.
        """
        with open(resultfile, 'a', encoding='utf8', newline='') as f:
            current = time.time()
            current = time.localtime(current)
            timestr = "" + str(current.tm_year) + "/" + str(current.tm_mon) + "/" + str(current.tm_mday) + " " + str(current.tm_hour) + ":" + str(current.tm_min) + ":" + str(current.tm_sec)
            writer = csv.writer(f)
            writer.writerow((timestr, url0,timepara, province,city,district,yn,"", "","","", "", "", "",
                             "","","", "","","","","","","","","","",""))
        """
        print(url0 + "----此页无结果!")
    except:
        yn = ""
        if pagenum <= 50:
            for page in range(0, pagenum):
                try:
                    # Open this result page in the browser.
                    url = url0 + "&Refer=g&page=" + str(page + 1)
                    browser.get(url)
                    # URL variant with only the time filter:
                    """
                    browser.get(
                        "https://s.weibo.com/weibo?q=" + keywords + "&typeall=1&suball=1&timescope=custom:" + timepara
                        + "&Refer=g&page=" + str(page + 1))
                    """
                    time.sleep(1)
                    # Switch back to the page's default content frame.
                    browser.switch_to_default_content()
                    # Parse the rendered page source.
                    soup = BeautifulSoup(browser.page_source, 'lxml')
                    # Each 'card' div holds one post and its metadata.
                    allinfo = soup.find_all('div', {'class': 'card'})

                    for eachitem in allinfo:
                        if eachitem.find('a', {'class': 'name'}):
                            # Poster's display name.
                            try:
                                username = eachitem.find(
                                    'a', {
                                        'class': 'name'
                                    }).get_text().strip()  # username
                            except:
                                username = "******"

                            # Poster's homepage URL.
                            try:
                                userlink = "https://www." + eachitem.find(
                                    'a', {
                                        'class': 'name'
                                    }).get('href').replace(
                                        "/member/", "").replace(
                                            "//", "").strip()  # user homepage
                            except:
                                userlink = ("None")
                            # Post text.
                            try:
                                contents = eachitem.find(
                                    'p', {
                                        'class': 'txt'
                                    }).get_text().strip()  # post text
                            except:
                                contents = "None"
                            try:
                                location_temp = len(
                                    eachitem.find('p', {
                                        'class': 'txt'
                                    }).find_all('a'))
                            except:
                                location_temp = 1
                            try:
                                location = eachitem.find(
                                    'p', {
                                        'class': 'txt'
                                    }).find_all('a')[location_temp -
                                                     1].get_text().replace(
                                                         "2", "").strip()  # geotag label
                                location_link = eachitem.find(
                                    'p', {
                                        'class': 'txt'
                                    }).find_all('a')[location_temp - 1].get(
                                        'href').strip()  # geotag link
                            except:
                                location = "None"
                                location_link = "None"
                            #print(location,location_link)
                            # Detect the repost source (not needed for now):
                            """
                            try:
                                pattern = re.compile(r'//@.*?:', re.I)
                                resulttemp = pattern.findall(contents)
                                source = resulttemp[0]
                                source = source.replace("//@", "").replace(":", "")
                                #print(source)
                            except:
                                source = "None"
                            if source == "None":
                                try:
                                    source = eachitem.find('div', {'class': 'card-comment'}).find('div', {
                                        'node-type': 'feed_list_forwardContent'}).find('a').get_text().strip()  # 转发来源
                                except:
                                    pass
                            """
                            # Post timestamp; some cards carry two
                            # 'from' paragraphs, in which case the
                            # second one holds the date link.
                            try:
                                tempnum = len(
                                    eachitem.find_all('p', {'class': 'from'}))
                            except:
                                tempnum = 1
                            if tempnum > 1:
                                try:
                                    post_date = eachitem.find_all(
                                        'p', {'class': 'from'})[1].find_all(
                                            "a")[0].get_text().strip()  # post time
                                    contents_link = "https://www." + eachitem.find_all(
                                        'p', {'class': 'from'
                                              })[1].find_all("a")[0].get(
                                                  'href').strip().replace(
                                                      "//", "")
                                    try:
                                        post_date_date = post_date.split(
                                            " ",
                                            1)[0].replace("年", "/").replace(
                                                "月", "/").replace("日", "")
                                        post_date_time = post_date.split(
                                            " ", 1)[1]
                                    except:
                                        post_date_date = "None"
                                        post_date_time = "None"
                                except:
                                    post_date = "None"
                                    contents_link = "None"
                                    post_date_date = "None"
                                    post_date_time = "None"
                            else:
                                try:
                                    post_date = eachitem.find_all(
                                        'p', {'class': 'from'})[0].find_all(
                                            "a")[0].get_text().strip()  # post time
                                    contents_link = "https://www." + eachitem.find_all(
                                        'p', {'class': 'from'
                                              })[0].find_all("a")[0].get(
                                                  'href').strip().replace(
                                                      "//", "")
                                    try:
                                        post_date_date = post_date.split(
                                            " ",
                                            1)[0].replace("年", "/").replace(
                                                "月", "/").replace("日", "")
                                        post_date_time = post_date.split(
                                            " ", 1)[1]
                                    except:
                                        post_date_date = "None"
                                        post_date_time = "None"
                                except:
                                    post_date = "None"
                                    contents_link = "None"
                                    post_date_date = "None"
                                    post_date_time = "None"

                            # Skip posts dated "今天" ("today"): their
                            # relative timestamps cannot be normalized.
                            if post_date.find("今天") > -1:
                                continue
                            # Client the post was published from.
                            try:
                                pingtai = "【来自】" + eachitem.find(
                                    'p', {
                                        'class': 'from'
                                    }).find_all(
                                        "a")[1].get_text().strip()  # source client
                            except:
                                pingtai = "None"

                            # Favorites count.
                            try:
                                favorite_num = eachitem.find(
                                    'div', {
                                        'class': 'card-act'
                                    }).find_all("a")[0].get_text().replace(
                                        "收藏", "").strip()
                            except:
                                favorite_num = "None"
                            # Repost count.
                            try:
                                repost_num = eachitem.find(
                                    'div', {
                                        'class': 'card-act'
                                    }).find_all("a")[1].get_text().replace(
                                        "转发", "").strip()
                            except:
                                repost_num = "None"
                            # Comment count.
                            try:
                                comments_num = eachitem.find(
                                    'div', {
                                        'class': 'card-act'
                                    }).find_all("a")[2].get_text().replace(
                                        "评论", "").strip()
                            except:
                                comments_num = "None"
                            # Like count (4th action link in the card).
                            try:
                                reward_num = eachitem.find(
                                    'div', {
                                        'class': 'card-act'
                                    }).find_all("a")[3].get_text().strip()
                            except:
                                reward_num = "None"

                            # When the card is a repost, also extract
                            # the forwarded original post.
                            try:
                                username2 = eachitem.find(
                                    'div', {
                                        'class': 'card-comment'
                                    }).find('a').get('nick-name').strip()  # original poster's name
                            except:
                                username2 = "None"
                            try:
                                userlink2 = "https://www." + eachitem.find(
                                    'div', {
                                        'class': 'card-comment'
                                    }).find('a').get('href').strip().replace(
                                        "//", "")  # original poster's homepage
                            except:
                                userlink2 = "None"
                            #print(username2,userlink2)
                            try:
                                contents2 = eachitem.find(
                                    'div', {
                                        'class': 'card-comment'
                                    }).find('p', {
                                        'class': 'txt'
                                    }).get_text().strip()  # original post text
                            except:
                                contents2 = "None"
                            try:
                                post_date2 = eachitem.find(
                                    'div', {
                                        'class': 'card-comment'
                                    }).find('p', {
                                        'class': 'from'
                                    }).find_all(
                                        "a")[0].get_text().strip()  # original post time
                            except:
                                post_date2 = "None"
                            try:
                                contents2_link = "https://www." + eachitem.find(
                                    'div', {
                                        'class': 'card-comment'
                                    }).find('p', {
                                        'class': 'from'
                                    }).find_all("a")[0].get(
                                        'href').strip().replace("//", "")
                            except:
                                contents2_link = "None"
                            try:
                                pingtai2 = "【来自】" + eachitem.find(
                                    'div', {
                                        'class': 'card-comment'
                                    }).find('p', {
                                        'class': 'from'
                                    }).find_all(
                                        "a")[1].get_text().strip()  # source client
                            except:
                                pingtai2 = "None"
                            try:
                                tempinfo2 = eachitem.find(
                                    'div', {
                                        'class': 'card-comment'
                                    }).find('div', {
                                        'class': 'func'
                                    }).find('ul', {
                                        'class': 'act s-fr'
                                    }).find_all('li')
                            except:
                                tempinfo2 = "None"
                            try:
                                repost_num2 = tempinfo2[0].get_text().replace(
                                    "转发", "").strip()
                            except:
                                repost_num2 = "None"
                            try:
                                comments_num2 = tempinfo2[1].get_text(
                                ).replace("评论", "").strip()
                            except:
                                comments_num2 = "None"
                            try:
                                reward_num2 = tempinfo2[2].get_text().strip()
                            except:
                                reward_num2 = "None"

                            # Append one CSV row per post, prefixed with
                            # the crawl timestamp and query parameters.
                            with open(resultfile,
                                      'a',
                                      encoding='utf8',
                                      newline='') as f:
                                current = time.time()
                                current = time.localtime(current)
                                timestr = "" + str(
                                    current.tm_year
                                ) + "/" + str(current.tm_mon) + "/" + str(
                                    current.tm_mday) + " " + str(
                                        current.tm_hour) + ":" + str(
                                            current.tm_min) + ":" + str(
                                                current.tm_sec)
                                writer = csv.writer(f)
                                writer.writerow(
                                    (timestr, url, timepara, province, city,
                                     district, yn, username, userlink,
                                     contents, location, location_link,
                                     post_date, post_date_date, post_date_time,
                                     pingtai, favorite_num, repost_num,
                                     comments_num, reward_num, contents_link,
                                     username2, userlink2, contents2,
                                     post_date2, pingtai2, repost_num2,
                                     comments_num2, reward_num2,
                                     contents2_link))

                except Exception as e:
                    print("error", str(e))
                print("----------")
                #print(page+1,pagenum,timepara)
                print("第%d/%d页爬取完成!时间参数:%s" % ((page + 1), pagenum, timepara))

    browser.quit()
    return loginYN
예제 #33
0
 def launch_application(browser_name, app_url):
     """Launch the requested browser, store it in the module-global
     ``driver`` and navigate to *app_url*.

     Args:
         browser_name: one of "chrome", "firefox" or "ie".
         app_url: URL opened once the browser has started.
     """
     global driver
     log.info("in init method of selenium base")
     try:
         if browser_name == "chrome":
             option = ChromeOptions()
             option.add_argument("start-maximized")
             option.add_argument("--ignore-certificate-errors")
             option.add_argument("--disable-extensions")
             option.add_argument("--disable-infobars")
             option.add_argument("disable-notifications")
             driver = Chrome(executable_path="./drivers/chromedriver.exe",
                             options=option)
             log.info("chrome browser is launch successfully")
         elif browser_name == "firefox":
             profile = FirefoxProfile()
             profile.accept_untrusted_certs = True
             options = FirefoxOptions()
             options.add_argument("start-maximized")
             # Bug fix: the profile and options were built but never
             # handed to the driver, so untrusted certs were not
             # accepted and the window was not maximized.
             driver = Firefox(executable_path="./drivers/geckodriver.exe",
                              firefox_profile=profile, options=options)
             log.info("firefox browser is launch successfully")
         elif browser_name == "ie":
             driver = Ie(executable_path="./drivers/IEDriverServer.exe")
         else:
             log.error("browser name is incorrect", browser_name)
     except WebDriverException as e:
         # Log the actual exception instance, not the exception class.
         log.critical("exception", e)
     driver.implicitly_wait(5)
     driver.get(app_url)
예제 #34
0
 def run(self):
     """Start Chrome with the '--test-type' flag, execute the scripted
     steps, then close the browser window."""
     chrome_opts = ChromeOptions()
     chrome_opts.add_argument('--test-type')
     self.driver = Chrome(chrome_options=chrome_opts)
     self.perform_steps()
     self.driver.close()
예제 #35
0
import sys
from time import sleep

from selenium import webdriver as drv
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from selenium.common.exceptions import NoSuchElementException, WebDriverException, TimeoutException

from config import conf

# init, talk about "eyes"
# Launch Chrome with default options and share the driver with the
# rest of the suite through the global conf dict under key "dom".
opts = ChromeOptions()
br = drv.Chrome(chrome_options=opts)
br.implicitly_wait(15)
conf["dom"] = br

###################################
###################################

# NOTE(review): wait_for_xpath is imported twice below — harmless,
# but one occurrence could be dropped.
from utils import get_xpath, wait_for_xpath, wait_for_path, \
    tracer, chat, err, gen_url, click_xpath, wait_for_xpath, \
    wait_for_url, log
from dom_tools import where_in_nextbox, goto_nextbox_nav, \
    goto_nextbox, nextbox_sub_ensure
from test_storages import storages_roundtrip, storages_backup_avail

import pysnooper

예제 #36
0
 def chromeoptions(self):
     """Return ChromeOptions configured for quiet headless runs:
     headless mode, minimal Chrome logging, and no extensions."""
     opts = ChromeOptions()
     for flag in ('headless', '--log-level=3', '--disable-extensions'):
         opts.add_argument(flag)
     return opts
예제 #37
0
파일: mogujie.py 프로젝트: D-Tens/MoGuJie
import pymongo
import time
import json
import random
import hashlib
from config import *

# MongoDB handles; MONGO_URL / MONGO_DB / COLLECTION_NAME come from config.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
collec_mogu = db[COLLECTION_NAME]

# Desktop-Chrome user agent used for plain requests-based fetches.
headers = {
    'user-agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
}
# Headless Chrome instance for pages that need JavaScript rendering.
opt = ChromeOptions()
opt.headless = True
browser = Chrome(options=opt)


def request_html(url):
    """Fetch *url* with the module-level headers and return the body
    parsed into an lxml HTML element tree."""
    page = requests.get(url, headers=headers)
    return etree.HTML(page.text)


def kind_ls(html):
    shop_ls = html.xpath(
        '//div[@class="item-wrap"]/div[1]/a[@class="cate-item-link"]/@href'
    )[:-1]
    for shop_url in shop_ls:
예제 #38
0
    line_notify_token = '1NS7LLBpds6tZRmSEIlBH0m5YhQ3aUQit9nfJd6KveW'
    line_notify_api = 'https://notify-api.line.me/api/notify'
    #変数messageに文字列をいれて送信。 トークン名の隣に文字が来てしまうので最初に改行
    message = '\n' + mes
    payload = {'message': message}
    headers = {'Authorization': 'Bearer ' + line_notify_token}
    line_notify = requests.post(line_notify_api, data=payload, headers=headers)


# Notify myself (via LINE) whenever an exception occurs below.
try:
    # Absolute path of the current working directory.
    cwd = os.getcwd()

    # Scrape using a Selenium webdriver.
    options = ChromeOptions()
    # Enable headless mode (drop this argument to watch the browser).
    options.add_argument('--headless')
    # '--no-sandbox' was required for the script to run under cron.
    options.add_argument('--no-sandbox')
    # Create the Chrome WebDriver object.
    driver = Chrome(executable_path='/home/ubuntu/bin/chromedriver',
                    chrome_options=options)
    # Wait up to this many seconds for elements to load; shorter
    # values caused TimeoutException.
    driver.implicitly_wait(10)
    # atwiki's login page cannot be reached by typing its URL directly,
    # so navigate to it from another page via Selenium.
    driver.get("https://www65.atwiki.jp/44teck/pages/1.html")
    print("サイトに接続中...")
    driver.find_element_by_link_text("ログイン").click()
    driver.find_element_by_name("user").send_keys(user)
    driver.find_element_by_name("pass").send_keys(passward)
예제 #39
0
from selenium import webdriver
from time import sleep
# Options object used for headless (no visible window) operation
from selenium.webdriver.chrome.options import Options
# ChromeOptions used to reduce automation detection
from selenium.webdriver import ChromeOptions

# Headless (no visible window) configuration.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Hide the "enable-automation" switch so sites are less likely to
# detect that the browser is driven by Selenium.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

# Launch Chrome with both option sets applied.
bro = webdriver.Chrome(executable_path='./chromedriver.exe',
                       chrome_options=chrome_options,
                       options=option)

# Headless browsing (phantomJS is an alternative headless browser).
bro.get('https://www.baidu.com')

print(bro.page_source)
sleep(2)
bro.quit()
예제 #40
0
def set_driver(driver_path, headless_flg):
    """Create and return a configured Chrome WebDriver.

    Args:
        driver_path: chromedriver filename or relative path, resolved
            against the current working directory.
        headless_flg: truthy to run Chrome without a visible window.

    Returns:
        A selenium ``Chrome`` instance with the options below applied.
    """
    options = ChromeOptions()

    # Headless (no-window) mode. Idiomatic truthiness test instead of
    # the original '== True' comparison.
    if headless_flg:
        options.add_argument('--headless')

    # Launch options.
    options.add_argument(
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    )
    #options.add_argument('log-level=3')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')
    options.add_argument('--incognito')  # private-browsing mode

    # Build the driver. os.path.join handles the path separator
    # portably (the original hard-coded "/").
    return Chrome(executable_path=os.path.join(os.getcwd(), driver_path),
                  options=options)
예제 #41
0
import os
import time
from helium import *
from selenium.webdriver import ChromeOptions

# Credentials and URLs come from the environment, never from code.
TD_URL = os.getenv('TD_URL')
TD_LOGIN = os.getenv('TD_LOGIN')
TD_PASSWORD = os.getenv('TD_PASSWORD')

AMEX_URL = os.getenv('AMEX_URL')
AMEX_LOGIN = os.getenv('AMEX_LOGIN')
AMEX_PASSWORD = os.getenv('AMEX_PASSWORD')

# Browser options shared by all sessions: maximized window, no
# extensions or info bars, and browser notifications fully blocked
# (both via flag and via the content-settings preference).
options = ChromeOptions()
options.add_argument("--disable-infobars")
options.add_argument("--start-maximized")
options.add_argument("--disable-extensions")
options.add_argument('--disable-notifications')
# https://stackoverflow.com/questions/38684175/how-to-click-allow-on-show-notifications-popup-using-selenium-webdriver
options.add_experimental_option(
    "prefs", {"profile.default_content_setting_values.notifications": 2})

# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument("--disable-notifications")
# driver = webdriver.Chrome(options=chrome_options)
# set_driver(driver)
# get_driver()

#
# TD
#
예제 #42
0
파일: api.py 프로젝트: mrooney/mintapi
def get_web_driver(email, password, headless=False, mfa_method=None,
                   mfa_input_callback=None, wait_for_sync=True,
                   session_path=None, imap_account=None, imap_password=None,
                   imap_server=None, imap_folder="INBOX"):
    """Log in to Mint and return a ready-to-use Chrome WebDriver.

    Downloads a platform-matched chromedriver into the CWD if one is not
    already present, drives the Mint login flow (including an optional MFA
    step), and optionally blocks until Mint's account sync completes.

    Args:
        email: Mint (Intuit) account email.
        password: Mint account password.
        headless: Run Chrome headless; if no MFA method is given this
            defaults ``mfa_method`` to ``"sms"`` with a warning.
        mfa_method: Mint MFA option id (e.g. ``"sms"``, ``"email"``).
        mfa_input_callback: Callable used to prompt for the MFA code;
            falls back to built-in ``input``.
        wait_for_sync: When True, wait up to 5 minutes for
            "Account refresh complete" on the overview page.
        session_path: Chrome user-data-dir so cookies persist across runs.
        imap_account: IMAP login used to fetch the code when
            ``mfa_method == "email"``.
        imap_password: IMAP password.
        imap_server: IMAP server host.
        imap_folder: IMAP folder searched for the MFA mail.

    Returns:
        The logged-in selenium ``Chrome`` driver.

    Raises:
        RuntimeError: If the chromedriver download request fails.
    """
    if headless and mfa_method is None:
        # Fixed: the original two-part literal was missing the separating
        # space and rendered as "...an MFA methodis unlikely...".
        warnings.warn("Using headless mode without specifying an MFA method "
                      "is unlikely to lead to a successful login. Defaulting --mfa-method=sms")
        mfa_method = "sms"

    # Removed dead 'zip_type = ""' initializer that was immediately
    # overwritten below.
    executable_path = os.getcwd() + os.path.sep + 'chromedriver'
    if _platform in ['win32', 'win64']:
        executable_path += '.exe'

    zip_type = CHROME_ZIP_TYPES.get(_platform)

    # Download and unpack chromedriver on first use.
    if not os.path.exists(executable_path):
        zip_file_url = CHROME_DRIVER_BASE_URL % (CHROME_DRIVER_VERSION, zip_type)
        request = requests.get(zip_file_url)

        if request.status_code != 200:
            raise RuntimeError('Error finding chromedriver at %r, status = %d' %
                               (zip_file_url, request.status_code))

        zip_file = zipfile.ZipFile(io.BytesIO(request.content))
        zip_file.extractall()
        os.chmod(executable_path, 0o755)  # make the extracted binary executable

    chrome_options = ChromeOptions()
    if headless:
        chrome_options.add_argument('headless')
        chrome_options.add_argument('no-sandbox')
        chrome_options.add_argument('disable-dev-shm-usage')
        chrome_options.add_argument('disable-gpu')
        # chrome_options.add_argument("--window-size=1920x1080")
    if session_path is not None:
        # Persist cookies/session between runs.
        chrome_options.add_argument("user-data-dir=%s" % session_path)

    driver = Chrome(chrome_options=chrome_options, executable_path="%s" % executable_path)
    driver.get("https://www.mint.com")
    driver.implicitly_wait(20)  # seconds
    try:
        element = driver.find_element_by_link_text("Log In")
    except NoSuchElementException:
        # when user has cookies, a slightly different front page appears
        driver.implicitly_wait(0)  # seconds
        element = driver.find_element_by_link_text("LOG IN")
        driver.implicitly_wait(20)  # seconds
    element.click()
    time.sleep(1)
    email_input = driver.find_element_by_id("ius-userid")
    # It's possible that the user clicked "remember me" at some point, causing
    # the email to already be present. If anything is in the input, clear it
    # and use the provided email, just to be safe.
    email_input.clear()
    email_input.send_keys(email)
    driver.find_element_by_id("ius-password").send_keys(password)
    driver.find_element_by_id("ius-sign-in-submit-btn").submit()

    # Wait until logged in, just in case we need to deal with MFA.
    while not driver.current_url.startswith(
            'https://mint.intuit.com/overview.event'):
        # An implicitly_wait is also necessary here to avoid getting stuck on
        # find_element_by_id while the page is still in transition.
        driver.implicitly_wait(1)
        time.sleep(1)

        # bypass "Let's add your current mobile number" interstitial page
        try:
            skip_for_now = driver.find_element_by_id('ius-verified-user-update-btn-skip')
            skip_for_now.click()
        except (NoSuchElementException, StaleElementReferenceException, ElementNotVisibleException):
            pass

        driver.implicitly_wait(1)  # seconds
        try:
            driver.find_element_by_id('ius-mfa-options-form')
            try:
                mfa_method_option = driver.find_element_by_id('ius-mfa-option-{}'.format(mfa_method))
                mfa_method_option.click()
                mfa_method_submit = driver.find_element_by_id("ius-mfa-options-submit-btn")
                mfa_method_submit.click()

                if mfa_method == 'email' and imap_account:
                    # Pull the one-time code straight from the user's mailbox.
                    mfa_code = get_email_code(imap_account, imap_password, imap_server, imap_folder=imap_folder)
                else:
                    mfa_code = (mfa_input_callback or input)("Please enter your 6-digit MFA code: ")
                mfa_code_input = driver.find_element_by_id("ius-mfa-confirm-code")
                mfa_code_input.send_keys(mfa_code)

                mfa_code_submit = driver.find_element_by_id("ius-mfa-otp-submit-btn")
                mfa_code_submit.click()
            except Exception:  # if anything goes wrong for any reason, give up on MFA
                mfa_method = None
                warnings.warn("Giving up on handling MFA. Please complete "
                              "the MFA process manually in the browser.")
        except NoSuchElementException:
            pass
        finally:
            driver.implicitly_wait(20)  # seconds

    # Wait until the overview page has actually loaded, and if wait_for_sync==True, sync has completed.
    if wait_for_sync:
        try:
            # Status message might not be present straight away. Seems to be due
            # to dynamic content (client side rendering).
            status_message = WebDriverWait(driver, 30).until(
                expected_conditions.visibility_of_element_located(
                    (By.CSS_SELECTOR, ".SummaryView .message")))
            WebDriverWait(driver, 5 * 60).until(
                lambda x: "Account refresh complete" in status_message.get_attribute('innerHTML')
            )
        except (TimeoutException, StaleElementReferenceException):
            warnings.warn("Mint sync apparently incomplete after 5 minutes. Data "
                          "retrieved may not be current.")
    else:
        # Just confirm the transactions element is present before returning.
        driver.find_element_by_id("transaction")

    return driver
예제 #43
0
파일: config.py 프로젝트: rashevskyv/selene
 def set_chrome():
     """Return a Chrome driver whose chromedriver binary is installed
     (and cached) by webdriver-manager, using default options."""
     driver_binary = ChromeDriverManager().install()
     default_options = ChromeOptions()
     return Chrome(executable_path=driver_binary, options=default_options)
    def get_options(self) -> ChromeOptions:
        """Assemble the ChromeOptions for a headless, cookie-persistent session.

        Cookies survive between runs because --user-data-dir points at a
        local ./selenium profile directory.
        """
        flags = (
            "--disable-extensions",
            "--disable-gpu",
            "--dns-prefetch-disable",
            "--enable-automation",
            "--enable-javascript",
            "--no-sandbox",
            "--page-load-strategy=normal",
            "--user-data-dir=" + os.path.abspath("./selenium"),  # enable cookies
            "--profile-directory=Default",
            # "--remote-debugging-port=9222",
            "--headless",
        )
        options = ChromeOptions()
        for flag in flags:
            options.add_argument(flag)
        return options