def __init__(self, environment: Environment, headless: bool): options = Options() self.headless = headless options.headless = self.headless for arg in [ "--disable-translate", "--disable-extensions", "--disable-background-networking", "--safebrowsing-disable-auto-update", "--disable-sync", "--metrics-recording-only", "--disable-default-apps", "--no-first-run", "--disable-setuid-sandbox", "--hide-scrollbars", "--no-sandbox", "--no-zygote", "--autoplay-policy=no-user-gesture-required", "--disable-notifications", "--disable-logging", "--disable-permissions-api", ]: options.add_argument(arg) # hide infobar about automation options.add_experimental_option("excludeSwitches", ["enable-automation"]) options.add_experimental_option("useAutomationExtension", False) # workaround for the first page being way to slow to load # ~2 minutes for my case (caused by some useless element being slow?) options.page_load_strategy = "eager" super().__init__(options=options) self.environment = environment self.start_time = None time.sleep(1) self.command_executor._commands["SEND_COMMAND"] = ("POST", "/session/$sessionId/chromium/send_command")
def __init__(self, username, password): self.username = username self.password = password self.base_url = 'https://www.instagram.com' # the options from this website -> https://www.selenium.dev/documentation/en/webdriver/page_loading_strategy/ options = Options() options.page_load_strategy = 'eager' self.driver = webdriver.Chrome('chromedriver.exe', options=options) self.database = db.Database()
def __init__(self): self.review_dataset = [] self.index_file = ReviewCrawler.index_file options = Options() options.page_load_strategy = 'eager' self.driver = webdriver.Chrome( options=options, executable_path="/Users/ssamko/Downloads/chromedriver") print(self.index_file)
def __init__(self): self.config = self.getConfig() self.checkout_url = 'https://www.bestbuy.com/checkout/r/fast-track' options = Options() options.page_load_strategy = 'normal' self.driver = webdriver.Chrome('chromedriver.exe', options=options) self.main()
def get_browser(): if BROWSER_TYPE.lower().find("chrome") >= 0: options = Options() options.page_load_strategy = 'eager' browser = Chrome(options=options) elif BROWSER_TYPE.lower().find("firefox") >= 0: browser = Firefox() else: raise Exception(f"I'm sorry {BROWSER_TYPE} browser is not supported") return browser
def get_stats(player): names = player[0] url = player[1] options = Options() options.page_load_strategy = 'none' try: driver = drivers[threading.current_thread().name] except KeyError: drivers[threading.current_thread().name] = webdriver.Chrome( executable_path=DRIVER_PATH, options=options) driver = drivers[threading.current_thread().name] driver.get(url) table = driver.find_element_by_xpath('//*[@id="totals"]/tbody') try: all_star = driver.find_element_by_xpath('//*[@id="all_star"]/tbody') years = [ year.text for year in all_star.find_elements_by_xpath('./tr/th') ] except: years = [] season_totals = [] #Get season totals by row for row in table.find_elements_by_xpath('./tr'): season = [td.text for td in row.find_elements_by_xpath("./td")] season.insert(0, row.find_element_by_xpath('./th').text) if len(season) == 32: season.pop(30) elif len(season) == 30: season.append('0') if season[0] in years: season.append('Yes') else: season.append('No') season.insert(0, names) for i in range(len(season)): if season[i] == '': season[i] = '0' season_totals.append(season) #Check if all star #Write to file with open('players_total.csv', 'a', newline='', encoding='utf-8') as file: write = csv.writer(file) write.writerows(season_totals)
def stock_crawler(url): driver_path = mac_chromedriver_path options = Options() options.page_load_strategy = 'normal' options.add_argument('--headless') options.add_argument('window-size=1920x1080') options.add_argument("lang=ko_KR") options.add_argument( "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/81.0.4044.122 Safari/537.36") options.add_argument('--log-level=3') options.add_argument('--disable-loggin') options.add_argument(mac_user_chrome_data_path) driver = webdriver.Chrome(executable_path=driver_path, options=options) driver.get(url) driver.execute_script( "Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5]}})" ) driver.execute_script( "Object.defineProperty(navigator, 'languages', {get: function() {return ['ko-KR', 'ko']}})" ) driver.execute_script( "const getParameter = WebGLRenderingContext.getParameter;WebGLRenderingContext.prototype.getParameter = function(parameter) {if (parameter === 37445) {return 'NVIDIA Corporation'} if (parameter === 37446) {return 'NVIDIA GeForce GTX 980 Ti OpenGL Engine';}return getParameter(parameter);};" ) time.sleep(5) if url is ssg: el_stock = driver.find_elements_by_css_selector('#oriCart') print(el_stock) el_stock = el_stock[0].get_attribute('outerHTML') print(el_stock) if 'soldout' in el_stock: # sendChannelMsg(f'현재 ssg에 닌텐도 스위치 동물의 숲 에디션 재고가 없습니다.\n다시 확인하고 알려드릴게요!\n{url}') pass else: print('ssg 재고 있음') elif url is coupang: el_stock = driver.find_elements_by_css_selector( "#contents > div.prod-atf > div > div.prod-buy.sold-out.new-oos-style.not-loyalty-member.eligible-address.without-subscribe-buy-type.DISPLAY_0.only-one-delivery > div.prod-price-container" ) el_stock = el_stock[0].get_attribute('outerHTML') if '일시품절' in el_stock: # sendChannelMsg(f'현재 coupang에 닌텐도 스위치 동물의 숲 에디션 재고가 없습니다.\n다시 확인하고 알려드릴게요!\n{url}') pass else: print('coupang 재고 있음') driver.quit()
def get_webdriver_options(output_dir): options = Options() options.page_load_strategy = 'normal' options.add_argument("--disable-notifications") options.add_argument("--disable-popup-blocking") prefs = { "profile.default_content_settings.popups": 0, 'download.directory_upgrade': True, "download.default_directory": os.path.join(os.getcwd(), output_dir) } options.add_experimental_option("prefs", prefs) return options
def getWebDriver(args): """ Get the appopiate driver for chosen browser """ driver = None if args.chrome: options = ChromeOptions() options.page_load_strategy = 'eager' # Default profile directory userDataDir = os.getenv( 'LOCALAPPDATA') + "\\Google\\Chrome\\User Data" if args.profile == None else args.profile options.add_argument("user-data-dir=" + userDataDir) driver = Chrome(options=options) elif args.firefox: options = FirefoxOptions() options.page_load_strategy = 'eager' # Default profile directory profiles = os.listdir(os.getenv('APPDATA') + "\\Mozilla\\Firefox\\Profiles\\") # "xxxxxxxx.default-release" is the default profile for Release versions v67+ default_profile = next( profile for profile in profiles if profile[-15:] == "default-release") userDataDir = os.getenv('APPDATA')+"\\Mozilla\\Firefox\\Profiles\\" + \ default_profile if args.profile == None else args.profile fp = FirefoxProfile(userDataDir) driver = Firefox(fp, options=options) elif args.edge: options = EdgeOptions() options.page_load_strategy = 'eager' options.use_chromium = True # Default profile directory userDataDir = os.getenv( 'LOCALAPPDATA') + "\\Microsoft\\Edge\\User Data" if args.profile == None else args.profile options.add_argument("user-data-dir=" + userDataDir) driver = Edge(options=options) return driver
def __init__(self, id, database_creds, ng): self.cnxn = pyodbc.connect(database_creds.get_connectioN_string(), autocommit=True) self.ng = ng pid = os.getpid() cursor = self.cnxn.cursor() self.save_cursor = self.cnxn.cursor() option = Options() option.add_argument("--disable-infobars") option.add_argument("--disable-gpu") option.add_argument("--start-maximized") # option.add_argument("--headless") # option.add_argument("--window-size=1024,768") option.add_argument("--disable-extensions") option.add_argument("--disable-translate") option.add_argument("--allow-file-access-from-files") option.add_argument("--disable-dev-shm-usage") option.page_load_strategy = 'eager' # option.add_argument("--enable-usermedia-screen-capturing") # option.add_argument("--use-fake-ui-for-media-stream") # option.add_argument("--use-fake-device-for-media-stream") # option.add_argument("--use-fake-ui-device-for-media-stream") # option.add_argument("--use-file-for-fake-video-capture=C:\\temp\\bunnyvideo.mjpeg") # option.add_argument("--use-file-for-fake-audio-capture=C:\\temp\\bunny.opus") option.add_argument("--enable-tracing") # option.add_argument("--enable-tracing-output = c:\\temp\\log.txt") # Pass the argument 1 to allow and 2 to block # option.add_experimental_option("prefs", { # "profile.default_content_setting_values.notifications": 2 # }) option.set_capability('unhandledPromptBehavior', 'accept') secret = Secret() self.driver = webdriver.Remote(command_executor=secret.URL, desired_capabilities={ "browserName": "chrome", }, options=option) self.step = Test() self.session = session.Session() self.master_id = id self.run()
def conectarWeb(): global driver options = Options() options.page_load_strategy = 'eager' driver = webdriver.Chrome('/usr/bin/chromedriver', options=options) driver.maximize_window() time.sleep(1) driver.get(link) ###quitar cookie WebDriverWait(driver, 5)\ .until(EC.element_to_be_clickable((By.XPATH, '/html/body/div/p[2]/a')))\ .click() return driver
def get_default_driver_options(width=1472, height=828, headless=True) -> Options: """ Generate default Chrome driver options :param width: int :param height: int :param headless: bool :return: Options """ chrome_options = Options() chrome_options.headless = headless chrome_options.page_load_strategy = 'normal' chrome_options.add_argument('--enable-automation'), chrome_options.add_argument('--start-maximized'), chrome_options.add_argument(f'--window-size={width},{height}'), chrome_options.add_argument('--lang=en-GB'), chrome_options.add_argument('--no-sandbox'), chrome_options.add_argument('--disable-setuid-sandbox'), chrome_options.add_argument('--disable-gpu'), chrome_options.add_argument('--disable-dev-shm-usage'), chrome_options.add_argument('--no-sandbox'), chrome_options.add_argument('--disable-setuid-sandbox'), chrome_options.add_argument('--disable-dev-shm-usage'), chrome_options.add_argument("-proxy-server='direct://"), chrome_options.add_argument('--proxy-bypass-list=*'), chrome_options.add_argument('--disable-accelerated-2d-canvas'), chrome_options.add_argument('--disable-gpu'), chrome_options.add_argument('--allow-running-insecure-content'), chrome_options.add_argument('--disable-web-security'), chrome_options.add_argument('--disable-client-side-phishing-detection'), chrome_options.add_argument('--disable-notifications'), chrome_options.add_argument('--mute-audio'), # Disable downloads chrome_options.add_experimental_option( 'prefs', { 'safebrowsing.enabled': 'false', 'download.prompt_for_download': False, 'download.default_directory': '/dev/null', 'download_restrictions': 3, 'profile.default_content_setting_values.notifications': 2, }) return chrome_options
def create_driver_instance(): user_agent = random.choice(user_agent_list) chrome_options = Options() chrome_options.add_argument('headless') chrome_options.add_argument("window-size=1024,768") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-gpu") chrome_options.add_argument("--disable-plugins") chrome_options.add_argument("--disable-images") chrome_options.add_argument('blink-settings=imagesEnabled=false') chrome_options.add_argument(f'user-agent={user_agent}') chrome_options.add_argument('log-level=3') chrome_options.page_load_strategy = 'EAGER' driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options) # Chrome driver.set_page_load_timeout(5000) driver.set_script_timeout(5000) return driver
def wikiscrap(): options = Options() options.page_load_strategy = 'none' driver = webdriver.Chrome(executable_path='chromedriver', options=options) driver.get('https://en.wikipedia.org/wiki/List_of_Indian_dishes') # Finding all the names of foods from a wikipedia page foodname = driver.find_elements_by_xpath("//tbody/tr/td[position()=1]") # Rejecting empty cells foodlist = [make_tags(x.text) for x in foodname if x.text != ''] print("No of names scrapped =", len(foodlist)) driver.quit() food = pd.DataFrame(foodlist, columns=['food_name']) food.to_csv('food.csv')
def html_parse(value): options = Options() options.page_load_strategy = 'eager' boo = True driver = webdriver.Chrome(ChromeDriverManager().install(), options=options) driver.get('https://soundcloud.com/' + value + '/tracks') soup = BeautifulSoup(driver.page_source, 'html.parser') if soup.find('title').text == "Something went wrong on SoundCloud": boo = False soup = None while (boo): soup = BeautifulSoup(driver.page_source, 'html.parser') if soup.find('div', class_='paging-eof') is None: driver.execute_script( "window.scrollTo(0,document.body.scrollHeight)") time.sleep(0.5) else: boo = False return soup
def get_driver(self): if self.use_custom_urls: self.url = self.custom_urls if self.settings.use_chrome: options = Options() #options.binary_location(executable_path=self.settings.custom_chrome_exe_path) options.add_argument('--disable-dev-shm-usage') #options.binary_location = self.settings.custom_chrome_exe_path options.page_load_strategy = "eager" options.add_experimental_option("excludeSwitches", ["enable-automation"]) options.add_experimental_option("useAutomationExtension", False) #options.add_argument('--no-proxy-server') #options.add_argument("--proxy-server='direct://'") #options.add_argument("--proxy-bypass-list=*") #options.add_argument('--blink-settings=imagesEnabled=false') #options.add_argument("--no-sandbox"); if not self.load_images: prefs = {"profile.managed_default_content_settings.images": 2} options.add_experimental_option("prefs", prefs) if self.headless_mode: options.add_argument('--headless') options.add_argument('--no-proxy-server') if self.settings.use_chrome_profile: options.add_argument("--user-data-dir={}".format(self.settings.chrome_profile_path)) # .profile-bb options.add_argument('--profile-directory=Default') if self.settings.custom_chrome_exe_path != "" or None: #driver = webdriver.Chrome(self.settings.custom_chrome_exe_path, options=options) driver = webdriver.Chrome(ChromeDriverManager().install(), options=options) else: driver = webdriver.Chrome(options=options) else: fireFoxOptions = webdriver.FirefoxOptions() firefox_profile = webdriver.FirefoxProfile(self.settings.custom_firefox_exe_path) if self.headless_mode: fireFoxOptions.set_headless() elif not self.load_images: firefox_profile.set_preference('permissions.default.image', 2) firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false') driver = webdriver.Firefox(firefox_profile=firefox_profile, firefox_options=fireFoxOptions) return driver
def make_options(self, headless=True, proxing=False): '''Конфигурирование браузера''' dict_options = {} options = Options() if headless: options.add_argument("--headless") if self.binary_location: options.binary_location = (self.binary_location) options.page_load_strategy = 'eager' options.add_argument('--no-sandbox') options.add_argument('--disable-gpu') options.add_argument('user-agent={}'.format(load_ua())) dict_options['options'] = options if proxing: prox = Proxy() prox.proxy_type = ProxyType.MANUAL prox.http_proxy = 'socks5://127.0.0.1:9050' prox.ssl_proxy = 'socks5://127.0.0.1:9050' capabilities = webdriver.DesiredCapabilities.CHROME prox.add_to_capabilities(capabilities) dict_options['desired_capabilities'] = capabilities return dict_options
def __init__(self, environment: Environment, headless: bool): chrome_options = Options() self.headless = headless chrome_options.headless = self.headless # workaround for the first page being way to slow to load # ~2 minutes for my case (caused by some useless element being slow?) chrome_options.page_load_strategy = "eager" super().__init__(options=chrome_options) self.environment = environment self.start_time = None self.command_executor._commands["SEND_COMMAND"] = ( "POST", "/session/$sessionId/chromium/send_command") self.execute( "SEND_COMMAND", dict( cmd="Network.emulateNetworkConditions", params={ "offline": False, "latency": 100, "downloadThroughput": 50000, "uploadThroughput": 50000 }, ), )
def main(): crawler = ReviewCrawler() while True: try: crawler.start_idx = crawler.get_start_idx() crawler.crawl_from(crawler.start_idx) print('finish?') break except Exception as e: print(e) print('timeout error: stop') time.sleep(2) crawler.driver.quit() options = Options() options.page_load_strategy = 'eager' crawler.driver = webdriver.Chrome( options=options, executable_path="/Users/ssamko/Downloads/chromedriver") crawler.driver.set_window_position(-10000, 0) crawler.start_idx = crawler.get_start_idx() if crawler.start_idx == -1: print('finish well') break continue
BEST_BUY_ADD_TO_CART_API_URL = "https://www.bestbuy.com/cart/api/v1/addToCart" BEST_BUY_CHECKOUT_URL = "https://www.bestbuy.com/checkout/c/orders/{order_id}/" WEBHOOK_URL = "https://discordapp.com/api/webhooks/778423506294931478/rFcPl55WCL_y-ucIpEBekQ08xoXJhVw5GjqkoXZLPvP3PwRbTCc9y7mhdGHH6dmS-IxO" DEFAULT_HEADERS = { "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept-encoding": "gzip, deflate, br", "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", "user-agent": settings.userAgent, "origin": "https://www.bestbuy.com", } options = Options() options.page_load_strategy = "eager" options.add_experimental_option("excludeSwitches", ["enable-automation"]) options.add_experimental_option("useAutomationExtension", False) prefs = {"profile.managed_default_content_settings.images": 2} options.add_experimental_option("prefs", prefs) options.add_argument("user-data-dir=.profile-bb") class BestBuy: def __init__(self, task_id, status_signal, image_signal, product, profile, proxy, monitor_delay, error_delay): self.task_id, self.status_signal, self.image_signal, self.product, self.profile, self.monitor_delay, self.error_delay = task_id, status_signal, image_signal, product, profile, float( monitor_delay), float(error_delay) self.sku_id = parse.parse_qs(parse.urlparse( self.product).query)['skuId'][0] self.session = requests.Session()
os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) import config conf = config.Config() BASE_DIR = os.path.dirname(os.path.realpath(__file__)) mongo = MongoClient(f"mongodb://{conf.MONGO_REMOTE_IP}:27017") db = mongo['aircode'] review_col = db['nsmall_reviews'] prod_col = db['nsmall'] today = datetime.date.today() prod_list = list(prod_col.find({'reg_date': str(today)})) print(len(prod_list)) review_dataset = [] options = Options() options.page_load_strategy = 'eager' driver = webdriver.Chrome( options=options, executable_path="/Users/ssamko/Downloads/chromedriver") if not os.path.exists(os.path.join(BASE_DIR, f'daily')): os.mkdir(os.path.join(BASE_DIR, f'daily')) daily_index_dir = os.path.join(BASE_DIR, f'daily/{today}') if not os.path.exists(daily_index_dir): os.mkdir(daily_index_dir) def runtimer(func): @functools.wraps(func) def wrapper(*args, **kwargs):
import fitz from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options import time import numpy as np url = 'https://www.ey.com/en_gl/tax-alerts' # chrome webdriver options options = Options() options.page_load_strategy = 'normal' options.add_argument("--start-maximized") options.add_argument('--disable-extensions') #%% # I will get the full list of countries from EY webpage driver = webdriver.Chrome(options=options) driver.get(url) time.sleep(2) # clicking in "I decline optional cookies" try: WebDriverWait(driver, 10)\ .until(EC.presence_of_element_located((By.XPATH, '//*[@id="cookiePolicy"]/div/div[2]/button[1]')))\
def main(): print('(webdrv) main:') print() urlbase = 'https://kroger-gcm.semi.cashstar.com' submalls = ('', 'bakersplus', 'city-market', 'dillons', 'food-4-less', 'foods-co', 'fred-meyer', 'frys-food', 'gerbes', 'jay-c-foods', 'king-soopers', 'marianos', 'metro-market', 'payless', 'pick-n-save', 'qfc', 'ralphs', 'smiths', 'kroger') # todo: any user input # # service = Service('/usr/local/bin/chromedriver') # service.start() options = Options() options.page_load_strategy = 'eager' driver = webdriver.Chrome(options=options) # driver = webdriver.Remote(service.service_url) # submall = None # driver.get("http://www.google.com") # # submall_validation_data = submall_validation(driver, submall) # time.sleep(5) submall = "" driver.get(urlbase) submall_validation_data = submall_validation(driver, submall) time.sleep(5) for submall in submalls: url = urlbase + f'/{submall}' driver.get(url) submall_validation_data = submall_validation(driver, submall) time.sleep(3) # submall = "bakersplus" # driver.get("https://kroger-gcm.semi.cashstar.com/bakersplus") # submall_validation_data = submall_validation(driver, submall) # time.sleep(5) # # submall = "city-market" # driver.get("https://kroger-gcm.semi.cashstar.com/city-market") # submall_validation_data = submall_validation(driver, submall) # time.sleep(5) # # submall = "dillons" # driver.get("https://kroger-gcm.semi.cashstar.com/dillons") # submall_validation_data = submall_validation(driver, submall) # time.sleep(5) # # submall = "kroger" # driver.get("https://kroger-gcm.semi.cashstar.com/kroger") # submall_validation_data = submall_validation(driver, submall) # time.sleep(5) driver.quit() print() print('(webdrv) end::') return 0
def load_chromedriver(self, path): chrome_options = Options() chrome_options.page_load_strategy = 'eager' return webdriver.Chrome(path, options=chrome_options)
def main(): options = Options() options.add_experimental_option("detach", True) # 不加载图片 prefs = {"profile.managed_default_content_settings.images": 2} options.add_experimental_option("prefs", prefs) # Page loading strategy options.page_load_strategy = 'none' """ Page loading strategy: normal - This will make Selenium WebDriver to wait for the entire page is loaded. When set to normal, Selenium WebDriver waits until the load event fire is returned. By default normal is set to browser if none is provided. eager - This will make Selenium WebDriver to wait until the initial HTML document has been completely loaded and parsed, and discards loading of stylesheets, images and subframes. none - When set to none Selenium WebDriver only waits until the initial page is downloaded. """ #设置成用户自己的数据目录: "chrome://version/" # (原用户参数为复制以为"Default",另复制一份为"DefaultCopy"程序才不会报错) # options.add_argument('--user-data-dir=C:\\Users\\Caviar\\AppData\\Local\\Temp\\scoped_dir3768_856195622\\DefaultCopy') # options.binary_location指定360极速浏览器路径 options.binary_location = 'C:\\Users\\Caviar\\AppData\\Local\\360Chrome\\Chrome\\Application\\360chrome.exe' # 创建Chrome驱动实例,executable_path指定chromedriver路径 driver = webdriver.Chrome( options=options, executable_path= 'C:\\Users\\Caviar\\AppData\\Local\\360Chrome\\Chrome\\Application\\chromedriver.exe' ) # 最小化浏览器 #driver.minimize_window() for url_season in l_season: dict_season = {} josn_fn = url_season.split('/')[-2] print([josn_fn]) #driver ---- class WebDriver(selenium.webdriver.remote.webdriver.WebDriver) driver.get(url_season) # 经过浏览器渲染以及标签清洗过后的HTML页面 html_season_ = driver.page_source #''' with open('经过浏览器渲染以及标签清洗过后的HTML页面.html', 'w', encoding='utf-8') as f: f.write(html_season_) #''' html_season = etree.HTML(html_season_) # 每集描述 for a in html_season.xpath( '//*[@id="default-page"]/div/div/div/div/figure/ul/li/figure/a' ): #print(a) #print([etree.tounicode(a)]) #print() try: figcaption = a.xpath('./following-sibling::*[1]')[0] img_episode = a.xpath('./img/@src')[0] url_episode = a.xpath('./@href')[0] #print(etree.tounicode(figcaption)) strong_eng = figcaption.xpath('./div/a[1]/div/strong')[0] strong_chn = figcaption.xpath('./div/a[1]/strong')[0] discription_eng = strong_eng.text discription_chn = strong_chn.text # 进入二级网页 # 如果二级页面使用requests从服务器直接获取的HTML源数据标签规范,那么可以直接使用requests访问,抓取效率更高 driver.get(url_episode) #driver.set_page_load_timeout(time_to_wait=10) driver.implicitly_wait(time_to_wait=0) html_episode_ = driver.page_source html_episode = etree.HTML(html_episode_) a_mp4 = html_episode.xpath( '//*[@id="default-page"]/div/div/div/div/div[1]/div/div/section[2]/div/div/div[1]/div/div/div/div/div/a' )[0] mp4_url = a_mp4.xpath('./@href')[0] print(mp4_url) d = {mp4_url: [discription_eng, discription_chn, img_episode]} dict_season.update(d) print(d) except: #### 将python_data写入data.json文件 with open(f'{josn_fn}【无】.txt', 'w', encoding='utf-8') as f: f.write('') break if dict_season != {}: #### 将python_data写入data.json文件 with open(f'{josn_fn}.json', 'w', encoding='utf-8') as f: json.dump(dict_season, f, sort_keys=True, indent=4, ensure_ascii=False) else: pass
def search_site(website, product="mens slim jeans"): ''' opens website with selenium's find elem ''' url = str(website) searchfor = str(product) # getting the iphone content version (hopefully less content) headers = { 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1' } options = Options() options.page_load_strategy = 'eager' # waits only til html is loaded and parsed, no stylesheets, images etc username = os.path.expanduser('~') # only on mac, if windows -> change the path to C:\\ download_path = username + '/Downloads/chromedriver' browser = webdriver.Chrome(download_path, options=options) browser.get(url) # browser.set_page_load_timeout(10) product_price = {} # this is super inefficient and annoying to copypasta if you know of a better way Let me know! # If there is a search box on the site - I am assuming it is the first input element or form element.. not perfect but makes sense, right? # attempt at every possible way a website may have named its search box !! # try: # browser.find_element_by_name('search') # searchbox = browser.find_element_by_name('search') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # except selenium.common.exceptions.NoSuchElementException: # pass # elif browser.find_element_by_name('search-input'): # searchbox = browser.find_element_by_name('search-input') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('searchbox'): # searchbox = browser.find_element_by_name('searchbox') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('search-box'): # searchbox = browser.find_element_by_name('search-box') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('searchbar'): # searchbox = browser.find_element_by_name('searchbox') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('search-bar'): # searchbox = browser.find_element_by_name('search-bar') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('input'): # searchbox = browser.find_element_by_name('input') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('keyword'): # searchbox = browser.find_element_by_name('keyword') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('key-word'): # searchbox = browser.find_element_by_name('key-word') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('key-words'): # searchbox = browser.find_element_by_name('key-words') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('keywords'): # searchbox = browser.find_element_by_name('keywords') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('keywordSearch'): # searchbox = browser.find_element_by_name('keywordSearch') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('globalSearchInputField'): # searchbox = browser.find_element_by_name('globalSearchInputField') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('globalSearch'): # searchbox = browser.find_element_by_name('globalSearchInput') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('globalSearchInput'): # searchbox = browser.find_element_by_name('globalSearchInput') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_name('globalSearchField'): # searchbox = browser.find_element_by_name('globalSearchField') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_id('search'): # searchbox = browser.find_element_by_id('search') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_id('search-input'): # searchbox = browser.find_element_by_id('search-input') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_id('searchbox'): # searchbox = browser.find_element_by_id('searchbox') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_id('searchbar'): # searchbox = browser.find_element_by_id('searchbar') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_id('input'): # searchbox = browser.find_element_by_id('input') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_id('globalSearchInputField'): # searchbox = browser.find_element_by_id('globalSearchInputField') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_id('globalSearch'): # searchbox = browser.find_element_by_id('globalSearch') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_id('globalSearchInput'): # searchbox = browser.find_element_by_id('globalSearchInput') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_id('globalSearchField'): # searchbox = browser.find_element_by_id('globalSearchField') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_tag_name('input'): # searchbox = browser.find_element_by_tag('input') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_tag_name('form'): # searchform = browser.find_element_by_tag_name('form') # searchbox = searchform.find_element_by_tag_name('input') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_class_name('globalSearchInputField'): # searchbox = browser.find_element_by_class_name('globalSearchInputField') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_class_name('keyword'): # searchbox = browser.find_element_by_class_name('keyword') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_class_name('keywords'): # searchbox = browser.find_element_by_class_name('keywords') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_class_name('search'): # searchbox = browser.find_element_by_class_name('search') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_class_name('searchbox'): # searchbox = browser.find_element_by_class_name('searchbox') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_class_name('searchbar'): # searchbox = browser.find_element_by_class_name('searchbar') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_class_name('input'): # searchbox = browser.find_element_by_class_name('input') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_css_selector('p.search'): # searchbox = browser.find_element_by_css_selector('p.search') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_css_selector('p.searchbox'): # searchbox = browser.find_element_by_css_selector('p.searchbox') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_css_selector('p.searchbar'): # searchbox = browser.find_element_by_css_selector('p.searchbar') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_css_selector('p.input'): # searchbox = browser.find_element_by_css_selector('p.input') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_css_selector('p.search-input'): # searchbox = browser.find_element_by_css_selector('p.search-input') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # elif browser.find_element_by_css_selector('p.keywords'): # searchbox = browser.find_element_by_css_selector('p.keyword') # searchbox.send_keys(searchfor) # searchbox.send_keys(Keys.ENTER) # FUCKIT ignores errors and moves on with the code. # try else would have taken me too long but are effectively the same as fuckit module with fuckit: # TIMING TO AVOID BOT DENIAL ?? time.sleep(2) browser.find_element_by_name('search') searchbox = browser.find_element_by_name('search') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) # options2 = webdriver.ChromeOptions() # options2.add_argument('--ignore-certificate-errors') # options2.add_argument("--test-type") # options2.binary_location = "/usr/bin/chromium" # browser.save_screenshot('screenshot.png', chrome_options = options2) # do i need to reassign the new url? guess not print('current url', browser.current_url) # tag = browser.find_elements_by_tag_name('p') # tag2 = browser.find_elements_by_tag_name('a') # tag3 = browser.find_elements_by_class_name('prod_price') # tag4 = browser.find_elements_by_tag_name('span') # browser.find_elements_by_class_name('productPrice') # browser.find_elements_by_class_name('product-Price') # browser.find_elements_by_class_name('productprice') # browser.find_elements_by_class_name('Price') # price = browser.find_elements_by_class_name('price') # price2 = browser.find_elements_by_id('price') # browser.find_elements_by_class_name('prod_price') # browser.find_elements_by_class_name('prod_Price') # for item in price: # print(item.text) # for item in tag4: # print(item.text) # SOUP scrape page_response = requests.get(browser.current_url, headers=headers) page_content = bs(page_response.content, "html.parser") print(page_content.prettify) ul_lists = page_content.find_all('div') # print(ul_lists) # REGEX TRIAL # pattern = re.compile(r'[0-9][0-9][1-9]\.[0-9][0-9]?') # match = pattern.findall(str(page_content)) # i=0 # for m in match: # if i < 10: # print(m) # i += 1 # floats = re.findall("\d+\.\d+", str(page_content)) # for num in floats: # print(num) # print(num.text) # If-statement after search() tests if it succeeded # if match: # print('found', match.group()) ## 'found word:cat' # else: # print('did not find') # browser.find_elements_by_name('price') # browser.find_elements_by_name('product') # browser.find_elements_by_name('product-price') # browser.find_elements_by_name('productprice') # browser.find_elements_by_class_name('price') # browser.find_elements_by_class_name('product') # browser.find_elements_by_class_name('product-price') # browser.find_elements_by_class_name('productprice') # browser.find_elements_by_id('price') # browser.find_elements_by_id('product') # browser.find_elements_by_id('product-price') # browser.find_elements_by_id('productprice') browser.quit() with fuckit: searchbox = browser.find_element_by_name('search-input') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('searchbox') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('search-box') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('searchbox') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('search-bar') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('input') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('keyword') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('key-word') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('key-words') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('keywords') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('keywordSearch') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('globalSearchInputField') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('globalSearchInput') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('globalSearchInput') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_name('globalSearchField') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_id('search') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_id('search-input') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_id('searchbox') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_id('searchbar') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_id('input') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_id('globalSearchInputField') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_id('globalSearch') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_id('globalSearchInput') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_id('globalSearchField') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_tag('input') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchform = browser.find_element_by_tag_name('form') searchbox = searchform.find_element_by_tag_name('input') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_class_name( 'globalSearchInputField') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_class_name('keyword') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_class_name('keywords') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_class_name('search') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_class_name('searchbox') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_class_name('searchbar') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) with fuckit: searchbox = browser.find_element_by_class_name('input') searchbox.send_keys(searchfor) searchbox.send_keys(Keys.ENTER) print(browser.current_url) browser.quit() return 'website'
"httpProxy": PROXY, "ftpProxy": PROXY, "sslProxy": PROXY, "proxyType": "MANUAL", } with webdriver.Firefox() as driver: # Open URL driver.get("https://selenium.dev") from selenium import webdriver from selenium.webdriver.chrome.options import Options options = Options() options.page_load_strategy = 'normal' driver = webdriver.Chrome(options=options) # Navigate to url driver.get("http://www.google.com") driver.quit() from selenium import webdriver from selenium.webdriver.chrome.options import Options options = Options() options.page_load_strategy = 'eager' driver = webdriver.Chrome(options=options) # Navigate to url driver.get("http://www.google.com") driver.quit()
def main(start_i=0): conf = config.Config() options = Options() options.page_load_strategy = 'eager' driver = webdriver.Chrome( options=options, executable_path="/Users/ssamko/Downloads/chromedriver") # print(conf.MONGO_REMOTE_IP) # mongo = MongoClient(f"mongodb://localhost:27017") mongo = MongoClient(f"mongodb://{conf.MONGO_REMOTE_IP}:27017") db = mongo['aircode'] col = db['hmall_prod'] today = datetime.date.today() with open(f'crawler/hmall/daily/{today}.txt', 'r') as f: urls = f.readlines() for url in urls[start_i:]: # print(url) # data = requests.get(url) # soup = BeautifulSoup(data.text, 'html.parser') driver.get(url) WebDriverWait(driver, timeout=5).until( EC.presence_of_element_located( (By.CSS_SELECTOR, 'div.pdtCode'))) # with open('crawler/hmall/page_sample.txt','w') as sample: # sample.write(data.text) try: # prod_id = soup.select_one('div.pdtCode > span:nth-child(1)').text.split(":")[-1].strip() prod_id = driver.find_element_by_css_selector( 'div.pdtCode > span:nth-child(1)').text.split( ":")[-1].strip() prod_id = int(prod_id) print(prod_id) # prod_name = soup.select_one('h3.pdtTitle').text.strip() prod_name = driver.find_element_by_css_selector( 'h3.pdtTitle').text.strip() # print(prod_name) # price = soup.select_one('p.finalPrice.number.hasDC > strong').text.strip() price = driver.find_element_by_css_selector( 'p.finalPrice.number.hasDC > strong').text.strip() # print(price) except Exception as e: print(e) continue try: # score = soup.select_one('em.scoreMount').text.strip() score = driver.find_element_by_css_selector( 'em.scoreMount').text.strip() # print(score) # score_persons = soup.select_one('p.scoreNum').text[1:].split()[0] score_persons = driver.find_element_by_css_selector( 'p.scoreNum').text[1:].split()[0] # print(score_persons) except Exception as e: print(e) print('No score exists') score = None score_persons = 0 # img_url = soup.select_one('#prd_ipzoom > img')['src'] #prd_ipzoom > div._frm_magnifier > div > img img_url = driver.find_element_by_css_selector( '#prd_ipzoom > div._frm_magnifier > div > img').get_attribute( 'src') # prd_ipzoom > div._frm_magnifier > div > img # print(img_url) # 3 types of img path # t1 = soup.select('#guidance > table > tbody > tr > td > p') t1 = driver.find_elements_by_css_selector( '#guidance > table > tbody > tr > td > p') # t2 = soup.select('#deal_unit_d1 > dt > p') t2 = driver.find_elements_by_css_selector('#deal_unit_d1 > dt > p') #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(1) # t3 = soup.select('#section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p') t3 = driver.find_elements_by_css_selector( '#section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p' ) if t1: # print("t1 >>",t1) detail_imgs_p = t1 elif t2: # print("t2 >>",t2) detail_imgs_p = t2 elif t3: # print("t3 >>",t3) detail_imgs_p = t3 else: print(t1, t2, t3) print('weird type error') continue detail_urls = [] for p in detail_imgs_p: detail_img = p.find_elements_by_css_selector('img') if not detail_img: # print('not img p') continue # print('detail >>', detail_img['src']) # detail_img_url = detail_img['src'] # print(type(detail_img_url)) while detail_img: detail_urls.append(detail_img.pop().get_attribute('src')) # print(detail_urls) if not detail_urls: print('no detail img') # print(detail_urls[0][0][0][0][0]) #['src'] #guidance > table > tbody > tr > td > p:nth-child(1) > img #guidance > table > tbody > tr > td > p:nth-child(2) > img #guidance > table > tbody > tr > td > p:nth-child(2) > img #guidance > table > tbody > tr > td > p:nth-child(6) > img #guidance > table > tbody > tr > td > p:nth-child(2) > img #guidance > table > tbody > tr > td > p:nth-child(4) > img #guidance > table > tbody > tr > td > p:nth-child(2) > img #deal_unit_d1 > dt > p:nth-child(5) > img #deal_unit_d1 > dt > p:nth-child(3) > img #deal_unit_d1 > dt > p:nth-child(2) > img #deal_unit_d1 > dt > p:nth-child(7) > img #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(5) > img #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(5) > img #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(3) > img #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(2) > img #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(1) > img #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(3) > img db_data = { 'prod_id': prod_id, 'prod_name': prod_name, 'price': price, 'score': score, 'score_persons': score_persons, 'img_url': img_url, 'detail_img_url': detail_urls, 'reg_date': str(today) } # print(db_data) col.insert_one(db_data)
"proxyType": "MANUAL", } with webdriver.Chrome() as driver: driver.get("xxx.com") # 页面加载策略 """ document.readyState 属性描述当前页面的加载状态. 默认情况下, 在页面就绪状态是 complete 之前, WebDriver都将延迟 driver.get() 的响应或 driver.navigate().to() 的调用 """ # normal默认加载策略 # WebDriver等待整个页面的加载,设置为normal时,WebDriver保持等待直到返回load事件 options = Options() options.page_load_strategy = "normal" browser = webdriver.Chrome(options=options) browser.get("xxx.com") browser.quit() # eager加载策略 # WebDriver保持等待并直到完全加载并解析了html文件,忽略css样式表、图片和subframes的加载 # 设置为eager时,保持等待直到返回DOMContentLoaded事件 options.page_load_strategy = "eager" # none加载策略 # WebDriver仅等待至初始页面下载完成 # find element用于查找网页元素 # 从网页元素q中查找searchbox元素 search_box = browser.find_element_by_name("q")
def jsHtmlLoader(self, url, scroll=False, scroll_num=3, click=False, click_num=3, click_path='', turn=False, waitTime=1): '''Use selenium to get the fully loaded html fiel Wait and return the html content after a certain amount Args: url: str, rootURL of the root page Returns: html: str, fully loaded html with all ajax content ''' # get and set up proxy proxy_ip = self.get_proxy_ip() # check if the proxy is available # aval = self.check_proxy_ip(proxy_ip) # if aval: # proxy = f'http://{proxy_ip}' # try: # webdriver.DesiredCapabilities.CHROME['proxy'] = { # "httpProxy": proxy, # "proxyType": "MANUAL", # } # print(f'set up proxy for js loader: {proxy}') # except: # print('unable to set up proxy for js loader...') if proxy_ip != None: try: webdriver.DesiredCapabilities.CHROME['proxy'] = { "httpProxy": f'http://{proxy_ip}', 'httpsProxy': f'https://{proxy_ip}', "proxyType": "MANUAL", } print(f'Set up proxy for js loader: {proxy}') except: print('Uable to set up proxy for js loader...') # Set up headless Chrome browser options = Options() # options.add_argument('--headless') options.page_load_strategy = 'eager' prefs = { 'profile.default_content_setting_values': { 'images': 2, } } options.add_experimental_option('prefs', prefs) driver = webdriver.Chrome(options=options) # Request, wait and return the fully loaded html file driver.get(url) print('Waiting js to load...') time.sleep(waitTime) html = driver.page_source # If the pagination is to scroll, scroll the page then if scroll: self.scroll_after_request(driver, scroll_num) html = driver.page_source.encode('utf-8') # If the pagination is to click, click the page for 3 times default if click: if not turn: self.click_after_request(driver, click_num, click_path) html = driver.page_source.encode('utf-8') if turn: html = driver.page_source.encode('utf-8') self.click_after_request(driver, 1, click_path) next_page_url = driver.current_url # Close after several tries # if waitTime > 3: # driver.close() # Return the page or load the html again if html != None: # driver.close() return html if not turn else (html, next_page_url) else: print('Failing to load js...trying again\n') # driver.close() return self.jsHtmlLoader(url, waitTime + 0.5)