Example No. 1
 def __init__(self, environment: Environment, headless: bool):
     options = Options()
     self.headless = headless
     options.headless = self.headless
     for arg in [
         "--disable-translate",
         "--disable-extensions",
         "--disable-background-networking",
         "--safebrowsing-disable-auto-update",
         "--disable-sync",
         "--metrics-recording-only",
         "--disable-default-apps",
         "--no-first-run",
         "--disable-setuid-sandbox",
         "--hide-scrollbars",
         "--no-sandbox",
         "--no-zygote",
         "--autoplay-policy=no-user-gesture-required",
         "--disable-notifications",
         "--disable-logging",
         "--disable-permissions-api",
     ]:
         options.add_argument(arg)
     # hide infobar about automation
     options.add_experimental_option("excludeSwitches", ["enable-automation"])
     options.add_experimental_option("useAutomationExtension", False)
     # workaround for the first page being way too slow to load
     # (~2 minutes in my case, caused by some non-essential element loading slowly)
     options.page_load_strategy = "eager"
     super().__init__(options=options)
     self.environment = environment
     self.start_time = None
     time.sleep(1)
     self.command_executor._commands["SEND_COMMAND"] = ("POST", "/session/$sessionId/chromium/send_command")
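Example No. 18 below invokes this same registered endpoint; as a minimal usage sketch (assuming the subclass above has been instantiated as driver, with illustrative throttling values):

# hypothetical call through the "SEND_COMMAND" endpoint registered above;
# the payload mirrors Example No. 18 and targets Chrome's DevTools protocol
driver.execute(
    "SEND_COMMAND",
    dict(
        cmd="Network.emulateNetworkConditions",
        params={
            "offline": False,
            "latency": 100,
            "downloadThroughput": 50000,
            "uploadThroughput": 50000,
        },
    ),
)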
Example No. 2
 def __init__(self, username, password):
     self.username = username
     self.password = password
     self.base_url = 'https://www.instagram.com'
     # page load strategy options are documented at https://www.selenium.dev/documentation/en/webdriver/page_loading_strategy/
     options = Options()
     options.page_load_strategy = 'eager'
     self.driver = webdriver.Chrome('chromedriver.exe', options=options)
     self.database = db.Database()
Example No. 3
 def __init__(self):
     self.review_dataset = []
     self.index_file = ReviewCrawler.index_file
     options = Options()
     options.page_load_strategy = 'eager'
     self.driver = webdriver.Chrome(
         options=options,
         executable_path="/Users/ssamko/Downloads/chromedriver")
     print(self.index_file)
Example No. 4
    def __init__(self):
        self.config = self.getConfig()
        self.checkout_url = 'https://www.bestbuy.com/checkout/r/fast-track'

        options = Options()
        options.page_load_strategy = 'normal'

        self.driver = webdriver.Chrome('chromedriver.exe', options=options)
        self.main()
Example No. 5
def get_browser():
    if BROWSER_TYPE.lower().find("chrome") >= 0:
        options = Options()
        options.page_load_strategy = 'eager'
        browser = Chrome(options=options)
    elif BROWSER_TYPE.lower().find("firefox") >= 0:
        browser = Firefox()
    else:
        raise Exception(f"Sorry, the {BROWSER_TYPE} browser is not supported")
    return browser
Example No. 6
def get_stats(player):

    names = player[0]
    url = player[1]

    options = Options()
    options.page_load_strategy = 'none'
    try:
        driver = drivers[threading.current_thread().name]
    except KeyError:
        drivers[threading.current_thread().name] = webdriver.Chrome(
            executable_path=DRIVER_PATH, options=options)
        driver = drivers[threading.current_thread().name]

    driver.get(url)

    table = driver.find_element_by_xpath('//*[@id="totals"]/tbody')

    try:
        all_star = driver.find_element_by_xpath('//*[@id="all_star"]/tbody')
        years = [
            year.text for year in all_star.find_elements_by_xpath('./tr/th')
        ]
    except Exception:  # this player has no All-Star table
        years = []

    season_totals = []
    #Get season totals by row
    for row in table.find_elements_by_xpath('./tr'):

        season = [td.text for td in row.find_elements_by_xpath("./td")]
        season.insert(0, row.find_element_by_xpath('./th').text)

        if len(season) == 32:
            season.pop(30)
        elif len(season) == 30:
            season.append('0')

        if season[0] in years:
            season.append('Yes')
        else:
            season.append('No')

        season.insert(0, names)

        for i in range(len(season)):
            if season[i] == '':
                season[i] = '0'
        season_totals.append(season)
    #Check if all star

    #Write to file
    with open('players_total.csv', 'a', newline='', encoding='utf-8') as file:
        write = csv.writer(file)
        write.writerows(season_totals)
Example No. 7
def stock_crawler(url):
    driver_path = mac_chromedriver_path

    options = Options()
    options.page_load_strategy = 'normal'
    options.add_argument('--headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("lang=ko_KR")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.122 Safari/537.36")

    options.add_argument('--log-level=3')
    options.add_argument('--disable-logging')
    options.add_argument(mac_user_chrome_data_path)
    driver = webdriver.Chrome(executable_path=driver_path, options=options)

    driver.get(url)
    driver.execute_script(
        "Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5]}})"
    )
    driver.execute_script(
        "Object.defineProperty(navigator, 'languages', {get: function() {return ['ko-KR', 'ko']}})"
    )
    driver.execute_script(
        "const getParameter = WebGLRenderingContext.getParameter;WebGLRenderingContext.prototype.getParameter = function(parameter) {if (parameter === 37445) {return 'NVIDIA Corporation'} if (parameter === 37446) {return 'NVIDIA GeForce GTX 980 Ti OpenGL Engine';}return getParameter(parameter);};"
    )

    time.sleep(5)

    if url == ssg:
        el_stock = driver.find_elements_by_css_selector('#oriCart')
        print(el_stock)

        el_stock = el_stock[0].get_attribute('outerHTML')
        print(el_stock)
        if 'soldout' in el_stock:
            # sendChannelMsg(f'The Nintendo Switch Animal Crossing edition is currently out of stock at ssg.\nI will check again and let you know!\n{url}')
            pass
        else:
            print('ssg has stock')
    elif url == coupang:
        el_stock = driver.find_elements_by_css_selector(
            "#contents > div.prod-atf > div > div.prod-buy.sold-out.new-oos-style.not-loyalty-member.eligible-address.without-subscribe-buy-type.DISPLAY_0.only-one-delivery > div.prod-price-container"
        )
        el_stock = el_stock[0].get_attribute('outerHTML')
        if '일시품절' in el_stock:  # '일시품절' means "temporarily sold out" on the page
            # sendChannelMsg(f'The Nintendo Switch Animal Crossing edition is currently out of stock at coupang.\nI will check again and let you know!\n{url}')
            pass
        else:
            print('coupang has stock')

    driver.quit()
Example No. 8
def get_webdriver_options(output_dir):
    options = Options()
    options.page_load_strategy = 'normal'
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-popup-blocking")
    prefs = {
        "profile.default_content_settings.popups": 0,
        'download.directory_upgrade': True,
        "download.default_directory": os.path.join(os.getcwd(), output_dir)
    }
    options.add_experimental_option("prefs", prefs)
    return options
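A minimal usage sketch for the helper above (the 'downloads' directory name is illustrative):

from selenium import webdriver

# downloaded files land in ./downloads under the current working directory
driver = webdriver.Chrome(options=get_webdriver_options('downloads'))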
Example No. 9
def getWebDriver(args):
    """
    Get the appopiate driver for chosen browser
    """
    driver = None
    if args.chrome:
        options = ChromeOptions()
        options.page_load_strategy = 'eager'
        # Default profile directory
        userDataDir = os.getenv(
            'LOCALAPPDATA') + "\\Google\\Chrome\\User Data" if args.profile is None else args.profile
        options.add_argument("user-data-dir=" + userDataDir)
        driver = Chrome(options=options)

    elif args.firefox:
        options = FirefoxOptions()
        options.page_load_strategy = 'eager'
        # Default profile directory
        profiles = os.listdir(os.getenv('APPDATA') +
                              "\\Mozilla\\Firefox\\Profiles\\")
        # "xxxxxxxx.default-release" is the default profile for Release versions v67+
        default_profile = next(
            profile for profile in profiles if profile[-15:] == "default-release")
        userDataDir = os.getenv('APPDATA')+"\\Mozilla\\Firefox\\Profiles\\" + \
            default_profile if args.profile is None else args.profile
        fp = FirefoxProfile(userDataDir)
        driver = Firefox(fp, options=options)

    elif args.edge:
        options = EdgeOptions()
        options.page_load_strategy = 'eager'
        options.use_chromium = True
        # Default profile directory
        userDataDir = os.getenv(
            'LOCALAPPDATA') + "\\Microsoft\\Edge\\User Data" if args.profile is None else args.profile
        options.add_argument("user-data-dir=" + userDataDir)
        driver = Edge(options=options)

    return driver
Example No. 10
    def __init__(self, id, database_creds, ng):
        self.cnxn = pyodbc.connect(database_creds.get_connectioN_string(),
                                   autocommit=True)
        self.ng = ng

        pid = os.getpid()

        cursor = self.cnxn.cursor()
        self.save_cursor = self.cnxn.cursor()
        option = Options()

        option.add_argument("--disable-infobars")

        option.add_argument("--disable-gpu")
        option.add_argument("--start-maximized")
        #   option.add_argument("--headless")
        #   option.add_argument("--window-size=1024,768")
        option.add_argument("--disable-extensions")
        option.add_argument("--disable-translate")
        option.add_argument("--allow-file-access-from-files")
        option.add_argument("--disable-dev-shm-usage")
        option.page_load_strategy = 'eager'

        # option.add_argument("--enable-usermedia-screen-capturing")
        # option.add_argument("--use-fake-ui-for-media-stream")
        # option.add_argument("--use-fake-device-for-media-stream")
        #  option.add_argument("--use-fake-ui-device-for-media-stream")
        #  option.add_argument("--use-file-for-fake-video-capture=C:\\temp\\bunnyvideo.mjpeg")
        #   option.add_argument("--use-file-for-fake-audio-capture=C:\\temp\\bunny.opus")
        option.add_argument("--enable-tracing")
        #  option.add_argument("--enable-tracing-output = c:\\temp\\log.txt")

        # Pass the argument 1 to allow and 2 to block
        #  option.add_experimental_option("prefs", {
        #     "profile.default_content_setting_values.notifications": 2
        #  })
        option.set_capability('unhandledPromptBehavior', 'accept')

        secret = Secret()

        self.driver = webdriver.Remote(command_executor=secret.URL,
                                       desired_capabilities={
                                           "browserName": "chrome",
                                       },
                                       options=option)

        self.step = Test()
        self.session = session.Session()
        self.master_id = id

        self.run()
Example No. 11
def conectarWeb():
    global driver
    options = Options()
    options.page_load_strategy = 'eager'
    driver = webdriver.Chrome('/usr/bin/chromedriver', options=options)
    driver.maximize_window()

    time.sleep(1)
    driver.get(link)

    ### dismiss the cookie banner
    WebDriverWait(driver, 5)\
        .until(EC.element_to_be_clickable((By.XPATH,
                                          '/html/body/div/p[2]/a')))\
        .click()
    return driver
Example No. 12
def get_default_driver_options(width=1472,
                               height=828,
                               headless=True) -> Options:
    """
    Generate default Chrome driver options
    :param width: int
    :param height: int
    :param headless: bool
    :return: Options
    """

    chrome_options = Options()
    chrome_options.headless = headless
    chrome_options.page_load_strategy = 'normal'

    chrome_options.add_argument('--enable-automation')
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument(f'--window-size={width},{height}')
    chrome_options.add_argument('--lang=en-GB')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-setuid-sandbox')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument("--proxy-server='direct://'")
    chrome_options.add_argument('--proxy-bypass-list=*')
    chrome_options.add_argument('--disable-accelerated-2d-canvas')
    chrome_options.add_argument('--allow-running-insecure-content')
    chrome_options.add_argument('--disable-web-security')
    chrome_options.add_argument('--disable-client-side-phishing-detection')
    chrome_options.add_argument('--disable-notifications')
    chrome_options.add_argument('--mute-audio')

    # Disable downloads
    chrome_options.add_experimental_option(
        'prefs', {
            'safebrowsing.enabled': False,
            'download.prompt_for_download': False,
            'download.default_directory': '/dev/null',
            'download_restrictions': 3,
            'profile.default_content_setting_values.notifications': 2,
        })

    return chrome_options
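A usage sketch for the factory above, assuming Selenium and chromedriver are installed and on PATH:

from selenium import webdriver

driver = webdriver.Chrome(options=get_default_driver_options(headless=True))
driver.get('https://example.com')  # illustrative URL
driver.quit()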
Example No. 13
def create_driver_instance():
    user_agent = random.choice(user_agent_list)
    chrome_options = Options()
    chrome_options.add_argument('headless')
    chrome_options.add_argument("window-size=1024,768")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-plugins")
    chrome_options.add_argument("--disable-images")
    chrome_options.add_argument('blink-settings=imagesEnabled=false')
    chrome_options.add_argument(f'user-agent={user_agent}')
    chrome_options.add_argument('log-level=3')
    chrome_options.page_load_strategy = 'eager'  # strategy names must be lowercase
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)  # Chrome
    driver.set_page_load_timeout(5000)
    driver.set_script_timeout(5000)
    return driver
Example No. 14
def wikiscrap():
    options = Options()
    options.page_load_strategy = 'none'

    driver = webdriver.Chrome(executable_path='chromedriver', options=options)
    driver.get('https://en.wikipedia.org/wiki/List_of_Indian_dishes')

    # Finding all the names of foods from a wikipedia page
    foodname = driver.find_elements_by_xpath("//tbody/tr/td[position()=1]")

    # Rejecting empty cells
    foodlist = [make_tags(x.text) for x in foodname if x.text != '']

    print("No of names scraped =", len(foodlist))

    driver.quit()

    food = pd.DataFrame(foodlist, columns=['food_name'])
    food.to_csv('food.csv')
Example No. 15
def html_parse(value):
    options = Options()
    options.page_load_strategy = 'eager'
    boo = True
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.get('https://soundcloud.com/' + value + '/tracks')
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    if soup.find('title').text == "Something went wrong on SoundCloud":
        boo = False
        soup = None

    while boo:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if soup.find('div', class_='paging-eof') is None:
            driver.execute_script(
                "window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(0.5)
        else:
            boo = False
    return soup
Example No. 16
 def get_driver(self):
     if self.use_custom_urls:
         self.url = self.custom_urls
     if self.settings.use_chrome:
         options = Options()
         #options.binary_location(executable_path=self.settings.custom_chrome_exe_path)
         options.add_argument('--disable-dev-shm-usage')
         #options.binary_location = self.settings.custom_chrome_exe_path
         options.page_load_strategy = "eager"
         options.add_experimental_option("excludeSwitches", ["enable-automation"])
         options.add_experimental_option("useAutomationExtension", False)
         #options.add_argument('--no-proxy-server')
         #options.add_argument("--proxy-server='direct://'")
         #options.add_argument("--proxy-bypass-list=*")
         #options.add_argument('--blink-settings=imagesEnabled=false')
         #options.add_argument("--no-sandbox");
         if not self.load_images:
             prefs = {"profile.managed_default_content_settings.images": 2}
             options.add_experimental_option("prefs", prefs)
         if self.headless_mode:
             options.add_argument('--headless')
             options.add_argument('--no-proxy-server')
         if self.settings.use_chrome_profile:
             options.add_argument("--user-data-dir={}".format(self.settings.chrome_profile_path)) # .profile-bb
             options.add_argument('--profile-directory=Default')
         if self.settings.custom_chrome_exe_path:  # treat both "" and None as unset
             #driver = webdriver.Chrome(self.settings.custom_chrome_exe_path, options=options)
             driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
         else:
             driver = webdriver.Chrome(options=options)
     else:
         fireFoxOptions = webdriver.FirefoxOptions()
         firefox_profile = webdriver.FirefoxProfile(self.settings.custom_firefox_exe_path)
         if self.headless_mode:
             fireFoxOptions.set_headless()
         elif not self.load_images:
             firefox_profile.set_preference('permissions.default.image', 2)
             firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
         driver = webdriver.Firefox(firefox_profile=firefox_profile, firefox_options=fireFoxOptions)
     return driver
Example No. 17
    def make_options(self, headless=True, proxing=False):
        '''Configure the browser'''
        dict_options = {}
        options = Options()
        if headless:
            options.add_argument("--headless")
        if self.binary_location:
            options.binary_location = (self.binary_location)
        options.page_load_strategy = 'eager'
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('user-agent={}'.format(load_ua()))
        dict_options['options'] = options

        if proxing:
            prox = Proxy()
            prox.proxy_type = ProxyType.MANUAL
            prox.http_proxy = 'socks5://127.0.0.1:9050'
            prox.ssl_proxy = 'socks5://127.0.0.1:9050'
            capabilities = webdriver.DesiredCapabilities.CHROME
            prox.add_to_capabilities(capabilities)
            dict_options['desired_capabilities'] = capabilities
        return dict_options
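Since make_options returns a dict of keyword arguments, it can be unpacked straight into the driver constructor; a sketch under that assumption, called from inside the same class:

# dict_options carries 'options' and, when proxing=True, 'desired_capabilities'
driver = webdriver.Chrome(**self.make_options(headless=True, proxing=False))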
Example No. 18
 def __init__(self, environment: Environment, headless: bool):
     chrome_options = Options()
     self.headless = headless
     chrome_options.headless = self.headless
     # workaround for the first page being way too slow to load
     # (~2 minutes in my case, caused by some non-essential element loading slowly)
     chrome_options.page_load_strategy = "eager"
     super().__init__(options=chrome_options)
     self.environment = environment
     self.start_time = None
     self.command_executor._commands["SEND_COMMAND"] = (
         "POST", "/session/$sessionId/chromium/send_command")
     self.execute(
         "SEND_COMMAND",
         dict(
             cmd="Network.emulateNetworkConditions",
             params={
                 "offline": False,
                 "latency": 100,
                 "downloadThroughput": 50000,
                 "uploadThroughput": 50000
             },
         ),
     )
Example No. 19
def main():
    crawler = ReviewCrawler()
    while True:
        try:
            crawler.start_idx = crawler.get_start_idx()
            crawler.crawl_from(crawler.start_idx)
            print('finish?')
            break
        except Exception as e:
            print(e)
            print('timeout error: stop')
            time.sleep(2)
            crawler.driver.quit()
            options = Options()
            options.page_load_strategy = 'eager'
            crawler.driver = webdriver.Chrome(
                options=options,
                executable_path="/Users/ssamko/Downloads/chromedriver")
            crawler.driver.set_window_position(-10000, 0)
            crawler.start_idx = crawler.get_start_idx()
            if crawler.start_idx == -1:
                print('finish well')
                break
            continue
Example No. 20
BEST_BUY_ADD_TO_CART_API_URL = "https://www.bestbuy.com/cart/api/v1/addToCart"
BEST_BUY_CHECKOUT_URL = "https://www.bestbuy.com/checkout/c/orders/{order_id}/"
WEBHOOK_URL = "https://discordapp.com/api/webhooks/778423506294931478/rFcPl55WCL_y-ucIpEBekQ08xoXJhVw5GjqkoXZLPvP3PwRbTCc9y7mhdGHH6dmS-IxO"

DEFAULT_HEADERS = {
    "accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    "user-agent": settings.userAgent,
    "origin": "https://www.bestbuy.com",
}

options = Options()
options.page_load_strategy = "eager"
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
options.add_argument("user-data-dir=.profile-bb")


class BestBuy:
    def __init__(self, task_id, status_signal, image_signal, product, profile,
                 proxy, monitor_delay, error_delay):
        self.task_id = task_id
        self.status_signal = status_signal
        self.image_signal = image_signal
        self.product = product
        self.profile = profile
        self.monitor_delay = float(monitor_delay)
        self.error_delay = float(error_delay)
        self.sku_id = parse.parse_qs(parse.urlparse(
            self.product).query)['skuId'][0]
        self.session = requests.Session()
Example No. 21
        os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))
import config

conf = config.Config()
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
mongo = MongoClient(f"mongodb://{conf.MONGO_REMOTE_IP}:27017")
db = mongo['aircode']
review_col = db['nsmall_reviews']
prod_col = db['nsmall']
today = datetime.date.today()
prod_list = list(prod_col.find({'reg_date': str(today)}))
print(len(prod_list))
review_dataset = []

options = Options()
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(
    options=options, executable_path="/Users/ssamko/Downloads/chromedriver")

if not os.path.exists(os.path.join(BASE_DIR, 'daily')):
    os.mkdir(os.path.join(BASE_DIR, 'daily'))

daily_index_dir = os.path.join(BASE_DIR, f'daily/{today}')

if not os.path.exists(daily_index_dir):
    os.mkdir(daily_index_dir)


def runtimer(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
Example No. 22
import fitz

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import numpy as np

url = 'https://www.ey.com/en_gl/tax-alerts'

# chrome webdriver options
options = Options()
options.page_load_strategy = 'normal'
options.add_argument("--start-maximized")
options.add_argument('--disable-extensions')

#%%
# I will get the full list of countries from EY webpage

driver = webdriver.Chrome(options=options)
driver.get(url)

time.sleep(2)

# clicking on "I decline optional cookies"
try:
    WebDriverWait(driver, 10)\
        .until(EC.presence_of_element_located((By.XPATH, '//*[@id="cookiePolicy"]/div/div[2]/button[1]')))\
        .click()
except Exception:
    pass  # cookie banner did not appear
Example No. 23
def main():
    print('(webdrv) main:')
    print()

    urlbase = 'https://kroger-gcm.semi.cashstar.com'
    submalls = ('', 'bakersplus', 'city-market', 'dillons', 'food-4-less',
                'foods-co', 'fred-meyer', 'frys-food', 'gerbes', 'jay-c-foods',
                'king-soopers', 'marianos', 'metro-market', 'payless',
                'pick-n-save', 'qfc', 'ralphs', 'smiths', 'kroger')

    # todo: any user input
    #
    # service = Service('/usr/local/bin/chromedriver')
    # service.start()

    options = Options()
    options.page_load_strategy = 'eager'
    driver = webdriver.Chrome(options=options)
    # driver = webdriver.Remote(service.service_url)

    # submall = None
    # driver.get("http://www.google.com")
    # # submall_validation_data = submall_validation(driver, submall)
    # time.sleep(5)

    submall = ""
    driver.get(urlbase)
    submall_validation_data = submall_validation(driver, submall)
    time.sleep(5)

    for submall in submalls:
        url = urlbase + f'/{submall}'
        driver.get(url)
        submall_validation_data = submall_validation(driver, submall)
        time.sleep(3)

    driver.quit()

    print()
    print('(webdrv) end::')

    return 0
Example No. 24
 def load_chromedriver(self, path):
     chrome_options = Options()
     chrome_options.page_load_strategy = 'eager'
     return webdriver.Chrome(path, options=chrome_options)
Example No. 25
def main():
    options = Options()
    options.add_experimental_option("detach", True)

    # do not load images
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)

    # Page loading strategy
    options.page_load_strategy = 'none'
    """
    Page loading strategy: 

    normal - This will make Selenium WebDriver to wait for the entire page is loaded. When set to normal, Selenium WebDriver waits until the load event fire is returned.
    By default normal is set to browser if none is provided.

    eager - This will make Selenium WebDriver to wait until the initial HTML document has been completely loaded and parsed, and discards loading of stylesheets, images and subframes.

    none - When set to none Selenium WebDriver only waits until the initial page is downloaded.

    """

    # set user-data-dir to your own data directory (see "chrome://version/"); copy the original "Default" profile to "DefaultCopy" or the program errors
    # options.add_argument('--user-data-dir=C:\\Users\\Caviar\\AppData\\Local\\Temp\\scoped_dir3768_856195622\\DefaultCopy')

    # options.binary_location points at the 360 Speed Browser executable
    options.binary_location = 'C:\\Users\\Caviar\\AppData\\Local\\360Chrome\\Chrome\\Application\\360chrome.exe'

    # create the Chrome driver instance; executable_path points at the chromedriver binary
    driver = webdriver.Chrome(
        options=options,
        executable_path=
        'C:\\Users\\Caviar\\AppData\\Local\\360Chrome\\Chrome\\Application\\chromedriver.exe'
    )

    # minimize the browser window
    #driver.minimize_window()

    for url_season in l_season:

        dict_season = {}
        josn_fn = url_season.split('/')[-2]
        print([josn_fn])

        #driver ---- class WebDriver(selenium.webdriver.remote.webdriver.WebDriver)
        driver.get(url_season)

        # HTML page after browser rendering and tag cleanup
        html_season_ = driver.page_source
        #'''
        with open('经过浏览器渲染以及标签清洗过后的HTML页面.html', 'w', encoding='utf-8') as f:  # filename means "HTML page after browser rendering and tag cleanup"
            f.write(html_season_)
        #'''
        html_season = etree.HTML(html_season_)
        # description of each episode

        for a in html_season.xpath(
                '//*[@id="default-page"]/div/div/div/div/figure/ul/li/figure/a'
        ):
            #print(a)
            #print([etree.tounicode(a)])
            #print()

            try:
                figcaption = a.xpath('./following-sibling::*[1]')[0]
                img_episode = a.xpath('./img/@src')[0]
                url_episode = a.xpath('./@href')[0]

                #print(etree.tounicode(figcaption))
                strong_eng = figcaption.xpath('./div/a[1]/div/strong')[0]
                strong_chn = figcaption.xpath('./div/a[1]/strong')[0]
                discription_eng = strong_eng.text
                discription_chn = strong_chn.text

                # enter the second-level page
                # if the second-level page's raw HTML (fetched directly with requests) were well-formed, requests alone would be faster
                driver.get(url_episode)
                #driver.set_page_load_timeout(time_to_wait=10)
                driver.implicitly_wait(time_to_wait=0)
                html_episode_ = driver.page_source
                html_episode = etree.HTML(html_episode_)
                a_mp4 = html_episode.xpath(
                    '//*[@id="default-page"]/div/div/div/div/div[1]/div/div/section[2]/div/div/div[1]/div/div/div/div/div/a'
                )[0]
                mp4_url = a_mp4.xpath('./@href')[0]
                print(mp4_url)

                d = {mp4_url: [discription_eng, discription_chn, img_episode]}
                dict_season.update(d)
                print(d)
            except Exception:
                #### write an empty marker file when no mp4 link is found
                with open(f'{josn_fn}【无】.txt', 'w', encoding='utf-8') as f:
                    f.write('')
                break

        if dict_season != {}:
            #### write python_data to the data.json file
            with open(f'{josn_fn}.json', 'w', encoding='utf-8') as f:
                json.dump(dict_season,
                          f,
                          sort_keys=True,
                          indent=4,
                          ensure_ascii=False)
        else:
            pass
Example No. 26
def search_site(website, product="mens slim jeans"):
    ''' Opens the website and searches it with Selenium's find_element '''

    url = str(website)
    searchfor = str(product)

    # getting the iphone content version (hopefully less content)
    headers = {
        'user-agent':
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1'
    }

    options = Options()
    options.page_load_strategy = 'eager'  # waits only til html is loaded and parsed, no stylesheets, images etc

    username = os.path.expanduser('~')
    # only on mac, if windows -> change the path to C:\\
    download_path = username + '/Downloads/chromedriver'
    browser = webdriver.Chrome(download_path, options=options)

    browser.get(url)
    # browser.set_page_load_timeout(10)

    product_price = {}

    # This is super inefficient and annoying to copy-paste; if you know a better way, let me know!
    # If there is a search box on the site, I assume it is the first input or form element. Not perfect, but reasonable.
    # Attempt every possible way a website may have named its search box!

    # The fuckit module ignores errors and moves on with the code.
    # try/except blocks would have taken me too long but are effectively the same as the fuckit module.

    with fuckit:

        # TIMING TO AVOID BOT DENIAL ?? time.sleep(2)
        searchbox = browser.find_element_by_name('search')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)

        # options2 = webdriver.ChromeOptions()
        # options2.add_argument('--ignore-certificate-errors')
        # options2.add_argument("--test-type")
        # options2.binary_location = "/usr/bin/chromium"
        # browser.save_screenshot('screenshot.png', chrome_options = options2)

        # do i need to reassign the new url? guess not
        print('current url', browser.current_url)

        # tag = browser.find_elements_by_tag_name('p')
        # tag2 = browser.find_elements_by_tag_name('a')

        # tag3 = browser.find_elements_by_class_name('prod_price')
        # tag4 = browser.find_elements_by_tag_name('span')

        # browser.find_elements_by_class_name('productPrice')
        # browser.find_elements_by_class_name('product-Price')
        # browser.find_elements_by_class_name('productprice')
        # browser.find_elements_by_class_name('Price')
        # price = browser.find_elements_by_class_name('price')
        # price2 = browser.find_elements_by_id('price')

        # browser.find_elements_by_class_name('prod_price')
        # browser.find_elements_by_class_name('prod_Price')

        # for item in price:
        #     print(item.text)

        # for item in tag4:
        #     print(item.text)

        # SOUP scrape
        page_response = requests.get(browser.current_url, headers=headers)
        page_content = bs(page_response.content, "html.parser")
        print(page_content.prettify)
        ul_lists = page_content.find_all('div')
        # print(ul_lists)

        # REGEX TRIAL
        # pattern = re.compile(r'[0-9][0-9][1-9]\.[0-9][0-9]?')
        # match = pattern.findall(str(page_content))

        # i=0
        # for m in match:
        #     if i < 10:
        #         print(m)
        #         i += 1

        # floats = re.findall("\d+\.\d+", str(page_content))
        # for num in floats:
        #     print(num)
        #     print(num.text)

        # If-statement after search() tests if it succeeded
        # if match:
        #     print('found', match.group()) ## 'found word:cat'
        # else:
        #     print('did not find')

        # browser.find_elements_by_name('price')
        # browser.find_elements_by_name('product')
        # browser.find_elements_by_name('product-price')
        # browser.find_elements_by_name('productprice')

        # browser.find_elements_by_class_name('price')
        # browser.find_elements_by_class_name('product')
        # browser.find_elements_by_class_name('product-price')
        # browser.find_elements_by_class_name('productprice')

        # browser.find_elements_by_id('price')
        # browser.find_elements_by_id('product')
        # browser.find_elements_by_id('product-price')
        # browser.find_elements_by_id('productprice')


    with fuckit:
        searchbox = browser.find_element_by_name('search-input')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:
        searchbox = browser.find_element_by_name('searchbox')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:
        searchbox = browser.find_element_by_name('search-box')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:
        searchbox = browser.find_element_by_name('search-bar')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('input')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('keyword')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('key-word')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('key-words')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('keywords')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('keywordSearch')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('globalSearchInputField')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('globalSearchInput')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('globalSearchInput')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_name('globalSearchField')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_id('search')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_id('search-input')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_id('searchbox')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_id('searchbar')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_id('input')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_id('globalSearchInputField')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_id('globalSearch')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_id('globalSearchInput')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_id('globalSearchField')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_tag_name('input')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchform = browser.find_element_by_tag_name('form')
        searchbox = searchform.find_element_by_tag_name('input')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_class_name(
            'globalSearchInputField')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_class_name('keyword')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_class_name('keywords')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_class_name('search')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_class_name('searchbox')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_class_name('searchbar')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    with fuckit:

        searchbox = browser.find_element_by_class_name('input')
        searchbox.send_keys(searchfor)
        searchbox.send_keys(Keys.ENTER)
        print(browser.current_url)

    browser.quit()
    return 'website'
Example No. 27
    "httpProxy": PROXY,
    "ftpProxy": PROXY,
    "sslProxy": PROXY,
    "proxyType": "MANUAL",

}

with webdriver.Firefox() as driver:
    # Open URL
    driver.get("https://selenium.dev")

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.page_load_strategy = 'normal'
driver = webdriver.Chrome(options=options)
# Navigate to url
driver.get("http://www.google.com")
driver.quit()

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options)
# Navigate to url
driver.get("http://www.google.com")
driver.quit()
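For completeness, the third strategy, 'none', follows the same pattern as the two snippets above; a sketch in which WebDriver returns control as soon as the initial page is downloaded:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.page_load_strategy = 'none'
driver = webdriver.Chrome(options=options)
# Navigate to url
driver.get("http://www.google.com")
driver.quit()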
Example No. 28
def main(start_i=0):
    conf = config.Config()

    options = Options()
    options.page_load_strategy = 'eager'
    driver = webdriver.Chrome(
        options=options,
        executable_path="/Users/ssamko/Downloads/chromedriver")
    # print(conf.MONGO_REMOTE_IP)

    # mongo = MongoClient(f"mongodb://localhost:27017")
    mongo = MongoClient(f"mongodb://{conf.MONGO_REMOTE_IP}:27017")
    db = mongo['aircode']
    col = db['hmall_prod']

    today = datetime.date.today()

    with open(f'crawler/hmall/daily/{today}.txt', 'r') as f:
        urls = f.readlines()
        for url in urls[start_i:]:
            # print(url)
            # data = requests.get(url)
            # soup = BeautifulSoup(data.text, 'html.parser')
            driver.get(url)
            WebDriverWait(driver, timeout=5).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.pdtCode')))

            # with open('crawler/hmall/page_sample.txt','w') as sample:
            #     sample.write(data.text)
            try:
                # prod_id = soup.select_one('div.pdtCode > span:nth-child(1)').text.split(":")[-1].strip()
                prod_id = driver.find_element_by_css_selector(
                    'div.pdtCode > span:nth-child(1)').text.split(
                        ":")[-1].strip()
                prod_id = int(prod_id)
                print(prod_id)
                # prod_name = soup.select_one('h3.pdtTitle').text.strip()
                prod_name = driver.find_element_by_css_selector(
                    'h3.pdtTitle').text.strip()
                # print(prod_name)
                # price = soup.select_one('p.finalPrice.number.hasDC > strong').text.strip()
                price = driver.find_element_by_css_selector(
                    'p.finalPrice.number.hasDC > strong').text.strip()
                # print(price)
            except Exception as e:
                print(e)
                continue
            try:
                # score = soup.select_one('em.scoreMount').text.strip()
                score = driver.find_element_by_css_selector(
                    'em.scoreMount').text.strip()
                # print(score)
                # score_persons = soup.select_one('p.scoreNum').text[1:].split()[0]
                score_persons = driver.find_element_by_css_selector(
                    'p.scoreNum').text[1:].split()[0]
                # print(score_persons)
            except Exception as e:
                print(e)
                print('No score exists')
                score = None
                score_persons = 0
            # img_url = soup.select_one('#prd_ipzoom > img')['src']
            #prd_ipzoom > div._frm_magnifier > div > img
            img_url = driver.find_element_by_css_selector(
                '#prd_ipzoom > div._frm_magnifier > div > img').get_attribute(
                    'src')
            # prd_ipzoom > div._frm_magnifier > div > img
            # print(img_url)
            # 3 types of img path
            # t1 = soup.select('#guidance > table > tbody > tr > td > p')
            t1 = driver.find_elements_by_css_selector(
                '#guidance > table > tbody > tr > td > p')
            # t2 = soup.select('#deal_unit_d1 > dt > p')
            t2 = driver.find_elements_by_css_selector('#deal_unit_d1 > dt > p')
            #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(1)
            # t3 = soup.select('#section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p')
            t3 = driver.find_elements_by_css_selector(
                '#section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p'
            )

            if t1:
                # print("t1 >>",t1)
                detail_imgs_p = t1
            elif t2:
                # print("t2 >>",t2)
                detail_imgs_p = t2
            elif t3:
                # print("t3 >>",t3)
                detail_imgs_p = t3
            else:
                print(t1, t2, t3)
                print('weird type error')
                continue
            detail_urls = []
            for p in detail_imgs_p:
                detail_img = p.find_elements_by_css_selector('img')
                if not detail_img:
                    # print('not img p')
                    continue
                # print('detail >>', detail_img['src'])
                # detail_img_url = detail_img['src']
                # print(type(detail_img_url))
                while detail_img:
                    detail_urls.append(detail_img.pop().get_attribute('src'))
                # print(detail_urls)
            if not detail_urls:
                print('no detail img')
            # print(detail_urls[0][0][0][0][0])
            #['src']
            #guidance > table > tbody > tr > td > p:nth-child(1) > img
            #guidance > table > tbody > tr > td > p:nth-child(2) > img
            #guidance > table > tbody > tr > td > p:nth-child(2) > img
            #guidance > table > tbody > tr > td > p:nth-child(6) > img
            #guidance > table > tbody > tr > td > p:nth-child(2) > img
            #guidance > table > tbody > tr > td > p:nth-child(4) > img
            #guidance > table > tbody > tr > td > p:nth-child(2) > img
            #deal_unit_d1 > dt > p:nth-child(5) > img
            #deal_unit_d1 > dt > p:nth-child(3) > img
            #deal_unit_d1 > dt > p:nth-child(2) > img
            #deal_unit_d1 > dt > p:nth-child(7) > img
            #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(5) > img
            #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(5) > img
            #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(3) > img
            #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(2) > img
            #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(1) > img
            #section_cont_1 > div.prod_detail_view.open > div > table > tbody > tr > td > p:nth-child(3) > img

            db_data = {
                'prod_id': prod_id,
                'prod_name': prod_name,
                'price': price,
                'score': score,
                'score_persons': score_persons,
                'img_url': img_url,
                'detail_img_url': detail_urls,
                'reg_date': str(today)
            }
            # print(db_data)
            col.insert_one(db_data)
Example No. 29
    "proxyType": "MANUAL",
}

with webdriver.Chrome() as driver:
    driver.get("xxx.com")

# Page load strategy
"""
The document.readyState property describes the load state of the current page. By default,
WebDriver delays the response of driver.get(), and calls to driver.navigate().to(),
until the page's ready state is complete.
"""

# normal: the default load strategy
# WebDriver waits for the whole page to load; when set to normal, it keeps waiting until the load event fires
options = Options()
options.page_load_strategy = "normal"
browser = webdriver.Chrome(options=options)
browser.get("xxx.com")
browser.quit()

# eager load strategy
# WebDriver waits until the HTML file is fully loaded and parsed, ignoring stylesheets, images and subframes
# when set to eager, it keeps waiting until the DOMContentLoaded event fires
options.page_load_strategy = "eager"

# none load strategy
# WebDriver only waits until the initial page has been downloaded

# find_element is used to locate elements on the page
# locate the search box element named "q"
search_box = browser.find_element_by_name("q")
Example No. 30
    def jsHtmlLoader(self,
                     url,
                     scroll=False,
                     scroll_num=3,
                     click=False,
                     click_num=3,
                     click_path='',
                     turn=False,
                     waitTime=1):
        '''Use selenium to get the fully loaded html file

        Wait a certain amount of time, then return the html content
        
        Args:
            url: str, rootURL of the root page
            
        Returns:
            html: str, fully loaded html with all ajax content
        '''

        # get and set up proxy
        proxy_ip = self.get_proxy_ip()

        # check if the proxy is available
        # aval = self.check_proxy_ip(proxy_ip)
        # if aval:
        #     proxy = f'http://{proxy_ip}'
        #     try:
        #         webdriver.DesiredCapabilities.CHROME['proxy'] = {
        #             "httpProxy": proxy,
        #             "proxyType": "MANUAL",
        #         }
        #         print(f'set up proxy for js loader: {proxy}')
        #     except:
        #         print('unable to set up proxy for js loader...')

        if proxy_ip is not None:
            try:
                webdriver.DesiredCapabilities.CHROME['proxy'] = {
                    "httpProxy": f'http://{proxy_ip}',
                    'httpsProxy': f'https://{proxy_ip}',
                    "proxyType": "MANUAL",
                }
                print(f'Set up proxy for js loader: http://{proxy_ip}')
            except Exception:
                print('Unable to set up proxy for js loader...')

        # Set up headless Chrome browser
        options = Options()
        # options.add_argument('--headless')
        options.page_load_strategy = 'eager'
        prefs = {
            'profile.default_content_setting_values': {
                'images': 2,
            }
        }
        options.add_experimental_option('prefs', prefs)
        driver = webdriver.Chrome(options=options)

        # Request, wait and return the fully loaded html file
        driver.get(url)
        print('Waiting for js to load...')
        time.sleep(waitTime)

        html = driver.page_source

        # If the pagination is to scroll, scroll the page then
        if scroll:
            self.scroll_after_request(driver, scroll_num)
            html = driver.page_source.encode('utf-8')

        # If the pagination is to click, click the page for 3 times default
        if click:
            if not turn:
                self.click_after_request(driver, click_num, click_path)
                html = driver.page_source.encode('utf-8')
            if turn:
                html = driver.page_source.encode('utf-8')
                self.click_after_request(driver, 1, click_path)
                next_page_url = driver.current_url

        # Close after several tries
        # if waitTime > 3:
        # driver.close()

        # Return the page or load the html again
        if html is not None:
            # driver.close()
            return html if not turn else (html, next_page_url)
        else:
            print('Failed to load js... trying again\n')
            # driver.close()
            return self.jsHtmlLoader(url, waitTime=waitTime + 0.5)