def discover_urls_for_category(cls, category, extra_args=None):
    """Return the product URLs listed under ``category`` at Exito.

    Drives a headless browser through the catalog pages mapped to the
    requested category and collects every product link found.
    """
    base_url = 'https://www.exito.com'
    url_extensions = [
        ['Tecnologia-Celulares_y_accesorios-Accesorios_para_celular-'
         'Almacenamiento/_/N-2fzn',
         'MemoryCard'],
        ['Tecnologia-Computadores-_impresoras_y_tablets-'
         'Accesorios_de_computador-Memorias/_/N-2gbg',
         'ExternalStorageDrive'],
    ]
    discovered_urls = []
    with HeadlessChrome() as driver:
        # Visit the landing page first so the site session is initialized
        driver.get(base_url)
        for extension, section_category in url_extensions:
            if section_category != category:
                continue
            catalog_url = '{}/browse/{}?No=0&Nrpp=80'.format(
                base_url, extension)
            print(catalog_url)
            driver.get(catalog_url)
            catalog_soup = BeautifulSoup(driver.page_source, 'html.parser')
            for cell in catalog_soup.findAll('div', 'product'):
                product_url = base_url + cell.find('a')['href']
                # Strip the "?nocity" query flag from the product link
                discovered_urls.append(product_url.replace('?nocity', ''))
    return discovered_urls
def products_for_url(cls, url, category=None, extra_args=None):
    """Scrape a single Exito product page and return it as a one-item
    list of ``Product``.

    Renders the page with a headless browser, then parses name, SKU,
    part number, description panels, pictures and price from the HTML.
    """
    print(url)
    with HeadlessChrome() as driver:
        # Load the home page first so the site session is initialized
        driver.get('https://www.exito.com/')
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        # Part number is capped at 49 characters
        part_number = soup.find('div', 'reference').text \
            .replace('REF:', '').strip()[:49]
        name = soup.find('h1', 'name').text.strip()
        # The product container id carries the SKU after a 3-char prefix
        sku = soup.find('div', 'product')['id'][3:]
        # All info panels except the last one form the description
        panels = soup.findAll('div', 'tabs-pdp')[:-1]
        description = ''.join(
            html_to_markdown(str(panel)) + '\n\n' for panel in panels)
        picture_urls = [
            tag['data-src']
            for tag in soup.find(
                'div', {'id': 'slide-image-pdp'}).findAll('img')
        ]
        price_container = soup.find('div', 'col-data').find('span', 'money')
        if price_container:
            # Available product; -1 stock means "in stock, amount unknown"
            price = Decimal(price_container.text.replace(',', ''))
            stock = -1
        else:
            stock = 0
            price = Decimal(0)
        product = Product(
            name, cls.__name__, category, url, url, sku, stock,
            price, price, 'COP', sku=sku, part_number=part_number,
            description=description, picture_urls=picture_urls
        )
        return [product]
def main():
    """Interactively solve the Ripley CAPTCHA in a visible browser and
    print the resulting Cloudflare cookies as a JSON "extra args" blob.
    """
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename='discover_urls_for_categories.log',
                        filemode='w')
    parser = argparse.ArgumentParser(
        description='Generates the Cloudflare cookies for certain stores')
    parser.add_argument('proxy', type=str, help='Proxy to use')
    proxy = parser.parse_args().proxy
    # headless=False: a human must see the page to solve the CAPTCHA
    with HeadlessChrome(proxy=proxy, headless=False, images_enabled=True) \
            as driver:
        driver.get('https://simple.ripley.cl')
        input('Please complete the CAPTCHA, then press ENTER')
        cfduid_cookie = None
        cf_clearance_cookie = None
        cf_clearance_expiration = None
        for cookie in driver.get_cookies():
            cookie_name = cookie['name']
            if cookie_name == '__cfduid':
                cfduid_cookie = cookie['value']
                print('__cfduid', cookie)
            elif cookie_name == 'cf_clearance':
                cf_clearance_cookie = cookie['value']
                cf_clearance_expiration = cookie['expiry']
        assert cfduid_cookie and cf_clearance_cookie
        print('Use the following parameters as "extra args" for scraping')
        print('Cookie expires on:',
              time.strftime('%Y-%m-%d %H:%M:%S',
                            time.localtime(cf_clearance_expiration)))
        print(json.dumps({
            "proxy": proxy,
            "cf_clearance": cf_clearance_cookie,
            "__cfduid": cfduid_cookie
        }))
def discover_urls_for_category(cls, category, extra_args=None):
    """Collect product URLs for the requested category from Jumbo.

    Raises if a matching category page yields no products, which would
    indicate the page layout changed or the section is broken.
    """
    category_filters = [
        ('electro-y-tecnologia/electronica/televisores', 'Television'),
        ('electro-y-tecnologia/electrohogar/lavadoras-y-secadoras',
         'WashingMachine'),
        ('electro-y-tecnologia/tecnologia/smartwatch', 'Wearable'),
        # ('electro-y-tecnologia/tecnologia/celulares', 'Cell'),
        # ('electro-y-tecnologia/electrohogar/refrigeradores',
        #  'Refrigerator'),
        # ('electro-y-tecnologia/electrohogar/cocina-y-microondas',
        #  'Oven'),
        # ('electro-y-tecnologia/electrodomesticos/electro-cocina',
        #  'Oven'),
        # ('electro-y-tecnologia/electrodomesticos/aspiradoras',
        #  'VacuumCleaner'),
        # ('electro-y-tecnologia/electronica/parlantes', 'StereoSystem')
    ]
    product_urls = []
    with HeadlessChrome() as driver:
        for url_extension, local_category in category_filters:
            if local_category != category:
                continue
            category_url = 'https://store.jumbo.cl/{}?PS=100&sc=20'.format(
                url_extension)
            print(category_url)
            driver.get(category_url)
            page = BeautifulSoup(driver.page_source, 'html.parser')
            containers = page.findAll('div', 'box-product')
            if not containers:
                raise Exception('Empty section: ' + category_url)
            # "?sc=20" pins the sales channel on each product link
            product_urls.extend(
                box.find('a')['href'] + '?sc=20' for box in containers)
    return product_urls
def banners(cls, extra_args=None):
    """Scrape marketing banners from Falabella's home page, category
    pages and mosaic (listing) pages.

    Returns a list of dicts with the banner image (URL or base64
    screenshot), its destination links, position and section metadata.

    Fix: ``extra_args`` defaults to None, so the previous unguarded
    ``'proxy' in extra_args`` raised TypeError when no extra args were
    supplied; it is now guarded.
    """
    base_url = 'https://www.falabella.com/falabella-cl/{}'
    sections_data = [
        [bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, ''],
        # # CATEGORY PAGES # #
        # Currently displaying a smart picker
        [
            bs.REFRIGERATION, 'Electrohogar-Refrigeradores',
            bs.SUBSECTION_TYPE_CATEGORY_PAGE,
            'category/cat3205/Refrigeradores'
        ],
        [
            bs.WASHING_MACHINES, 'Electrohogar-Lavado',
            bs.SUBSECTION_TYPE_CATEGORY_PAGE,
            'category/cat3136/Lavado'
        ],
        [
            bs.TELEVISIONS, 'TV', bs.SUBSECTION_TYPE_CATEGORY_PAGE,
            'category/cat1012/TV'
        ],
        [
            bs.AUDIO, 'Audio', bs.SUBSECTION_TYPE_CATEGORY_PAGE,
            'category/cat2005/Audio'
        ],
        [
            bs.CELLS, 'Telefonía-Celulares y Teléfonos',
            bs.SUBSECTION_TYPE_CATEGORY_PAGE,
            'category/cat2018/Celulares-y-Telefonos'
        ],
        # # MOSAICS ##
        [
            bs.LINEA_BLANCA_FALABELLA, 'Electro y Tecnología-Línea Blanca',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat7090035/Linea-Blanca?isPLP=1'
        ],
        [
            bs.REFRIGERATION, 'Refrigeradores-No Frost',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat4074/No-Frost'
        ],
        [
            bs.REFRIGERATION, 'Refrigeradores-Side by Side',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat4091/Side-by-Side'
        ],
        # [bs.WASHING_MACHINES, 'Lavadoras', bs.SUBSECTION_TYPE_MOSAIC,
        #  'category/cat3136/Lavadoras '],
        [
            bs.WASHING_MACHINES, 'Lavadoras-Lavadoras',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat4060/Lavadoras'
        ],
        [
            bs.WASHING_MACHINES, 'Lavadoras-Lavadoras-Secadoras',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat1700002/Lavadoras-Secadoras'
        ],
        [
            bs.WASHING_MACHINES, 'Lavadoras-Secadoras',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat4088/Secadoras'
        ],
        [
            bs.WASHING_MACHINES, ' Lavadoras-Lavadoras Doble Carga',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat11400002/Lavadoras-Doble-Carga'
        ],
        [
            bs.TELEVISIONS, 'Tecnología-TV', bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat1012/TV?isPLP=1'
        ],
        [
            bs.TELEVISIONS, 'Televisores LED', bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat7190148/Televisores-LED'
        ],
        [
            bs.TELEVISIONS, 'LEDs menores a 50 pulgadas',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat11161614/LEDs-menores-a-50-pulgadas'
        ],
        [
            bs.TELEVISIONS, 'LEDs entre 50 - 55 pulgadas',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat11161675/LEDs-entre-50---55-pulgadas'
        ],
        [
            bs.TELEVISIONS, 'LEDs sobre 55 pulgadas',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat11161679/LEDs-sobre-55-pulgadas'
        ],
        # [bs.TELEVISIONS, 'TV-LED', bs.SUBSECTION_TYPE_MOSAIC,
        #  'category/cat2850014/LED'],
        # [bs.TELEVISIONS, 'TV-Smart TV', bs.SUBSECTION_TYPE_MOSAIC,
        #  'category/cat3040054/Smart-TV'],
        # [bs.TELEVISIONS, 'TV-4K UHD', bs.SUBSECTION_TYPE_MOSAIC,
        #  'category/cat3990038/4K-UHD'],
        # [bs.TELEVISIONS, 'TV-Televisores OLED',
        #  bs.SUBSECTION_TYPE_MOSAIC,
        #  'category/cat2850016/Televisores-OLED'],
        # [bs.TELEVISIONS, 'TV-Pulgadas Altas',
        #  bs.SUBSECTION_TYPE_MOSAIC,
        #  'category/cat12910024/Televisores-LED-Desde-65"'],
        [
            bs.AUDIO, 'Audio-Soundbar y Home Theater',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat2045/Home-Theater'
        ],
        [
            bs.AUDIO, 'Home Theater', bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat3050040/Home-Theater'
        ],
        [
            bs.AUDIO, 'Soundbar', bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat1700004/Soundbar'
        ],
        [
            bs.AUDIO, 'Minicomponente', bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat70018/Minicomponente'
        ],
        [
            bs.AUDIO, 'Audio-Equipos de Música y Karaokes',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat3091/?mkid=CA_P2_MIO1_024794'
        ],
        [
            bs.AUDIO, 'Audio-Hi-Fi', bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat3203/Hi-Fi'
        ],
        [
            bs.AUDIO, 'Audio', bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat2005/Audio?isPLP=1'
        ],
        [
            bs.CELLS, 'Smartphones', bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat720161/Smartphones'
        ],
        [
            bs.CELLS, 'Electro y Tecnología-Teléfonos',
            bs.SUBSECTION_TYPE_MOSAIC,
            'category/cat2018/Telefonos?isPLP=1'
        ],
    ]
    banners = []
    proxy = None
    # Guard: extra_args may legitimately be None (its default value)
    if extra_args and 'proxy' in extra_args:
        proxy = extra_args['proxy']
    for section, subsection, subsection_type, url_suffix in sections_data:
        url = base_url.format(url_suffix)
        if subsection_type == bs.SUBSECTION_TYPE_HOME:
            with HeadlessChrome(images_enabled=True, proxy=proxy) as driver:
                driver.set_window_size(1920, 1080)
                driver.get(url)
                # Skip first and last slides (carousel looping clones)
                images = driver\
                    .find_element_by_class_name('swiper-container')\
                    .find_elements_by_class_name('dy_unit')[1:-1]
                index = 1
                for image_url in images:
                    picture_array = image_url.find_element_by_tag_name(
                        'picture').find_elements_by_tag_name('source')
                    destination_urls = [
                        d.get_property('href')
                        for d in image_url.find_elements_by_tag_name('a')
                    ]
                    destination_urls = list(set(destination_urls))
                    # Take the first usable <source> of the slide
                    for picture in picture_array:
                        picture_url = picture.get_property('srcset').split(
                            ' ')[0]
                        if 'https://www.falabella.com' not in picture_url:
                            picture_url = 'https://www.falabella.com' \
                                          '{}'.format(picture_url)
                        if picture_url:
                            banners.append({
                                'url': url,
                                'picture_url': picture_url,
                                'destination_urls': destination_urls,
                                'key': picture_url,
                                'position': index,
                                'section': section,
                                'subsection': subsection,
                                'type': subsection_type
                            })
                            break
                    else:
                        raise Exception(
                            'No valid banners found for {} in position '
                            '{}'.format(url, index + 1))
                    index += 1
        elif subsection_type == bs.SUBSECTION_TYPE_CATEGORY_PAGE:
            with HeadlessChrome(images_enabled=True, proxy=proxy,
                                timeout=99) as driver:
                driver.set_window_size(1920, 1080)
                driver.get(url)
                pictures = []
                try:
                    # Force the carousel pips visible so each slide can
                    # be clicked and screenshot
                    pips_container = driver.find_element_by_class_name(
                        'fb-hero-carousel__pips')
                    driver.execute_script(
                        "arguments[0].setAttribute('style', "
                        "'display:block !important;');", pips_container)
                    elements = driver.find_element_by_class_name(
                        'fb-hero-carousel__pips')\
                        .find_elements_by_class_name(
                        'fb-hero-carousel__pips__pip')
                    for element in elements:
                        element.click()
                        time.sleep(2)
                        image_url = Image.open(
                            BytesIO(driver.get_screenshot_as_png()))
                        # Crop the full-page screenshot to the hero area
                        image_url = image_url.crop((0, 187, 1920, 769))
                        buffered = BytesIO()
                        image_url.save(buffered, format='PNG')
                        pictures.append(
                            base64.b64encode(buffered.getvalue()))
                except NoSuchElementException:
                    # Static hero without pips: single screenshot
                    image_url = Image.open(
                        BytesIO(driver.get_screenshot_as_png()))
                    image_url = image_url.crop((0, 187, 1920, 769))
                    buffered = BytesIO()
                    image_url.save(buffered, format='PNG')
                    pictures.append(base64.b64encode(buffered.getvalue()))
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                images_div = soup.findAll('div', 'fb-hero-carousel-slide')
                images_article = soup.findAll('article',
                                              'fb-hero-carousel-slide')
                images_module = soup.findAll('div',
                                             'hero fb-module-wrapper')
                images = images_div + images_article + images_module
                assert len(images) == len(pictures)
                for index, image_url in enumerate(images):
                    picture_array = image_url.findAll(
                        'picture')[-1].findAll('source')
                    destination_urls = [
                        d['href'] for d in image_url.findAll('a')
                    ]
                    destination_urls = list(set(destination_urls))
                    # Use the first non-webp source as the banner key
                    for picture in picture_array:
                        key = picture['srcset'].split(' ')[0]
                        if 'https' not in key:
                            key = 'https://www.falabella.com' + key
                        if 'webp' not in key:
                            banners.append({
                                'url': url,
                                'picture': pictures[index],
                                'destination_urls': destination_urls,
                                'key': key,
                                'position': index + 1,
                                'section': section,
                                'subsection': subsection,
                                'type': subsection_type
                            })
                            break
                    else:
                        raise Exception(
                            'No valid banners found for {} in position '
                            '{}'.format(url, index + 1))
        elif subsection_type == bs.SUBSECTION_TYPE_MOSAIC:
            # Mosaic pages serve a static "huincha" banner; plain
            # requests with a curl user-agent is enough
            session = session_with_proxy(extra_args)
            session.headers['user-agent'] = 'curl/7.64.1'
            soup = BeautifulSoup(session.get(url).text, 'html.parser')
            banner = soup.find('div', 'fb-huincha-main-wrap')
            if not banner:
                continue
            image_url = banner.find('source')['srcset']
            dest_url = banner.find('a')['href']
            banners.append({
                'url': url,
                'picture_url': image_url,
                'destination_urls': [dest_url],
                'key': image_url,
                'position': 1,
                'section': section,
                'subsection': subsection,
                'type': subsection_type
            })
    return banners
def banners(cls, extra_args=None):
    """Scrape marketing banners from AbcDin's home, category and mosaic
    pages.

    Home banners are screenshot via Selenium (the slides are CSS
    backgrounds); category/mosaic pages are parsed with requests +
    BeautifulSoup.

    Fix: the carousel loop used to rebind ``url_suffix``, shadowing and
    clobbering the outer loop variable of the same name; the local is
    now named ``link_tag``.
    """
    base_url = 'https://www.abcdin.cl/{}'
    sections_data = [
        [bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, ''],
        [
            bs.LINEA_BLANCA_ABCDIN, 'Línea Blanca AbcDin',
            bs.SUBSECTION_TYPE_CATEGORY_PAGE,
            'tienda/es/abcdin/linea-blanca'
        ],
        [
            bs.TELEVISIONS, 'Electro', bs.SUBSECTION_TYPE_CATEGORY_PAGE,
            'tienda/es/abcdin/tv-audio'
        ],
        [
            bs.CELLS, 'Telefonía', bs.SUBSECTION_TYPE_CATEGORY_PAGE,
            'tienda/es/abcdin/celulares'
        ],
        [
            bs.REFRIGERATION, 'Refrigeradores',
            bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/linea-blanca/refrigeradores'
        ],
        [
            bs.REFRIGERATION, 'Refrigeradores No Frost',
            bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/linea-blanca/refrigeradores/'
            'refrigeradores-no-frost'
        ],
        [
            bs.REFRIGERATION, 'Refrigeradores Side by Side',
            bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/linea-blanca/refrigeradores/'
            'refrigeradores-side-by-side'
        ],
        [
            bs.WASHING_MACHINES, 'Lavado y Secado',
            bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/linea-blanca/lavado-secado'
        ],
        [
            bs.WASHING_MACHINES, 'Lavadoras',
            bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/linea-blanca/lavado-secado/lavadoras'
        ],
        [
            bs.WASHING_MACHINES, 'Lavadoras-Secadoras',
            bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/linea-blanca/lavado-secado/'
            'lavadoras-secadoras'
        ],
        [
            bs.TELEVISIONS, 'Electro', bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/tv-audio'
        ],
        [
            bs.TELEVISIONS, 'Televisores LED',
            bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/tv-audio/televisores-video/televisores-led'
        ],
        [
            bs.AUDIO, 'Audio', bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/tv-audio/audio'
        ],
        [
            bs.AUDIO, 'Minicomponentes', bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/tv-audio/audio/minicomponentes'
        ],
        [
            bs.AUDIO, 'Home Theater', bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/tv-audio/audio/home-theater'
        ],
        [
            bs.CELLS, 'Smartphones', bs.SUBSECTION_TYPE_MOSAIC,
            'tienda/es/abcdin/celulares/smartphones'
        ]
    ]
    session = session_with_proxy(extra_args)
    banners = []
    for section, subsection, subsection_type, url_suffix in sections_data:
        url = base_url.format(url_suffix)
        print(url)
        if subsection_type == bs.SUBSECTION_TYPE_HOME:
            with HeadlessChrome(images_enabled=True) as driver:
                driver.set_window_size(1920, 1080)
                driver.get(url)
                time.sleep(10)
                elements = driver.find_elements_by_class_name(
                    'slide-static')
                controls = driver\
                    .find_element_by_class_name('pageControl')\
                    .find_elements_by_tag_name('a')
                assert len(elements) == len(controls)
                for index, element in enumerate(elements):
                    # Dismiss the promo modal if it is covering the page
                    modal_button = driver \
                        .find_elements_by_class_name('close-modal')
                    if modal_button:
                        modal_button[0].click()
                        time.sleep(2)
                    control = controls[index]
                    control.click()
                    time.sleep(2)
                    picture = element.screenshot_as_base64
                    # The slide image is a CSS background; extract its URL
                    key_container = element\
                        .value_of_css_property('background-image')
                    key = re.search(r'url\("(.*?)"\)', key_container)\
                        .group(1)
                    try:
                        destination_urls = [
                            element.find_element_by_tag_name(
                                'a').get_attribute('href')
                        ]
                    except NoSuchElementException:
                        destination_urls = []
                    banners.append({
                        'url': url,
                        'picture': picture,
                        'destination_urls': destination_urls,
                        'key': key,
                        'position': index + 1,
                        'section': section,
                        'subsection': subsection,
                        'type': subsection_type
                    })
        elif subsection_type == bs.SUBSECTION_TYPE_CATEGORY_PAGE:
            # STATIC BANNER
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            banner = soup.find('a', {'data-type': 'huincha'})
            if banner:
                picture_url = banner.find('img')['src']
                destination_urls = [
                    'https://www.abcdin.cl' + banner['href']
                ]
                banners.append({
                    'url': url,
                    'picture_url': picture_url,
                    'destination_urls': destination_urls,
                    'key': picture_url,
                    'position': 1,
                    'section': section,
                    'subsection': subsection,
                    'type': subsection_type
                })
            # CAROUSEL
            elements = soup.findAll('div', 'homeHero')
            for index, element in enumerate(elements):
                picture_url = element.find('img')['src']
                # Renamed from url_suffix: avoid shadowing loop variable
                link_tag = element.find('a')
                if not link_tag:
                    destination_urls = []
                else:
                    destination_urls = [
                        'https://www.abcdin.cl' + link_tag['href']
                    ]
                banners.append({
                    'url': url,
                    'picture_url': picture_url,
                    'destination_urls': destination_urls,
                    'key': picture_url,
                    'position': index + 1,
                    'section': section,
                    'subsection': subsection,
                    'type': subsection_type
                })
        elif subsection_type == bs.SUBSECTION_TYPE_MOSAIC:
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            banner = soup.find('a', {'data-type': 'huincha'})
            if not banner:
                # Fallback: first link inside the hero carousel
                banner = soup.find('div', 'homeHero')
                if banner:
                    banner = banner.find('a')
            if banner:
                picture_url = banner.find('img')['src']
                destination_urls = [
                    'https://www.abcdin.cl' + banner['href']
                ]
                banners.append({
                    'url': url,
                    'picture_url': picture_url,
                    'destination_urls': destination_urls,
                    'key': picture_url,
                    'position': 1,
                    'section': section,
                    'subsection': subsection,
                    'type': subsection_type
                })
    return banners
def get_owl_banners(cls, url, section, subsection, subsection_type,
                    extra_args):
    """Screenshot each slide of the owl-carousel found at ``url`` and
    return the banners with their destination links.

    Waits (with retries) for the carousel paging controls to render
    before clicking through the slides.

    Fix: removed a duplicated ``list(set(destination_urls))`` statement.
    """
    extra_args = extra_args or {}
    proxy = extra_args.get('proxy', None)
    with HeadlessChrome(images_enabled=True, timeout=240, proxy=proxy,
                        headless=True) as driver:
        banners = []
        driver.set_window_size(1920, 1080)
        # Open the page first so that the CF cookies can be loaded in
        # this domain
        # Then set the sesion cookies
        if 'cf_clearance' in extra_args:
            driver.get(url)
            load_driver_cf_cookies(driver, extra_args, '.ripley.cl')
            driver.get(url)
        else:
            driver.get(url)
        driver.execute_script("scrollTo(0, 0);")
        pictures = []
        banner_container = driver \
            .find_element_by_class_name('owl-carousel')
        # Poll for the carousel pager; the page renders it lazily
        retries = 10
        for i in range(retries):
            print('Retry {} for owl banner'.format(i + 1))
            time.sleep(10)
            controls = banner_container \
                .find_elements_by_class_name('owl-page')
            if controls:
                break
        else:
            raise Exception('Timeout waiting for owl banners: ' + url)
        # Click through each slide and screenshot the carousel
        for control in controls:
            control.click()
            time.sleep(1)
            pictures.append(
                banner_container.screenshot_as_base64)
        images = banner_container.find_elements_by_class_name('owl-item')
        assert len(images) == len(pictures)
        for index, image in enumerate(images):
            try:
                # Preferred: slide image is a CSS background on a <span>
                image_style = image.find_element_by_tag_name(
                    'span').get_attribute('style')
                key = re.search(r'url\((.*?)\)', image_style) \
                    .group(1)
            except NoSuchElementException:
                # Fallback: use the <source> srcset as the key
                key = image.find_element_by_tag_name(
                    'source').get_attribute('srcset')
            destinations = image.find_elements_by_tag_name('a')
            destination_urls = [a.get_attribute('href')
                                for a in destinations]
            destination_urls = list(set(destination_urls))
            banners.append({
                'url': url,
                'picture': pictures[index],
                'destination_urls': destination_urls,
                'key': key,
                'position': index + 1,
                'section': section,
                'subsection': subsection,
                'type': subsection_type
            })
        return banners
def preflight(cls, extra_args=None):
    """Obtain the Cloudflare bypass cookies for simple.ripley.cl.

    Solves the site's hCaptcha through the anti-captcha.com service
    (using the proxy credentials in ``extra_args``), injects the
    solution into the page's form, submits it, and returns a dict with
    the resulting ``cf_clearance`` / ``__cfduid`` cookies and the proxy.

    Returns an empty dict when no proxy credentials are configured.
    Raises when ``extra_args`` is missing or the captcha is not solved
    within ~10 minutes.
    """
    # Obtain Cloudflare bypass cookie
    if extra_args is None:
        raise Exception("extra_args should contain the parameters to "
                        "obtain the Cloudflare session cookie or the "
                        "'debug' flag if testing locally")
    if 'PROXY_USERNAME' not in extra_args:
        # No proxy configured: nothing to preflight
        return {}
    proxy = 'http://{}:{}@{}:{}'.format(
        extra_args['PROXY_USERNAME'],
        extra_args['PROXY_PASSWORD'],
        extra_args['PROXY_IP'],
        extra_args['PROXY_PORT'],
    )
    with HeadlessChrome(images_enabled=False, proxy=proxy,
                        headless=True) as driver:
        driver.get('https://simple.ripley.cl')
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # The hCaptcha site key is embedded in a script tag attribute
        hcaptcha_script_tag = soup.find('script', {'data-type': 'normal'})
        website_key = hcaptcha_script_tag['data-sitekey']
        # Anti captcha request
        request_body = {
            "clientKey": extra_args['KEY'],
            "task": {
                "type": "HCaptchaTask",
                "websiteURL": "https://simple.ripley.cl/",
                "websiteKey": website_key,
                "proxyType": "http",
                "proxyAddress": extra_args['PROXY_IP'],
                "proxyPort": extra_args['PROXY_PORT'],
                "proxyLogin": extra_args['PROXY_USERNAME'],
                "proxyPassword": extra_args['PROXY_PASSWORD'],
                "userAgent": CF_REQUEST_HEADERS['User-Agent']
            }
        }
        print('Sending anti-captcha task')
        print(json.dumps(request_body, indent=2))
        anticaptcha_session = requests.Session()
        anticaptcha_session.headers['Content-Type'] = 'application/json'
        response = json.loads(anticaptcha_session.post(
            'http://api.anti-captcha.com/createTask',
            json=request_body).text)
        print('Anti-captcha task request response')
        print(json.dumps(response, indent=2))
        # errorId 0 means the task was accepted by the service
        assert response['errorId'] == 0
        task_id = response['taskId']
        print('TaskId', task_id)
        # Wait until the task is ready...
        get_task_result_params = {
            "clientKey": extra_args['KEY'],
            "taskId": task_id
        }
        print(
            'Querying for anti-captcha response (wait 10 secs per retry)')
        print(json.dumps(get_task_result_params, indent=4))
        retries = 1
        hcaptcha_response = None
        # Poll every 10 seconds, up to 60 attempts (~10 minutes)
        while not hcaptcha_response:
            if retries > 60:
                raise Exception('Failed to get a token in time')
            print('Retry #{}'.format(retries))
            time.sleep(10)
            res = json.loads(anticaptcha_session.post(
                'https://api.anti-captcha.com/getTaskResult',
                json=get_task_result_params).text)
            assert res['errorId'] == 0, res
            assert res['status'] in ['processing', 'ready'], res
            if res['status'] == 'ready':
                print('Solution found')
                hcaptcha_response = res['solution']['gRecaptchaResponse']
                break
            retries += 1
        print(hcaptcha_response)
        # Replace the page's captcha response inputs with the solved
        # token, then submit the challenge form
        for field in ['g-recaptcha-response', 'h-captcha-response']:
            driver.execute_script("document.querySelector('[name=\""
                                  "{0}\"]').remove(); "
                                  "var foo = document.createElement('"
                                  "input'); foo.setAttribute('name', "
                                  "'{0}'); "
                                  "foo.setAttribute('value','{1}'); "
                                  "document.getElementsByTagName('form')"
                                  "[0].appendChild(foo);".format(
                                      field, hcaptcha_response))
        driver.execute_script("document.getElementsByTagName('form')"
                              "[0].submit()")
        # The successful submit sets the Cloudflare session cookies
        d = {
            "proxy": proxy,
            "cf_clearance": driver.get_cookie('cf_clearance')['value'],
            "__cfduid": driver.get_cookie('__cfduid')['value']
        }
        return d
def get_owl_banners(cls, url, section, subsection, type, extra_args):
    """Screenshot each slide of the owl-carousel at ``url`` (with CF
    session cookies applied) and return the banners with their
    destination links.

    Note: the ``type`` parameter name shadows the builtin but is kept
    for interface compatibility with existing callers.

    Fix: removed a duplicated ``list(set(destination_urls))`` statement.
    """
    with HeadlessChrome(images_enabled=True, timeout=60,
                        proxy=extra_args['proxy']) as driver:
        print(url)
        banners = []
        driver.set_window_size(1920, 1080)
        driver.set_page_load_timeout(240)
        # Open the page first so that the CF cookies can be loaded in
        # this domain
        driver.get(url)
        # Then set the sesion cookies
        load_driver_cf_cookies(driver, extra_args, '.ripley.cl')
        # Then re-open the page
        driver.get(url)
        driver.execute_script("scrollTo(0, 0);")
        pictures = []
        banner_container = driver \
            .find_element_by_class_name('owl-carousel')
        controls = banner_container \
            .find_elements_by_class_name('owl-page')
        # Click through each slide and screenshot the carousel
        for control in controls:
            control.click()
            time.sleep(1)
            pictures.append(
                banner_container.screenshot_as_base64)
        images = banner_container.find_elements_by_class_name('owl-item')
        assert len(images) == len(pictures)
        for index, image in enumerate(images):
            try:
                # Preferred: slide image is a CSS background on a <span>
                image_style = image.find_element_by_tag_name(
                    'span').get_attribute('style')
                key = re.search(r'url\((.*?)\)', image_style) \
                    .group(1)
            except NoSuchElementException:
                # Fallback: use the <source> srcset as the key
                key = image.find_element_by_tag_name(
                    'source').get_attribute('srcset')
            destinations = image.find_elements_by_tag_name('a')
            destination_urls = [a.get_attribute('href')
                                for a in destinations]
            destination_urls = list(set(destination_urls))
            banners.append({
                'url': url,
                'picture': pictures[index],
                'destination_urls': destination_urls,
                'key': key,
                'position': index + 1,
                'section': section,
                'subsection': subsection,
                'type': type
            })
        return banners
def banners(cls, extra_args=None):
    """Scrape marketing banners from La Polar.

    The home page carousel is captured as screenshots via Selenium;
    category pages (possibly embedded in an iframe) are parsed with
    requests + BeautifulSoup. Only the home section is currently
    configured in ``sections_data``.
    """
    sections_data = [[
        bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, 'https://www.lapolar.cl/'
    ]]
    session = session_with_proxy(extra_args)
    banners = []
    for section, subsection, subsection_type, url in sections_data:
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        if subsection_type == bs.SUBSECTION_TYPE_HOME:
            with HeadlessChrome(images_enabled=True) as driver:
                driver.set_window_size(1920, 1080)
                driver.get(url)
                pictures = []
                banner_container = driver.find_element_by_class_name(
                    'slick-list')
                controls = driver\
                    .find_element_by_class_name('slick-dots')\
                    .find_elements_by_tag_name('li')
                # Click each pager dot and screenshot the visible slide
                for control in controls:
                    control.click()
                    time.sleep(2)
                    pictures.append(banner_container.screenshot_as_base64)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                images = soup.find('div', 'slick-track')\
                    .findAll('div', 'slick-slide')
                # Drop the cloned slides slick adds for infinite looping
                images = [
                    a for a in images if 'slick-cloned' not in a['class']
                ]
                assert len(images) == len(pictures)
                for index, image in enumerate(images):
                    key = None
                    key_options = image.findAll('img', 'responsive_prod')
                    destination_urls = [
                        d['href'] for d in image.findAll('a')
                    ]
                    destination_urls = list(set(destination_urls))
                    # First non-logo image is used as the banner key;
                    # fall back to the first destination URL
                    for key_option in key_options:
                        if 'llamado_logo_img' in key_option['class']:
                            continue
                        key = key_option['src']
                        break
                    if not key:
                        key = destination_urls[0]
                    banners.append({
                        'url': url,
                        'picture': pictures[index],
                        'destination_urls': destination_urls,
                        'key': key,
                        'position': index + 1,
                        'section': section,
                        'subsection': subsection,
                        'type': subsection_type
                    })
        elif subsection_type == bs.SUBSECTION_TYPE_CATEGORY_PAGE:
            # Category content may live inside an iframe; if so, fetch
            # and parse the iframe document instead
            iframe = soup.find('iframe', 'full')
            if iframe:
                content = session.get(iframe['src'])
                soup = BeautifulSoup(content.text, 'html.parser')
                picture_base_url = 'https://www.lapolar.cl{}'
            else:
                picture_base_url = url + '{}'
            images = soup.findAll('div', 'swiper-slide')
            if not images:
                images = soup.findAll('div', 'item')
            for index, image, in enumerate(images):
                picture = image.find('picture')
                if not picture:
                    # Plain <img> slide
                    picture_url = picture_base_url.format(
                        image.find('img')['src'])
                else:
                    # <picture> slide: take the last (largest) source
                    picture_url = picture_base_url.format(
                        image.findAll('source')[-1]['srcset'])
                destination_urls = [image.find('a')['href']]
                banners.append({
                    'url': url,
                    'picture_url': picture_url,
                    'destination_urls': destination_urls,
                    'key': picture_url,
                    'position': index + 1,
                    'section': section,
                    'subsection': subsection,
                    'type': subsection_type
                })
    return banners
def banners(cls, extra_args=None):
    """Scrape marketing banners from Hites.

    Home banners are captured as screenshots of a Selenium-driven slick
    carousel; mosaic (listing) pages are parsed with requests +
    BeautifulSoup, falling back to a Selenium screenshot for banners
    without a <picture> tag.
    """
    base_url = 'https://www.hites.com/{}'
    sections_data = [
        [bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, ''],
        [bs.TELEVISIONS, 'TV Video', bs.SUBSECTION_TYPE_MOSAIC,
         'tecnologia/tv-video'],
        [bs.TELEVISIONS, 'Todos los Led', bs.SUBSECTION_TYPE_MOSAIC,
         'tecnologia/tv-video/todos-los-led'],
        [bs.TELEVISIONS, 'Smart TV Hasta 50',
         bs.SUBSECTION_TYPE_MOSAIC,
         'tecnologia/tv-video/smart-tv-hasta-50'],
        [bs.TELEVISIONS, 'Smart TV Entre 55 y 60',
         bs.SUBSECTION_TYPE_MOSAIC,
         'tecnologia/tv-video/smart-tv-entre-55-y-60'],
        [bs.TELEVISIONS, 'Smart TV Desde 65',
         bs.SUBSECTION_TYPE_MOSAIC,
         'tecnologia/tv-video/smart-tv-desde-65'],
        [bs.CELLS, 'Smartphone', bs.SUBSECTION_TYPE_MOSAIC,
         'celulares/smartphone'],
        [bs.CELLS, 'Smartphone-Smartphone', bs.SUBSECTION_TYPE_MOSAIC,
         'celulares/smartphone/smartphone'],
        [bs.CELLS, 'Smartphone Liberados', bs.SUBSECTION_TYPE_MOSAIC,
         'celulares/smartphone/smartphone-liberados'],
        [bs.REFRIGERATION, 'Refrigeradores', bs.SUBSECTION_TYPE_MOSAIC,
         'electro-hogar/refrigeradores'],
        [bs.REFRIGERATION, 'No Frost', bs.SUBSECTION_TYPE_MOSAIC,
         'electro-hogar/refrigeradores/no-frost'],
        [bs.REFRIGERATION, 'Side by Side', bs.SUBSECTION_TYPE_MOSAIC,
         'electro-hogar/refrigeradores/side-by-side'],
        [bs.WASHING_MACHINES, 'Lavado y Secado',
         bs.SUBSECTION_TYPE_MOSAIC,
         'electro-hogar/lavado-y-secado'],
        [bs.WASHING_MACHINES, 'Lavadoras', bs.SUBSECTION_TYPE_MOSAIC,
         'electro-hogar/lavado-y-secado/lavadoras'],
        [bs.WASHING_MACHINES, 'Lavadoras-Secadoras',
         bs.SUBSECTION_TYPE_MOSAIC,
         'electro-hogar/lavado-y-secado/lavadoras-secadoras'],
        # [bs.WASHING_MACHINES, 'Secadoras', bs.SUBSECTION_TYPE_MOSAIC,
        #  'electro-hogar/lavado-y-secado/secadoras'],
        [bs.AUDIO, 'Audio', bs.SUBSECTION_TYPE_MOSAIC,
         'tecnologia/audio'],
        # [bs.AUDIO, 'Minicomponentes', bs.SUBSECTION_TYPE_MOSAIC,
        #  'tecnologia/audio/minicomponentes'],
        # [bs.AUDIO, 'Soundbar y Home Theater', bs.SUBSECTION_TYPE_MOSAIC,
        #  'tecnologia/audio/soundbar-y-home-theater']
    ]
    session = session_with_proxy(extra_args)
    banners = []
    for section, subsection, subsection_type, url_suffix in sections_data:
        url = base_url.format(url_suffix)
        print(url)
        if subsection_type == bs.SUBSECTION_TYPE_HOME:
            with HeadlessChrome(images_enabled=True,
                                timeout=120) as driver:
                driver.set_window_size(1920, 1080)
                driver.get(url)
                pictures = []
                banner_container = driver\
                    .find_element_by_class_name('slick-list')
                # banner_container = driver \
                #     .find_element_by_class_name('owl-stage-outer')
                controls = driver.find_element_by_class_name(
                    'slick-dots')\
                    .find_elements_by_tag_name('li')
                # controls = driver.find_elements_by_class_name('owl-dot')
                # Click each pager dot and screenshot the visible slide
                for control in controls:
                    control.click()
                    time.sleep(1)
                    pictures.append(
                        banner_container.screenshot_as_base64)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                images = soup.find('div', 'slick-track')\
                    .findAll('div', 'slick-slide')
                # images = soup.find('div', 'owl-stage') \
                #     .findAll('div', 'owl-item')
                # Drop the cloned slides slick adds for infinite looping
                images = [a for a in images
                          if 'slick-cloned' not in a['class']]
                # images = [a for a in images if
                #           'cloned' not in a['class']]
                assert len(images) == len(pictures)
                for index, image in enumerate(images):
                    # The banner image lives in one of several container
                    # variants depending on the campaign template; try
                    # each known class in order until one matches.
                    product_box = image.find('div', 'boxproductos')
                    if not product_box:
                        product_box = image.find('div', 'box-producto')
                    if not product_box:
                        product_box = image.find('div', 'box-foto')
                    if not product_box:
                        product_box = image.find(
                            'div', 'slide-new__products')
                    if not product_box:
                        product_box = image.find('div', 'images_llamados')
                    if not product_box:
                        product_box = image.find(
                            'div', 'products-item__img')
                    if not product_box:
                        product_box = image.find('a', 'boxproducto')
                    if not product_box:
                        product_box = image
                    # If the chosen container has no image at all, try
                    # two last-resort containers
                    if not (product_box.find('source') or
                            product_box.find('img')):
                        product_box = image.find('div', 'img_boxproducto')
                        if not product_box:
                            product_box = image.find('div', 'logocampana')
                    key_container = product_box.find('source')
                    if key_container:
                        key = key_container['srcset']
                    else:
                        key = product_box.find('img')['src']
                    destinations = [d for d in image.findAll('a')]
                    destination_urls = []
                    for destination in destinations:
                        if destination.get('href'):
                            destination_urls.append(destination['href'])
                    destination_urls = list(set(destination_urls))
                    banners.append({
                        'url': url,
                        'picture': pictures[index],
                        'destination_urls': destination_urls,
                        'key': key,
                        'position': index + 1,
                        'section': section,
                        'subsection': subsection,
                        'type': subsection_type
                    })
        elif subsection_type == bs.SUBSECTION_TYPE_MOSAIC:
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            # Top-level "espot" divs inside the first <section> hold
            # the mosaic banners
            banners_container = soup.find('section')\
                .findAll('div', 'espot', recursive=False)
            for index, banner in enumerate(banners_container):
                destination_urls = [d['href']
                                    for d in banner.findAll('a')]
                destination_urls = list(set(destination_urls))
                picture_container = banner.find('picture')
                if picture_container:
                    picture_source = picture_container.find('source')
                    if not picture_source:
                        continue
                    picture_url = picture_source['srcset']
                    banners.append({
                        'url': url,
                        'picture_url': picture_url,
                        'destination_urls': destination_urls,
                        'key': picture_url,
                        'position': index + 1,
                        'section': section,
                        'subsection': subsection,
                        'type': subsection_type
                    })
                else:
                    # No <picture>: screenshot the matching live espot.
                    # NOTE(review): the key comes from the parsed
                    # `banner` while the screenshot comes from the live
                    # `s_banner` at the same index — this assumes both
                    # lists align; confirm against the live page.
                    with HeadlessChrome(images_enabled=True, timeout=120) \
                            as driver:
                        driver.set_window_size(1920, 1080)
                        driver.get(url)
                        s_banner = driver.find_elements_by_css_selector(
                            '#main>.espot')[index]
                        key_container = banner.find('img')
                        if not key_container or \
                                s_banner.size['height'] == 0:
                            continue
                        key = key_container['src']
                        picture = s_banner.screenshot_as_base64
                        banners.append({
                            'url': url,
                            'picture': picture,
                            'destination_urls': destination_urls,
                            'key': key,
                            'position': index + 1,
                            'section': section,
                            'subsection': subsection,
                            'type': subsection_type
                        })
    return banners
def _session_driver(cls, extra_args):
    """Log into Ingram Micro's Okta form and return the authenticated
    Selenium driver.

    Expects ``extra_args`` to contain ``username`` and ``password``.
    """
    time.sleep(1)
    # Browser initialization
    driver = HeadlessChrome(headless=True).driver
    driver.get('https://cl.ingrammicro.com/_layouts/'
               'CommerceServer/IM/Login.aspx')
    # Poll up to four times (1s apart) for the Okta form to render
    for _ in range(4):
        if driver.find_elements_by_id('okta-signin-username'):
            break
        time.sleep(1)
    driver.find_element_by_id('okta-signin-username').send_keys(
        extra_args['username'])
    driver.find_element_by_id('okta-signin-password').send_keys(
        extra_args['password'])
    driver.find_element_by_id('okta-signin-submit').click()
    # Give the post-login redirect time to settle
    time.sleep(10)
    return driver