Example #1
    def discover_urls_for_category(cls, category, extra_args=None):
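        # Collect product URLs from the exito.com listings that map to the
        # requested category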
        base_url = 'https://www.exito.com'

        url_extensions = [
            ['Tecnologia-Celulares_y_accesorios-Accesorios_para_celular-'
             'Almacenamiento/_/N-2fzn', 'MemoryCard'],
            ['Tecnologia-Computadores-_impresoras_y_tablets-'
             'Accesorios_de_computador-Memorias/_/N-2gbg',
             'ExternalStorageDrive'],
        ]

        product_urls = []

        with HeadlessChrome() as driver:
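            # Visit the home page first, presumably so session cookies are
            # set before loading the catalog pages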
            driver.get(base_url)

            for url_extension, local_category in url_extensions:
                if local_category != category:
                    continue

                catalog_url = base_url + '/browse/' + url_extension + \
                    '?No=0&Nrpp=80'
                print(catalog_url)
                driver.get(catalog_url)
                base_soup = BeautifulSoup(driver.page_source, 'html.parser')

                link_containers = base_soup.findAll('div', 'product')

                for link_container in link_containers:
                    url = base_url + link_container.find('a')['href']
                    url = url.replace('?nocity', '')
                    product_urls.append(url)

            return product_urls
Example #2
    def products_for_url(cls, url, category=None, extra_args=None):
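        # Parse a single exito.com product page into a Product entry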
        print(url)
        with HeadlessChrome() as driver:
            driver.get('https://www.exito.com/')
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html5lib')

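            # Strip the 'REF:' label; the slice presumably keeps the value
            # within a storage field limit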
            part_number = soup.find(
                'div', 'reference').text.replace('REF:', '').strip()[:49]

            name = soup.find('h1', 'name').text.strip()
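            # The container id embeds the SKU after a three-character prefix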
            sku = soup.find('div', 'product')['id'][3:]

            description = ''
            for panel in soup.findAll('div', 'tabs-pdp')[:-1]:
                description += html_to_markdown(str(panel)) + '\n\n'

            picture_urls = [tag['data-src'] for tag in soup.find(
                'div', {'id': 'slide-image-pdp'}).findAll('img')]

            price_container = soup.find('div', 'col-data').find(
                'span', 'money')
            if price_container:
                price = Decimal(price_container.text.replace(',', ''))
                stock = -1
            else:
                stock = 0
                price = Decimal(0)

            p = Product(
                name,
                cls.__name__,
                category,
                url,
                url,
                sku,
                stock,
                price,
                price,
                'COP',
                sku=sku,
                part_number=part_number,
                description=description,
                picture_urls=picture_urls
            )

            return [p]
Example #3
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename='discover_urls_for_categories.log',
                        filemode='w')

    parser = argparse.ArgumentParser(
        description='Generates the Cloudflare cookies for certain stores')

    parser.add_argument('proxy', type=str, help='Proxy to use')

    args = parser.parse_args()
    proxy = args.proxy

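    # The window is intentionally not headless so the user can solve the
    # CAPTCHA by hand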
    with HeadlessChrome(proxy=proxy, headless=False, images_enabled=True) \
            as driver:
        driver.get('https://simple.ripley.cl')
        input('Please complete the CAPTCHA, then press ENTER')

        cfduid_cookie = None
        cf_clearance_cookie = None
        cf_clearance_expiration = None

        for cookie in driver.get_cookies():
            if cookie['name'] == '__cfduid':
                cfduid_cookie = cookie['value']
                print('__cfduid', cookie)
            if cookie['name'] == 'cf_clearance':
                cf_clearance_cookie = cookie['value']
                cf_clearance_expiration = cookie['expiry']

        assert cfduid_cookie and cf_clearance_cookie

        print('Use the following parameters as "extra args" for scraping')
        print(
            'Cookie expires on:',
            time.strftime('%Y-%m-%d %H:%M:%S',
                          time.localtime(cf_clearance_expiration)))

        d = {
            "proxy": proxy,
            "cf_clearance": cf_clearance_cookie,
            "__cfduid": cfduid_cookie
        }
        print(json.dumps(d))
Example #4
    def discover_urls_for_category(cls, category, extra_args=None):
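        # Collect product URLs from the store.jumbo.cl listings that map to
        # the requested category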
        category_filters = [
            ('electro-y-tecnologia/electronica/televisores', 'Television'),
            ('electro-y-tecnologia/electrohogar/lavadoras-y-secadoras',
             'WashingMachine'),
            ('electro-y-tecnologia/tecnologia/smartwatch', 'Wearable'),
            # ('electro-y-tecnologia/tecnologia/celulares', 'Cell'),
            # ('electro-y-tecnologia/electrohogar/refrigeradores',
            #  'Refrigerator'),
            # ('electro-y-tecnologia/electrohogar/cocina-y-microondas',
            # 'Oven'),
            # ('electro-y-tecnologia/electrodomesticos/electro-cocina',
            # 'Oven'),
            # ('electro-y-tecnologia/electrodomesticos/aspiradoras',
            #  'VacuumCleaner'),
            # ('electro-y-tecnologia/electronica/parlantes', 'StereoSystem')
        ]

        product_urls = []

        with HeadlessChrome() as driver:
            for url_extension, local_category in category_filters:
                if local_category != category:
                    continue

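                # PS and sc presumably control the page size and sales
                # channel of the listing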
                url = 'https://store.jumbo.cl/{}?PS=100&sc=20'.format(
                    url_extension)
                print(url)
                driver.get(url)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                product_containers = soup.findAll('div', 'box-product')

                if not product_containers:
                    raise Exception('Empty section: ' + url)

                for container in product_containers:
                    product_url = container.find('a')['href']
                    product_urls.append(product_url + '?sc=20')

        return product_urls
Example #5
    def banners(cls, extra_args=None):
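        # Collect banners from falabella.com: the home carousel, category
        # hero carousels, and single-banner mosaic ("huincha") pages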
        base_url = 'https://www.falabella.com/falabella-cl/{}'

        sections_data = [
            [bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, ''],
            # # CATEGORY PAGES # #
            # Currently displaying a smart picker
            [
                bs.REFRIGERATION, 'Electrohogar-Refrigeradores',
                bs.SUBSECTION_TYPE_CATEGORY_PAGE,
                'category/cat3205/Refrigeradores'
            ],
            [
                bs.WASHING_MACHINES, 'Electrohogar-Lavado',
                bs.SUBSECTION_TYPE_CATEGORY_PAGE, 'category/cat3136/Lavado'
            ],
            [
                bs.TELEVISIONS, 'TV', bs.SUBSECTION_TYPE_CATEGORY_PAGE,
                'category/cat1012/TV'
            ],
            [
                bs.AUDIO, 'Audio', bs.SUBSECTION_TYPE_CATEGORY_PAGE,
                'category/cat2005/Audio'
            ],
            [
                bs.CELLS, 'Telefonía-Celulares y Teléfonos',
                bs.SUBSECTION_TYPE_CATEGORY_PAGE,
                'category/cat2018/Celulares-y-Telefonos'
            ],

            # # MOSAICS ##
            [
                bs.LINEA_BLANCA_FALABELLA, 'Electro y Tecnología-Línea Blanca',
                bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat7090035/Linea-Blanca?isPLP=1'
            ],
            [
                bs.REFRIGERATION, 'Refrigeradores-No Frost',
                bs.SUBSECTION_TYPE_MOSAIC, 'category/cat4074/No-Frost'
            ],
            [
                bs.REFRIGERATION, 'Refrigeradores-Side by Side',
                bs.SUBSECTION_TYPE_MOSAIC, 'category/cat4091/Side-by-Side'
            ],
            # [bs.WASHING_MACHINES, 'Lavadoras', bs.SUBSECTION_TYPE_MOSAIC,
            #  'category/cat3136/Lavadoras '],
            [
                bs.WASHING_MACHINES, 'Lavadoras-Lavadoras',
                bs.SUBSECTION_TYPE_MOSAIC, 'category/cat4060/Lavadoras'
            ],
            [
                bs.WASHING_MACHINES, 'Lavadoras-Lavadoras-Secadoras',
                bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat1700002/Lavadoras-Secadoras'
            ],
            [
                bs.WASHING_MACHINES, 'Lavadoras-Secadoras',
                bs.SUBSECTION_TYPE_MOSAIC, 'category/cat4088/Secadoras'
            ],
            [
                bs.WASHING_MACHINES, 'Lavadoras-Lavadoras Doble Carga',
                bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat11400002/Lavadoras-Doble-Carga'
            ],
            [
                bs.TELEVISIONS, 'Tecnología-TV', bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat1012/TV?isPLP=1'
            ],
            [
                bs.TELEVISIONS, 'Televisores LED', bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat7190148/Televisores-LED'
            ],
            [
                bs.TELEVISIONS, 'LEDs menores a 50 pulgadas',
                bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat11161614/LEDs-menores-a-50-pulgadas'
            ],
            [
                bs.TELEVISIONS, 'LEDs entre 50 - 55 pulgadas',
                bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat11161675/LEDs-entre-50---55-pulgadas'
            ],
            [
                bs.TELEVISIONS, 'LEDs sobre 55 pulgadas',
                bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat11161679/LEDs-sobre-55-pulgadas'
            ],

            # [bs.TELEVISIONS, 'TV-LED', bs.SUBSECTION_TYPE_MOSAIC,
            #  'category/cat2850014/LED'],
            # [bs.TELEVISIONS, 'TV-Smart TV', bs.SUBSECTION_TYPE_MOSAIC,
            #  'category/cat3040054/Smart-TV'],
            # [bs.TELEVISIONS, 'TV-4K UHD', bs.SUBSECTION_TYPE_MOSAIC,
            #  'category/cat3990038/4K-UHD'],
            # [bs.TELEVISIONS, 'TV-Televisores OLED',
            # bs.SUBSECTION_TYPE_MOSAIC,
            #  'category/cat2850016/Televisores-OLED'],
            # [bs.TELEVISIONS, 'TV-Pulgadas Altas',
            #  bs.SUBSECTION_TYPE_MOSAIC,
            #  'category/cat12910024/Televisores-LED-Desde-65"'],
            [
                bs.AUDIO, 'Audio-Soundbar y Home Theater',
                bs.SUBSECTION_TYPE_MOSAIC, 'category/cat2045/Home-Theater'
            ],
            [
                bs.AUDIO, 'Home Theater', bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat3050040/Home-Theater'
            ],
            [
                bs.AUDIO, 'Soundbar', bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat1700004/Soundbar'
            ],
            [
                bs.AUDIO, 'Minicomponente', bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat70018/Minicomponente'
            ],
            [
                bs.AUDIO, 'Audio-Equipos de Música y Karaokes',
                bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat3091/?mkid=CA_P2_MIO1_024794'
            ],
            [
                bs.AUDIO, 'Audio-Hi-Fi', bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat3203/Hi-Fi'
            ],
            [
                bs.AUDIO, 'Audio', bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat2005/Audio?isPLP=1'
            ],
            [
                bs.CELLS, 'Smartphones', bs.SUBSECTION_TYPE_MOSAIC,
                'category/cat720161/Smartphones'
            ],
            [
                bs.CELLS, 'Electro y Tecnología-Teléfonos',
                bs.SUBSECTION_TYPE_MOSAIC, 'category/cat2018/Telefonos?isPLP=1'
            ],
        ]

        banners = []

        extra_args = extra_args or {}
        proxy = extra_args.get('proxy')

        for section, subsection, subsection_type, url_suffix in sections_data:
            url = base_url.format(url_suffix)

            if subsection_type == bs.SUBSECTION_TYPE_HOME:
                with HeadlessChrome(images_enabled=True,
                                    proxy=proxy) as driver:
                    driver.set_window_size(1920, 1080)
                    driver.get(url)

                    images = driver\
                        .find_element_by_class_name('swiper-container')\
                        .find_elements_by_class_name('dy_unit')[1:-1]

                    for index, slide in enumerate(images, start=1):
                        picture_array = slide.find_element_by_tag_name(
                            'picture').find_elements_by_tag_name('source')
                        destination_urls = [
                            d.get_property('href')
                            for d in slide.find_elements_by_tag_name('a')
                        ]
                        destination_urls = list(set(destination_urls))
                        for picture in picture_array:
                            picture_url = picture.get_property('srcset').split(
                                ' ')[0]

                            if 'https://www.falabella.com' not in picture_url:
                                picture_url = 'https://www.falabella.com' \
                                              '{}'.format(picture_url)

                            if picture_url:
                                banners.append({
                                    'url': url,
                                    'picture_url': picture_url,
                                    'destination_urls': destination_urls,
                                    'key': picture_url,
                                    'position': index,
                                    'section': section,
                                    'subsection': subsection,
                                    'type': subsection_type
                                })
                                break
                        else:
                            raise Exception(
                                'No valid banners found for {} in position '
                                '{}'.format(url, index))
            elif subsection_type == bs.SUBSECTION_TYPE_CATEGORY_PAGE:
                with HeadlessChrome(images_enabled=True,
                                    proxy=proxy,
                                    timeout=99) as driver:
                    driver.set_window_size(1920, 1080)
                    driver.get(url)

                    pictures = []

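                    # Force the carousel pips to display, click through each
                    # one, and keep a cropped screenshot of the hero area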
                    try:
                        pips_container = driver.find_element_by_class_name(
                            'fb-hero-carousel__pips')

                        driver.execute_script(
                            "arguments[0].setAttribute('style', "
                            "'display:block !important;');", pips_container)

                        elements = driver.find_element_by_class_name(
                            'fb-hero-carousel__pips')\
                            .find_elements_by_class_name(
                            'fb-hero-carousel__pips__pip')

                        for element in elements:
                            element.click()
                            time.sleep(2)
                            image = Image.open(
                                BytesIO(driver.get_screenshot_as_png()))
                            image = image.crop((0, 187, 1920, 769))
                            buffered = BytesIO()
                            image.save(buffered, format='PNG')
                            pictures.append(
                                base64.b64encode(buffered.getvalue()))
                    except NoSuchElementException:
                        image = Image.open(
                            BytesIO(driver.get_screenshot_as_png()))
                        image = image.crop((0, 187, 1920, 769))
                        buffered = BytesIO()
                        image.save(buffered, format='PNG')
                        pictures.append(base64.b64encode(buffered.getvalue()))

                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    images_div = soup.findAll('div', 'fb-hero-carousel-slide')
                    images_article = soup.findAll('article',
                                                  'fb-hero-carousel-slide')
                    images_module = soup.findAll('div',
                                                 'hero fb-module-wrapper')

                    images = images_div + images_article + images_module

                    assert len(images) == len(pictures)

                    for index, image in enumerate(images):
                        picture_array = image.findAll(
                            'picture')[-1].findAll('source')
                        destination_urls = [
                            d['href'] for d in image.findAll('a')
                        ]
                        destination_urls = list(set(destination_urls))

                        for picture in picture_array:
                            key = picture['srcset'].split(' ')[0]

                            if 'https' not in key:
                                key = 'https://www.falabella.com' + key

                            if 'webp' not in key:
                                banners.append({
                                    'url': url,
                                    'picture': pictures[index],
                                    'destination_urls': destination_urls,
                                    'key': key,
                                    'position': index + 1,
                                    'section': section,
                                    'subsection': subsection,
                                    'type': subsection_type
                                })
                                break
                        else:
                            raise Exception(
                                'No valid banners found for {} in position '
                                '{}'.format(url, index + 1))
            elif subsection_type == bs.SUBSECTION_TYPE_MOSAIC:
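                # Mosaic banners are fetched with plain requests and a curl
                # user-agent rather than a full browser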
                session = session_with_proxy(extra_args)
                session.headers['user-agent'] = 'curl/7.64.1'
                soup = BeautifulSoup(session.get(url).text, 'html.parser')

                banner = soup.find('div', 'fb-huincha-main-wrap')

                if not banner:
                    continue

                image_url = banner.find('source')['srcset']
                dest_url = banner.find('a')['href']

                banners.append({
                    'url': url,
                    'picture_url': image_url,
                    'destination_urls': [dest_url],
                    'key': image_url,
                    'position': 1,
                    'section': section,
                    'subsection': subsection,
                    'type': subsection_type
                })

        return banners
Example #6
    def banners(cls, extra_args=None):
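        # Collect banners from abcdin.cl: the home carousel, category pages
        # and single-banner mosaic pages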
        base_url = 'https://www.abcdin.cl/{}'

        sections_data = [
            [bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, ''],
            [
                bs.LINEA_BLANCA_ABCDIN, 'Línea Blanca AbcDin',
                bs.SUBSECTION_TYPE_CATEGORY_PAGE,
                'tienda/es/abcdin/linea-blanca'
            ],
            [
                bs.TELEVISIONS, 'Electro', bs.SUBSECTION_TYPE_CATEGORY_PAGE,
                'tienda/es/abcdin/tv-audio'
            ],
            [
                bs.CELLS, 'Telefonía', bs.SUBSECTION_TYPE_CATEGORY_PAGE,
                'tienda/es/abcdin/celulares'
            ],
            [
                bs.REFRIGERATION, 'Refrigeradores', bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/linea-blanca/refrigeradores'
            ],
            [
                bs.REFRIGERATION, 'Refrigeradores No Frost',
                bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/linea-blanca/refrigeradores/'
                'refrigeradores-no-frost'
            ],
            [
                bs.REFRIGERATION, 'Refrigeradores Side by Side',
                bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/linea-blanca/refrigeradores/'
                'refrigeradores-side-by-side'
            ],
            [
                bs.WASHING_MACHINES, 'Lavado y Secado',
                bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/linea-blanca/lavado-secado'
            ],
            [
                bs.WASHING_MACHINES, 'Lavadoras', bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/linea-blanca/lavado-secado/lavadoras'
            ],
            [
                bs.WASHING_MACHINES, 'Lavadoras-Secadoras',
                bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/linea-blanca/lavado-secado/'
                'lavadoras-secadoras'
            ],
            [
                bs.TELEVISIONS, 'Electro', bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/tv-audio'
            ],
            [
                bs.TELEVISIONS, 'Televisores LED', bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/tv-audio/televisores-video/televisores-led'
            ],
            [
                bs.AUDIO, 'Audio', bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/tv-audio/audio'
            ],
            [
                bs.AUDIO, 'Minicomponentes', bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/tv-audio/audio/minicomponentes'
            ],
            [
                bs.AUDIO, 'Home Theater', bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/tv-audio/audio/home-theater'
            ],
            [
                bs.CELLS, 'Smartphones', bs.SUBSECTION_TYPE_MOSAIC,
                'tienda/es/abcdin/celulares/smartphones'
            ]
        ]

        session = session_with_proxy(extra_args)
        banners = []

        for section, subsection, subsection_type, url_suffix in sections_data:
            url = base_url.format(url_suffix)
            print(url)

            if subsection_type == bs.SUBSECTION_TYPE_HOME:
                with HeadlessChrome(images_enabled=True) as driver:
                    driver.set_window_size(1920, 1080)
                    driver.get(url)

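                    # Give the home carousel time to render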
                    time.sleep(10)

                    elements = driver.find_elements_by_class_name(
                        'slide-static')

                    controls = driver\
                        .find_element_by_class_name('pageControl')\
                        .find_elements_by_tag_name('a')

                    assert len(elements) == len(controls)

                    for index, element in enumerate(elements):
                        modal_button = driver \
                            .find_elements_by_class_name('close-modal')

                        if modal_button:
                            modal_button[0].click()
                            time.sleep(2)

                        control = controls[index]
                        control.click()
                        time.sleep(2)
                        picture = element.screenshot_as_base64
                        key_container = element\
                            .value_of_css_property('background-image')

                        key = re.search(r'url\("(.*?)"\)', key_container)\
                            .group(1)

                        try:
                            destination_urls = [
                                element.find_element_by_tag_name(
                                    'a').get_attribute('href')
                            ]
                        except NoSuchElementException:
                            destination_urls = []

                        banners.append({
                            'url': url,
                            'picture': picture,
                            'destination_urls': destination_urls,
                            'key': key,
                            'position': index + 1,
                            'section': section,
                            'subsection': subsection,
                            'type': subsection_type
                        })

            elif subsection_type == bs.SUBSECTION_TYPE_CATEGORY_PAGE:
                # STATIC BANNER
                response = session.get(url)
                soup = BeautifulSoup(response.text, 'html.parser')

                banner = soup.find('a', {'data-type': 'huincha'})
                if banner:
                    picture_url = banner.find('img')['src']
                    destination_urls = [
                        'https://www.abcdin.cl' + banner['href']
                    ]
                    banners.append({
                        'url': url,
                        'picture_url': picture_url,
                        'destination_urls': destination_urls,
                        'key': picture_url,
                        'position': 1,
                        'section': section,
                        'subsection': subsection,
                        'type': subsection_type
                    })

                # CAROUSEL
                elements = soup.findAll('div', 'homeHero')
                for index, element in enumerate(elements):
                    picture_url = element.find('img')['src']
                    url_suffix = element.find('a')

                    if not url_suffix:
                        destination_urls = []
                    else:
                        destination_urls = [
                            'https://www.abcdin.cl' + url_suffix['href']
                        ]
                    banners.append({
                        'url': url,
                        'picture_url': picture_url,
                        'destination_urls': destination_urls,
                        'key': picture_url,
                        'position': index + 1,
                        'section': section,
                        'subsection': subsection,
                        'type': subsection_type
                    })

            elif subsection_type == bs.SUBSECTION_TYPE_MOSAIC:
                response = session.get(url)
                soup = BeautifulSoup(response.text, 'html.parser')
                banner = soup.find('a', {'data-type': 'huincha'})
                if not banner:
                    banner = soup.find('div', 'homeHero')
                    if banner:
                        banner = banner.find('a')
                if banner:
                    picture_url = banner.find('img')['src']
                    destination_urls = [
                        'https://www.abcdin.cl' + banner['href']
                    ]
                    banners.append({
                        'url': url,
                        'picture_url': picture_url,
                        'destination_urls': destination_urls,
                        'key': picture_url,
                        'position': 1,
                        'section': section,
                        'subsection': subsection,
                        'type': subsection_type
                    })

        return banners
Example #7
    def get_owl_banners(cls, url, section, subsection, subsection_type,
                        extra_args):
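        # Screenshot each slide of the ripley.cl owl-carousel, waiting for
        # the carousel controls to appear before clicking through them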
        extra_args = extra_args or {}
        proxy = extra_args.get('proxy', None)
        with HeadlessChrome(images_enabled=True, timeout=240,
                            proxy=proxy, headless=True) as driver:
            banners = []
            driver.set_window_size(1920, 1080)
            # Open the page first so that the CF cookies can be loaded in
            # this domain, then set the session cookies
            if 'cf_clearance' in extra_args:
                driver.get(url)
                load_driver_cf_cookies(driver, extra_args, '.ripley.cl')
                driver.get(url)
            else:
                driver.get(url)

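            # Scroll back to the top so the carousel is in view for the
            # screenshots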
            driver.execute_script("scrollTo(0, 0);")

            pictures = []

            banner_container = driver \
                .find_element_by_class_name('owl-carousel')

            retries = 10

            for i in range(retries):
                print('Retry {} for owl banner'.format(i + 1))
                time.sleep(10)
                controls = banner_container \
                    .find_elements_by_class_name('owl-page')
                if controls:
                    break
            else:
                raise Exception('Timeout waiting for owl banners: ' + url)

            for control in controls:
                control.click()
                time.sleep(1)
                pictures.append(
                    banner_container.screenshot_as_base64)

            images = banner_container.find_elements_by_class_name('owl-item')

            assert len(images) == len(pictures)

            for index, image in enumerate(images):
                try:
                    image_style = image.find_element_by_tag_name(
                        'span').get_attribute('style')
                    key = re.search(r'url\((.*?)\)', image_style) \
                        .group(1)
                except NoSuchElementException:
                    key = image.find_element_by_tag_name(
                        'source').get_attribute('srcset')

                destinations = image.find_elements_by_tag_name('a')
                destination_urls = [a.get_attribute('href')
                                    for a in destinations]
                destination_urls = list(set(destination_urls))

                banners.append({
                    'url': url,
                    'picture': pictures[index],
                    'destination_urls': destination_urls,
                    'key': key,
                    'position': index + 1,
                    'section': section,
                    'subsection': subsection,
                    'type': subsection_type
                })

            return banners
Example #8
    def preflight(cls, extra_args=None):
        # Obtain Cloudflare bypass cookie
        if extra_args is None:
            raise Exception("extra_args should contain the parameters to "
                            "obtain the Cloudflare session cookie or the "
                            "'debug' flag if testing locally")
        if 'PROXY_USERNAME' not in extra_args:
            return {}

        proxy = 'http://{}:{}@{}:{}'.format(
            extra_args['PROXY_USERNAME'],
            extra_args['PROXY_PASSWORD'],
            extra_args['PROXY_IP'],
            extra_args['PROXY_PORT'],
        )
        with HeadlessChrome(images_enabled=False, proxy=proxy,
                            headless=True) as driver:
            driver.get('https://simple.ripley.cl')
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            hcaptcha_script_tag = soup.find('script', {'data-type': 'normal'})
            website_key = hcaptcha_script_tag['data-sitekey']

            # Anti captcha request
            request_body = {
                "clientKey": extra_args['KEY'],
                "task":
                    {
                        "type": "HCaptchaTask",
                        "websiteURL": "https://simple.ripley.cl/",
                        "websiteKey": website_key,
                        "proxyType": "http",
                        "proxyAddress": extra_args['PROXY_IP'],
                        "proxyPort": extra_args['PROXY_PORT'],
                        "proxyLogin": extra_args['PROXY_USERNAME'],
                        "proxyPassword": extra_args['PROXY_PASSWORD'],
                        "userAgent": CF_REQUEST_HEADERS['User-Agent']
                    }
            }
            print('Sending anti-captcha task')
            print(json.dumps(request_body, indent=2))
            anticaptcha_session = requests.Session()
            anticaptcha_session.headers['Content-Type'] = 'application/json'
            response = json.loads(anticaptcha_session.post(
                'http://api.anti-captcha.com/createTask',
                json=request_body).text)

            print('Anti-captcha task request response')
            print(json.dumps(response, indent=2))

            assert response['errorId'] == 0

            task_id = response['taskId']
            print('TaskId', task_id)

            # Wait until the task is ready...
            get_task_result_params = {
                "clientKey": extra_args['KEY'],
                "taskId": task_id
            }
            print(
                'Querying for anti-captcha response (wait 10 secs per retry)')
            print(json.dumps(get_task_result_params, indent=4))
            retries = 1
            hcaptcha_response = None
            while not hcaptcha_response:
                if retries > 60:
                    raise Exception('Failed to get a token in time')
                print('Retry #{}'.format(retries))
                time.sleep(10)
                res = json.loads(anticaptcha_session.post(
                    'https://api.anti-captcha.com/getTaskResult',
                    json=get_task_result_params).text)

                assert res['errorId'] == 0, res
                assert res['status'] in ['processing', 'ready'], res
                if res['status'] == 'ready':
                    print('Solution found')
                    hcaptcha_response = res['solution']['gRecaptchaResponse']
                    break
                retries += 1

            print(hcaptcha_response)
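            # Inject the solved token into the hidden response fields and
            # submit the challenge form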
            for field in ['g-recaptcha-response', 'h-captcha-response']:
                driver.execute_script("document.querySelector('[name=\""
                                      "{0}\"]').remove(); "
                                      "var foo = document.createElement('"
                                      "input'); foo.setAttribute('name', "
                                      "'{0}'); "
                                      "foo.setAttribute('value','{1}'); "
                                      "document.getElementsByTagName('form')"
                                      "[0].appendChild(foo);".format(
                                        field, hcaptcha_response))
            driver.execute_script("document.getElementsByTagName('form')"
                                  "[0].submit()")

            d = {
                "proxy": proxy,
                "cf_clearance": driver.get_cookie('cf_clearance')['value'],
                "__cfduid": driver.get_cookie('__cfduid')['value']
            }
            return d
Example #9
    def get_owl_banners(cls, url, section, subsection, subsection_type,
                        extra_args):
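        # Variant of get_owl_banners (cf. Example #7) that assumes the
        # carousel controls are already present instead of retrying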
        with HeadlessChrome(images_enabled=True, timeout=60,
                            proxy=extra_args['proxy']) as driver:
            print(url)
            banners = []
            driver.set_window_size(1920, 1080)
            driver.set_page_load_timeout(240)
            # Open the page first so that the CF cookies can be loaded in
            # this domain
            driver.get(url)
            # Then set the session cookies
            load_driver_cf_cookies(driver, extra_args, '.ripley.cl')
            # Then re-open the page
            driver.get(url)
            driver.execute_script("scrollTo(0, 0);")

            pictures = []

            banner_container = driver \
                .find_element_by_class_name('owl-carousel')

            controls = banner_container \
                .find_elements_by_class_name('owl-page')

            for control in controls:
                control.click()
                time.sleep(1)
                pictures.append(
                    banner_container.screenshot_as_base64)

            images = banner_container.find_elements_by_class_name('owl-item')

            assert len(images) == len(pictures)

            for index, image in enumerate(images):
                try:
                    image_style = image.find_element_by_tag_name(
                        'span').get_attribute('style')
                    key = re.search(r'url\((.*?)\)', image_style) \
                        .group(1)
                except NoSuchElementException:
                    key = image.find_element_by_tag_name(
                        'source').get_attribute('srcset')

                destinations = image.find_elements_by_tag_name('a')
                destination_urls = [a.get_attribute('href')
                                    for a in destinations]
                destination_urls = list(set(destination_urls))

                banners.append({
                    'url': url,
                    'picture': pictures[index],
                    'destination_urls': destination_urls,
                    'key': key,
                    'position': index + 1,
                    'section': section,
                    'subsection': subsection,
                    'type': subsection_type
                })

            return banners
Example #10
    def banners(cls, extra_args=None):
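        # Collect banners from lapolar.cl; only the home page carousel is
        # currently listed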
        sections_data = [[
            bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, 'https://www.lapolar.cl/'
        ]]

        session = session_with_proxy(extra_args)
        banners = []

        for section, subsection, subsection_type, url in sections_data:
            response = session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')

            if subsection_type == bs.SUBSECTION_TYPE_HOME:
                with HeadlessChrome(images_enabled=True) as driver:
                    driver.set_window_size(1920, 1080)
                    driver.get(url)

                    pictures = []
                    banner_container = driver.find_element_by_class_name(
                        'slick-list')

                    controls = driver\
                        .find_element_by_class_name('slick-dots')\
                        .find_elements_by_tag_name('li')

                    for control in controls:
                        control.click()
                        time.sleep(2)
                        pictures.append(banner_container.screenshot_as_base64)

                    soup = BeautifulSoup(driver.page_source, 'html.parser')

                    images = soup.find('div', 'slick-track')\
                        .findAll('div', 'slick-slide')

                    images = [
                        a for a in images if 'slick-cloned' not in a['class']
                    ]

                    assert len(images) == len(pictures)

                    for index, image in enumerate(images):
                        key = None
                        key_options = image.findAll('img', 'responsive_prod')

                        destination_urls = [
                            d['href'] for d in image.findAll('a')
                        ]
                        destination_urls = list(set(destination_urls))

                        for key_option in key_options:
                            if 'llamado_logo_img' in key_option['class']:
                                continue
                            key = key_option['src']
                            break

                        if not key:
                            key = destination_urls[0]

                        banners.append({
                            'url': url,
                            'picture': pictures[index],
                            'destination_urls': destination_urls,
                            'key': key,
                            'position': index + 1,
                            'section': section,
                            'subsection': subsection,
                            'type': subsection_type
                        })
            elif subsection_type == bs.SUBSECTION_TYPE_CATEGORY_PAGE:
                iframe = soup.find('iframe', 'full')
                if iframe:
                    content = session.get(iframe['src'])
                    soup = BeautifulSoup(content.text, 'html.parser')
                    picture_base_url = 'https://www.lapolar.cl{}'
                else:
                    picture_base_url = url + '{}'

                images = soup.findAll('div', 'swiper-slide')

                if not images:
                    images = soup.findAll('div', 'item')

                for index, image in enumerate(images):
                    picture = image.find('picture')
                    if not picture:
                        picture_url = picture_base_url.format(
                            image.find('img')['src'])
                    else:
                        picture_url = picture_base_url.format(
                            image.findAll('source')[-1]['srcset'])
                    destination_urls = [image.find('a')['href']]

                    banners.append({
                        'url': url,
                        'picture_url': picture_url,
                        'destination_urls': destination_urls,
                        'key': picture_url,
                        'position': index + 1,
                        'section': section,
                        'subsection': subsection,
                        'type': subsection_type
                    })
        return banners
Example #11
    def banners(cls, extra_args=None):
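        # Collect banners from hites.com: the home carousel and category
        # mosaic pages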
        base_url = 'https://www.hites.com/{}'

        sections_data = [
            [bs.HOME, 'Home', bs.SUBSECTION_TYPE_HOME, ''],
            [bs.TELEVISIONS, 'TV Video', bs.SUBSECTION_TYPE_MOSAIC,
             'tecnologia/tv-video'],
            [bs.TELEVISIONS, 'Todos los Led', bs.SUBSECTION_TYPE_MOSAIC,
             'tecnologia/tv-video/todos-los-led'],
            [bs.TELEVISIONS, 'Smart TV Hasta 50', bs.SUBSECTION_TYPE_MOSAIC,
             'tecnologia/tv-video/smart-tv-hasta-50'],
            [bs.TELEVISIONS, 'Smart TV Entre 55 y 60',
             bs.SUBSECTION_TYPE_MOSAIC,
             'tecnologia/tv-video/smart-tv-entre-55-y-60'],
            [bs.TELEVISIONS, 'Smart TV Desde 65', bs.SUBSECTION_TYPE_MOSAIC,
             'tecnologia/tv-video/smart-tv-desde-65'],
            [bs.CELLS, 'Smartphone', bs.SUBSECTION_TYPE_MOSAIC,
             'celulares/smartphone'],
            [bs.CELLS, 'Smartphone-Smartphone', bs.SUBSECTION_TYPE_MOSAIC,
             'celulares/smartphone/smartphone'],
            [bs.CELLS, 'Smartphone Liberados', bs.SUBSECTION_TYPE_MOSAIC,
             'celulares/smartphone/smartphone-liberados'],
            [bs.REFRIGERATION, 'Refrigeradores', bs.SUBSECTION_TYPE_MOSAIC,
             'electro-hogar/refrigeradores'],
            [bs.REFRIGERATION, 'No Frost', bs.SUBSECTION_TYPE_MOSAIC,
             'electro-hogar/refrigeradores/no-frost'],
            [bs.REFRIGERATION, 'Side by Side', bs.SUBSECTION_TYPE_MOSAIC,
             'electro-hogar/refrigeradores/side-by-side'],
            [bs.WASHING_MACHINES, 'Lavado y Secado', bs.SUBSECTION_TYPE_MOSAIC,
             'electro-hogar/lavado-y-secado'],
            [bs.WASHING_MACHINES, 'Lavadoras', bs.SUBSECTION_TYPE_MOSAIC,
             'electro-hogar/lavado-y-secado/lavadoras'],
            [bs.WASHING_MACHINES, 'Lavadoras-Secadoras',
             bs.SUBSECTION_TYPE_MOSAIC,
             'electro-hogar/lavado-y-secado/lavadoras-secadoras'],
            # [bs.WASHING_MACHINES, 'Secadoras', bs.SUBSECTION_TYPE_MOSAIC,
            #  'electro-hogar/lavado-y-secado/secadoras'],
            [bs.AUDIO, 'Audio', bs.SUBSECTION_TYPE_MOSAIC, 'tecnologia/audio'],
            # [bs.AUDIO, 'Minicomponentes', bs.SUBSECTION_TYPE_MOSAIC,
            #  'tecnologia/audio/minicomponentes'],
            # [bs.AUDIO, 'Soundbar y Home Theater', bs.SUBSECTION_TYPE_MOSAIC,
            #  'tecnologia/audio/soundbar-y-home-theater']
        ]

        session = session_with_proxy(extra_args)
        banners = []

        for section, subsection, subsection_type, url_suffix in sections_data:
            url = base_url.format(url_suffix)
            print(url)

            if subsection_type == bs.SUBSECTION_TYPE_HOME:
                with HeadlessChrome(images_enabled=True,
                                    timeout=120) as driver:
                    driver.set_window_size(1920, 1080)
                    driver.get(url)

                    pictures = []

                    banner_container = driver\
                        .find_element_by_class_name('slick-list')

                    # banner_container = driver \
                    #     .find_element_by_class_name('owl-stage-outer')

                    controls = driver.find_element_by_class_name(
                        'slick-dots')\
                        .find_elements_by_tag_name('li')

                    # controls = driver.find_elements_by_class_name('owl-dot')

                    for control in controls:
                        control.click()
                        time.sleep(1)
                        pictures.append(
                            banner_container.screenshot_as_base64)

                    soup = BeautifulSoup(driver.page_source, 'html.parser')

                    images = soup.find('div', 'slick-track')\
                        .findAll('div', 'slick-slide')

                    # images = soup.find('div', 'owl-stage') \
                    #     .findAll('div', 'owl-item')

                    images = [a for a in images if
                              'slick-cloned' not in a['class']]

                    # images = [a for a in images if
                    #           'cloned' not in a['class']]

                    assert len(images) == len(pictures)

                    for index, image in enumerate(images):
                        # The product container class varies between
                        # campaigns; try each known candidate in order and
                        # fall back to the whole slide
                        box_candidates = [
                            ('div', 'boxproductos'),
                            ('div', 'box-producto'),
                            ('div', 'box-foto'),
                            ('div', 'slide-new__products'),
                            ('div', 'images_llamados'),
                            ('div', 'products-item__img'),
                            ('a', 'boxproducto'),
                        ]

                        product_box = None
                        for tag_name, class_name in box_candidates:
                            product_box = image.find(tag_name, class_name)
                            if product_box:
                                break

                        if not product_box:
                            product_box = image

                        if not (product_box.find('source') or
                                product_box.find('img')):
                            product_box = image.find('div', 'img_boxproducto')

                        if not product_box:
                            product_box = image.find('div', 'logocampana')

                        key_container = product_box.find('source')

                        if key_container:
                            key = key_container['srcset']
                        else:
                            key = product_box.find('img')['src']

                        destination_urls = []

                        for destination in image.findAll('a'):
                            if destination.get('href'):
                                destination_urls.append(destination['href'])

                        destination_urls = list(set(destination_urls))

                        banners.append({
                            'url': url,
                            'picture': pictures[index],
                            'destination_urls': destination_urls,
                            'key': key,
                            'position': index + 1,
                            'section': section,
                            'subsection': subsection,
                            'type': subsection_type
                        })
            elif subsection_type == bs.SUBSECTION_TYPE_MOSAIC:
                response = session.get(url)
                soup = BeautifulSoup(response.text, 'html.parser')

                banners_container = soup.find('section')\
                    .findAll('div', 'espot', recursive=False)

                for index, banner in enumerate(banners_container):
                    destination_urls = [d['href'] for d in
                                        banner.findAll('a')]

                    destination_urls = list(set(destination_urls))

                    picture_container = banner.find('picture')

                    if picture_container:
                        picture_source = picture_container.find('source')

                        if not picture_source:
                            continue

                        picture_url = picture_source['srcset']
                        banners.append({
                            'url': url,
                            'picture_url': picture_url,
                            'destination_urls': destination_urls,
                            'key': picture_url,
                            'position': index + 1,
                            'section': section,
                            'subsection': subsection,
                            'type': subsection_type
                        })
                    else:
                        with HeadlessChrome(images_enabled=True, timeout=120) \
                                as driver:
                            driver.set_window_size(1920, 1080)
                            driver.get(url)

                            s_banner = driver.find_elements_by_css_selector(
                                '#main>.espot')[index]

                            key_container = banner.find('img')

                            if not key_container or \
                                    s_banner.size['height'] == 0:
                                continue

                            key = key_container['src']

                            picture = s_banner.screenshot_as_base64
                            banners.append({
                                'url': url,
                                'picture': picture,
                                'destination_urls': destination_urls,
                                'key': key,
                                'position': index + 1,
                                'section': section,
                                'subsection': subsection,
                                'type': subsection_type
                            })

        return banners
Example #12
    def _session_driver(cls, extra_args):
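        # Logs into the Ingram Micro portal through the Okta form and
        # returns an authenticated driver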
        time.sleep(1)
        # Browser initialization
        driver = HeadlessChrome(headless=True).driver
        driver.get('https://cl.ingrammicro.com/_layouts/'
                   'CommerceServer/IM/Login.aspx')

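        # Wait up to ~5 seconds for the Okta login form to appear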
        retries = 1

        while retries < 5:
            if driver.find_elements_by_id('okta-signin-username'):
                break
            time.sleep(1)
            retries += 1

        driver.find_element_by_id('okta-signin-username').send_keys(
            extra_args['username'])
        driver.find_element_by_id('okta-signin-password').send_keys(
            extra_args['password'])
        driver.find_element_by_id('okta-signin-submit').click()

        time.sleep(10)

        return driver