예제 #1
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Fetch the product page at *url* and parse its title and price.

        Returns a filled ParsedProduct dict, or None when the page
        could not be loaded.
        """
        html = self._load_page_with_TL(url)
        if html is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(html, 'html.parser')

        product = get_empty_parsed_product_dict()
        product['url'] = url

        # title: main heading plus the black sub-header, space separated
        main_title = soup.find('h1', class_='product-cart__title').text
        extra = soup.find(
            'a', class_='product-cart__content-info-header-black').text
        product['title'] = (remove_odd_space(main_title) + ' ' +
                            remove_odd_space(extra))

        # price: strip all spaces, drop the trailing currency character
        raw_price = soup.find(
            'div', class_='product-cart__content-price-actual').text
        product['price_new'] = remove_ALL_spaces(raw_price)[:-1]

        return product
예제 #2
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Load an IKEA product page and extract its title and price.

        Returns None when the page cannot be loaded.
        """
        markup = self._load_page_with_TL(url)
        if markup is None:
            # fixme - log - fatal - can't load page
            print(f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}")
            return None

        soup = BeautifulSoup(markup, 'html.parser')

        product = get_empty_parsed_product_dict()
        product['url'] = url

        # title: big section header followed by its description text
        head = soup.find('div', class_='range-revamp-header-section__title--big')
        desc = soup.find('span', 'range-revamp-header-section__description-text')
        product['title'] = (remove_odd_space(head.text) + ' ' +
                            remove_odd_space(desc.text))

        # price: integer part only, whitespace stripped
        price_tag = soup.find('span', class_='range-revamp-price__integer')
        product['price_new'] = remove_ALL_spaces(price_tag.text)

        return product
예제 #3
0
    def get_proxy_list(self, port: int = 3128) -> List[str]:
        """Scrape proxynova.com and return proxies as ``"ip:port"`` strings.

        Returns an empty list when the page cannot be loaded.  Rows that
        fail to parse are skipped.  The webdriver is always quit exactly
        once, on every code path.

        NOTE(review): the *port* parameter is currently unused — kept for
        interface compatibility; confirm against callers.
        """
        driver = get_usual_webdriver()
        try:
            page_source = load_page_with_TL(
                driver, 'https://www.proxynova.com/proxy-server-list/', 7.5)
            if page_source is None:
                # fixme - log - error - can't load web page
                # fix: original leaked the driver on this early return
                print(f"can't load page for {self.get_name()}")
                return []

            ips = []
            soup = BeautifulSoup(page_source, 'html.parser')
            for item in soup.find('tbody').find_all('tr'):
                try:
                    extracted_ip = remove_odd_space(
                        item.find('abbr').text).split(';')[1]
                    extracted_port = remove_odd_space(
                        item.find_all('td')[1].text)
                    ips.append(f"{extracted_ip}:{extracted_port}")
                except Exception:
                    # malformed row — skip it (was a bare except)
                    pass
            return ips
        finally:
            # fix: original called quit() twice and put `return ips`
            # inside finally, which silently swallowed any exception
            # raised in the try body
            driver.quit()
예제 #4
0
    def _get_parsed_product_from_search(
            self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search svyaznoy.ru for *category_row* and parse the result tiles.

        Returns None when the category is not 'appliances' or the page
        fails to load; otherwise a list of ParsedProduct dicts.  Tiles
        that fail to parse are logged and skipped.
        """
        if category_row['sub_type'] != 'appliances':
            return None

        parsed_product_list = []

        url = self.make_search_url(category_row['search_word'])

        print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
        print(f'using url:\n{url}')

        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        for parsed_item in soup.find_all('div', class_='b-product-block'):
            try:
                parsed_product = get_empty_parsed_product_dict()

                # title (type sub-title is optional — prepend when present)
                title = remove_odd_space(
                    parsed_item.find('div',
                                     class_='b-product-block__name').text)
                try:
                    sub_title = remove_odd_space(
                        parsed_item.find('div',
                                         class_='b-product-block__type').text)
                    title = sub_title + ' ' + title
                except Exception:
                    # no type element on this tile — keep the bare title
                    pass

                parsed_product['title'] = title

                # url: relative href on the tile, made absolute
                url = parsed_item.find(
                    'a', class_='b-product-block__main-link')['href']
                url = fr'https://www.svyaznoy.ru{url}'
                parsed_product['url'] = url

                # price: visible price, digits only
                price = remove_non_digits(
                    parsed_item.find(
                        'span', class_='b-product-block__visible-price').text)
                parsed_product['price_new'] = price

                parsed_product_list.append(parsed_product)
            except Exception:
                # FIXME log fatal
                # fix: was a bare `except:` — narrowed so KeyboardInterrupt
                # and SystemExit still propagate
                print("can't parse svaznoy item")
                print(parsed_item)

        return parsed_product_list
예제 #5
0
    def _get_parsed_product_from_search(
            self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search eldorado.ru for *category_row* and parse product tiles.

        Returns None for non-appliance categories or on page-load
        failure, otherwise a list of ParsedProduct dicts.  Tiles without
        a recognizable price are skipped.
        """
        if category_row['sub_type'] != 'appliances':
            return None

        full_parsed_product_list = []

        for page_num in range(1):
            parsed_product_list = []
            # url = self.get_search_url_for_category(category_row, page_num)
            url = self.create_general_search_url(category_row['search_word'],
                                                 page_num)

            print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
            print(f'using url:\n{url}')

            page_source = self._load_page_with_TL(url)
            if page_source is None:
                # fixme - log - fatal - can't load page
                print(
                    f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
                )
                return None

            soup = BeautifulSoup(page_source, 'html.parser')

            for parsed_item in soup.find_all('li', {'data-dy': 'product'}):
                parsed_product = get_empty_parsed_product_dict()

                # title
                title = remove_odd_space(
                    parsed_item.find('a', {
                        'data-dy': 'title'
                    }).text)
                parsed_product['title'] = title

                # url: relative href, made absolute
                url = remove_odd_space(
                    parsed_item.find('a', {'data-dy': 'title'})['href'])
                url = f"https://www.eldorado.ru{url}"
                parsed_product['url'] = url

                # price: collect every span ending with the ruble marker
                # that looks like a plausible price, take the largest
                price_list = []
                for price_item in parsed_item.find_all('span'):
                    if hasattr(price_item,
                               'text') and price_item.text[-2:] == 'р.':
                        mb_price = float(remove_non_digits(price_item.text))
                        if 100 <= mb_price <= 100000:
                            price_list.append(mb_price)
                if not price_list:
                    # fix: original crashed with IndexError
                    # (`sorted([])[-1]`) on tiles without a price —
                    # skip such tiles instead
                    continue
                parsed_product['price_new'] = max(price_list)

                parsed_product_list.append(parsed_product)
            full_parsed_product_list.extend(parsed_product_list)
        return full_parsed_product_list
예제 #6
0
    def _get_parsed_product_from_search(
            self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search rigla for *category_row* and parse product tiles.

        Returns None when the category is not 'medicine' or the page
        fails to load; otherwise a list of ParsedProduct dicts.  Tiles
        that fail to parse are logged and skipped.
        """
        if category_row['sub_type'] != 'medicine':
            return None

        parsed_product_list = []

        url = self._create_search_url_for_category(category_row['search_word'])

        print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
        print(f'using url:\n{url}')

        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        for parsed_item in soup.find_all('div', class_='product'):
            try:
                parsed_product = get_empty_parsed_product_dict()

                # title: product name plus brand name
                title = remove_odd_space(
                    parsed_item.find('a', class_='product__title').text)
                sub_title = remove_odd_space(
                    parsed_item.find('a', class_='product-brand__link').text)
                title += ' ' + sub_title
                parsed_product['title'] = title

                # url
                url = parsed_item.find('a', class_='product__title')['href']
                parsed_product['url'] = self._create_link_to_product(url)

                # price: active price, spaces stripped
                price = remove_ALL_spaces(
                    parsed_item.find(
                        'span', class_='product__active-price-number').text)
                parsed_product['price_new'] = price

                parsed_product_list.append(parsed_product)
            except Exception:
                # FIXME log fatal
                # fix: was a bare `except:` — narrowed so KeyboardInterrupt
                # and SystemExit still propagate
                print("can't parse rigla item")
                print(parsed_item)

        return parsed_product_list
예제 #7
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Parse title and price from a single offer page.

        Returns None when the page cannot be loaded.
        """
        source = self._load_page_with_TL(url, 10.0)
        if source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(source, 'html.parser')

        product = get_empty_parsed_product_dict()
        product['url'] = url

        # title from the offer heading
        product['title'] = remove_odd_space(
            soup.find('h1', class_='b-offer-title').text)

        # price: digits only
        product['price_new'] = remove_non_digits(
            soup.find('div', class_='b-offer-box__price').text)

        return product
예제 #8
0
    def _get_parsed_product_from_search(self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search IKEA for *category_row* and parse the result grid.

        Returns None when the category is not 'furniture' or the page
        fails to load; otherwise a list of ParsedProduct dicts.  Grid
        items that fail to parse are logged and skipped.
        """
        if category_row['sub_type'] != 'furniture':
            return None

        parsed_product_list = []

        url = self._create_search_url_for_category(category_row['search_word'])

        print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
        print(f'using url:\n{url}')

        page_source = self._load_page_with_TL(url)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}")
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        for parsed_item in soup.find_all('div', class_='serp-grid__item'):
            try:
                parsed_product = get_empty_parsed_product_dict()

                # title: small header plus description text
                title = remove_odd_space(parsed_item.find('div', 'range-revamp-header-section__title--small').text)
                sub_title = remove_odd_space(parsed_item.find('span', 'range-revamp-header-section__description-text').text)
                title += ' ' + sub_title
                parsed_product['title'] = title

                # url: first anchor on the tile
                url = parsed_item.find('a')['href']
                parsed_product['url'] = url

                # price: integer part, whitespace stripped
                price = remove_ALL_spaces(parsed_item.find('span', class_='range-revamp-price__integer').text)
                parsed_product['price_new'] = price

                parsed_product_list.append(parsed_product)
            except Exception:
                # FIXME log fatal
                # fix: was a bare `except:` — narrowed so KeyboardInterrupt
                # and SystemExit still propagate
                print("can't parse IKEA item")
                print(parsed_item)

        return parsed_product_list
예제 #9
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Parse title, regular price and packaging unit from a SKU page.

        Returns None when the page cannot be loaded.  Raises ValueError
        when the shop reports the item as unavailable.
        """
        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}")
            return None

        if 'В выбранном Вами магазине данный товар не представлен' in str(page_source):
            # FIXME error log
            print('no searched item in shop')
            raise ValueError('no searched item in shop')

        soup = BeautifulSoup(page_source, 'html.parser')

        parsed_product = get_empty_parsed_product_dict()
        parsed_product['url'] = url

        # title (sub-title is optional — append when present)
        title = remove_odd_space(str(soup.find('h1', class_='sku-page__title').text))
        try:
            sub_title = remove_odd_space(soup.find('div', class_='sku-page__sub-title').text)
            title += ' ' + sub_title
        except Exception:
            # fix: was a bare `except:` — narrowed; missing sub-title is fine
            pass
        parsed_product['title'] = title

        # price: take the block labelled as the regular ('обычная') price
        for item in soup.find_all('div', class_='sku-prices-block__item'):
            if 'обычная' in str(item).lower():
                price = remove_odd_space(item.find('span', class_='sku-price__integer').text).replace(' ', '')
                parsed_product['price_new'] = float(price.replace(',', '.'))

        # unit: packaging row from the parameters tab
        for item in soup.find_all('div', class_='sku-card-tab-params__item'):
            if 'Упаковка' in str(item):
                unit = remove_odd_space(item.find('dd', 'sku-card-tab-params__value').text)
                parsed_product['unparsed_units'] = unit

        return parsed_product
예제 #10
0
    def _get_parsed_product_from_search(
            self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search perekrestok.ru for *category_row* and parse catalog items.

        Returns None for non-food categories or on page-load failure,
        otherwise a list of ParsedProduct dicts.  Out-of-stock items
        are skipped.
        """
        if category_row['type'] != 'food':
            return None

        parsed_product_list = []

        url = self._create_serch_url_for_category(category_row['search_word'])

        print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
        print(f'using url:\n{url}')

        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        for parsed_item in soup.find_all('li', class_='xf-catalog__item'):

            # skip items marked as temporarily unavailable
            if "Временно отсутствует" in str(parsed_item):
                continue

            parsed_product = get_empty_parsed_product_dict()

            # title
            title = remove_odd_space(
                parsed_item.find('a', class_='xf-product-title__link').text)
            parsed_product['title'] = title

            # url: relative href, made absolute
            url = parsed_item.find('a',
                                   class_='xf-product-title__link')['href']
            url = f"https://perekrestok.ru{url}"
            parsed_product['url'] = url

            # price: prefer the old (pre-discount) price, fall back to
            # the current one
            try:
                price = parsed_item.find(
                    'div', class_='xf-product-cost__old-price')['data-cost']
            except Exception:
                # fix: was a bare `except:` — narrowed; no old price present
                price = parsed_item.find(
                    'div', class_='xf-product-cost__current')['data-cost']
            parsed_product['price_new'] = price

            parsed_product_list.append(parsed_product)

        return parsed_product_list
예제 #11
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Parse title, current price and optional old price from a
        product card page.

        Returns None when the page cannot be loaded.
        """
        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        parsed_product = get_empty_parsed_product_dict()
        parsed_product['url'] = url

        # title: the page uses one of two card layouts — try both.
        # fix: was a bare `except:`; a failed find() returns None and
        # `.text` raises AttributeError, so catch exactly that.
        try:
            title = remove_odd_space(
                soup.find('h1', class_='xf-product-card__title').text)
        except AttributeError:
            title = remove_odd_space(
                soup.find('h1', class_='xf-product-new__title').text)
        title = remove_odd_space(title)
        parsed_product['title'] = title

        # price
        price_new = soup.find('span', class_='js-price-rouble').text
        price_new = remove_odd_space(price_new)
        parsed_product['price_new'] = price_new

        # old price is optional — None when absent
        try:
            price_old = soup.find('span', class_='js-old-price-rouble').text
            price_old = remove_odd_space(price_old)
        except AttributeError:
            price_old = None
        parsed_product['price_old'] = price_old

        return parsed_product
예제 #12
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Parse title and price from a catalog item detail page.

        Returns None when the page cannot be loaded.  The unit fields
        are hard-coded to a single piece ('1шт').
        """
        page_source = self._load_page_with_TL(url)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        parsed_product = get_empty_parsed_product_dict()
        parsed_product['url'] = url

        # title
        title = remove_odd_space(
            soup.find('h1', class_='catalogItemDetailHd',
                      itemprop='name').text)
        parsed_product['title'] = title

        # price: prefer the old-price element, fall back to the active
        # price; both drop the trailing currency suffix via [:-2]
        try:
            price = remove_ALL_spaces(
                soup.find('span',
                          class_='product-box-price__old-el').text)[:-2]
        except Exception:
            # fix: was a bare `except:` — narrowed; no old-price element
            price = remove_ALL_spaces(
                soup.find('div', class_='product-box-price__active').text)[:-2]
        parsed_product['price_new'] = price
        parsed_product['price_old'] = None

        # float, value in unit of unit_title
        parsed_product['unit_value'] = 1
        # string, name of units
        parsed_product['unit_title'] = '1шт'

        return parsed_product