def categories_list(self):
    """Build the paginated listing link for each category in the site menu.

    Every node matched by the ``menu_categories`` selector that carries an
    ``href`` is mapped to a ``Category`` row (looked up or created by the
    slug of the node text and the current enterprise id). The row's empty
    ``name`` / ``url`` fields are filled from the node and the row is saved.

    Returns:
        list[dict]: one ``{"link": <url + pagination suffix>,
        "category": <Category>}`` entry per usable menu node.
    """
    collected = []
    for node in self._select_node(self._selectors['menu_categories']):
        # Skip falsy nodes and nodes that do not point anywhere.
        if not (node and node.has_attr('href')):
            continue

        category, _created = Category.objects.get_or_create(
            slug=str_slug(node.text),
            enterprise_id=self._enterprise.id,
        )
        # Only fill in fields that are still empty on the stored row.
        if not category.name:
            category.name = node.text.strip()
        if not category.url:
            category.url = node['href']
        category.save()

        suffix = self._selectors['pagination']
        site_url = self._config['url']
        href = node['href']
        # Hrefs that already contain the site URL are used as-is; any other
        # href is prefixed with the site URL.
        target = href + suffix if site_url in href else site_url + href + suffix
        collected.append({"link": target, "category": category})
    return collected
def start_scrap(request):
    """Run the scraper for the site named in ``?page=`` and list enterprises.

    On a GET request carrying a non-empty ``page`` query parameter, the
    matching entry of ``config()['sites']`` is scraped. A ``page`` value with
    no matching site entry is logged and skipped instead of raising
    ``KeyError``. In every case the view renders ``enterprises.html`` with
    all ``Enterprise`` rows.

    Args:
        request: the incoming HTTP request.

    Returns:
        The response produced by ``render`` for ``enterprises.html``.
    """
    if request.method == 'GET':
        site_uid = request.GET.get('page', None)
        if site_uid:
            sites = config()['sites']
            try:
                site_config = sites[site_uid]
            except KeyError:
                # The site id comes straight from the query string, so an
                # unknown value must not crash the view.
                logger.warning("unknown site requested for scraping: %s",
                               site_uid)
            else:
                _scrape_site(site_uid, site_config)

    enterprises = Enterprise.objects.all()
    return render(request, 'enterprises.html', {'enterprises': enterprises})


def _scrape_site(site_uid, site_config):
    """Scrape every category page of one configured site and print products.

    Args:
        site_uid: key of the site inside the ``sites`` configuration.
        site_config: that site's configuration mapping; its ``url`` and
            ``selectors`` entries are read here.
    """
    selectors = site_config['selectors']
    logger.info("beggining scraper for {}".format(site_config['url']))

    enterprise, _created = Enterprise.objects.get_or_create(
        slug=str_slug(site_uid)
    )
    # Only fill in fields that are still empty on the stored row.
    if not enterprise.name:
        enterprise.name = site_uid
    if not enterprise.url:
        enterprise.url = site_config['url']
    enterprise.save()

    home_page = home.HomePage(site_uid, enterprise)
    for cat in home_page.categories_list:
        page_number = selectors['init_page']
        # The first page is loaded up front so the total page count is known.
        category_list = category.CategoryList(
            site_uid,
            cat["link"].replace("PAGE_NUMBER", str(page_number)),
        )
        total_pages = category_list.total_pages
        for _page in range(total_pages):
            url = cat["link"].replace("PAGE_NUMBER", str(page_number))
            # Advance by the configured step for the next iteration.
            page_number = page_number + selectors['iterator_page']
            category_list.go_to_page(url)
            for item in category_list.products_list:
                product_item = product.ProductPage(
                    site_uid, item, cat["category"])
                print(product_item.product)
def _get_discount(self):
    """Return the first integer found in the discount node's text.

    Returns:
        int | None: the first run of digits in the slugified text of the
        first node matched by the ``discount`` selector, or ``None`` when no
        node matches or the text contains no digits.
    """
    nodes = self._select_node(self._selectors['discount'])
    if not nodes:
        return None
    text = str_slug(nodes[0].text)
    match = re.search(r'\d+', text)
    # re.search yields None for digit-free text; report "no value" rather
    # than failing with AttributeError on .group().
    return int(match.group()) if match else None
def _get_normal_price(self):
    """Return the first integer found in the normal-price node's text.

    Returns:
        int | None: the first run of digits in the slugified text of the
        first node matched by the ``normal_price`` selector, or ``None`` when
        no node matches or the text contains no digits.
    """
    nodes = self._select_node(self._selectors['normal_price'])
    if not nodes:
        return None
    text = str_slug(nodes[0].text)
    match = re.search(r'\d+', text)
    # re.search yields None for digit-free text; report "no value" rather
    # than failing with AttributeError on .group().
    return int(match.group()) if match else None
def _get_offer_price(self):
    """Return the first integer found in the offer-price node's text.

    Returns:
        int | None: the first run of digits in the slugified text of the
        first node matched by the ``offer_price`` selector, or ``None`` when
        no node matches or the text contains no digits.
    """
    nodes = self._select_node(self._selectors['offer_price'])
    if not nodes:
        return None
    text = str_slug(nodes[0].text)
    match = re.search(r'\d+', text)
    # re.search yields None for digit-free text; report "no value" rather
    # than failing with AttributeError on .group().
    return int(match.group()) if match else None
def _get_internet_price(self):
    """Return the first integer found in the internet-price node's text.

    Returns:
        int | None: the first run of digits in the slugified text of the
        first node matched by the ``internet_price`` selector, or ``None``
        when no node matches or the text contains no digits.
    """
    nodes = self._select_node(self._selectors['internet_price'])
    if not nodes:
        return None
    text = str_slug(nodes[0].text)
    match = re.search(r'\d+', text)
    # re.search yields None for digit-free text; report "no value" rather
    # than failing with AttributeError on .group().
    return int(match.group()) if match else None