示例#1
0
    def categories_list(self):
        """Collect the paginated listing link for every menu category.

        Each node selected by the ``menu_categories`` selector that carries
        an ``href`` attribute is matched to a ``Category`` row (looked up or
        created by slug and enterprise id). A missing ``name`` or ``url`` on
        that row is filled in from the node, and the row is saved.

        Returns:
            A list of dicts with two keys: ``"link"`` (the href, prefixed
            with the configured site url unless the href already contains
            it, followed by the ``pagination`` selector string) and
            ``"category"`` (the ``Category`` instance).
        """
        entries = []
        for anchor in self._select_node(self._selectors['menu_categories']):
            # Skip nodes that cannot provide a target address.
            if not (anchor and anchor.has_attr('href')):
                continue

            category, _created = Category.objects.get_or_create(
                slug=str_slug(anchor.text),
                enterprise_id=self._enterprise.id,
            )

            # Only fill in values that are still empty on the stored row.
            if not category.name:
                category.name = anchor.text.strip()
            if not category.url:
                category.url = anchor['href']
            category.save()

            page_suffix = self._selectors['pagination']
            base_url = self._config['url']
            href = anchor['href']

            # An href that already contains the site url is used as is;
            # anything else is prefixed with the site url.
            if base_url in href:
                listing_link = href + page_suffix
            else:
                listing_link = base_url + href + page_suffix

            entries.append({"link": listing_link, "category": category})

        return entries
示例#2
0
def start_scrap(request):
    """Run the scraper for the requested site, then list all enterprises.

    When the request is a GET carrying a non-empty ``page`` query parameter,
    that value is used as the site identifier: its configuration is loaded,
    the matching ``Enterprise`` row is fetched or created (missing ``name``
    and ``url`` are filled in), and every page of every category found on
    the home page is visited, printing each scraped product.

    In all cases the view ends by rendering ``enterprises.html`` with every
    ``Enterprise`` in the database.
    """
    # The site identifier is only read from GET requests.
    site_uid = (
        request.GET.get('page', None) if request.method == 'GET' else None
    )

    if site_uid:
        site_config = config()['sites'][site_uid]
        selectors = site_config['selectors']
        logger.info("beggining scraper for {}".format(site_config['url']))

        enterprise, _created = Enterprise.objects.get_or_create(
            slug=str_slug(site_uid),
        )
        if not enterprise.name:
            enterprise.name = site_uid
        if not enterprise.url:
            enterprise.url = site_config['url']
        enterprise.save()

        home_page = home.HomePage(site_uid, enterprise)

        for entry in home_page.categories_list:
            current_page = selectors['init_page']
            # The listing is opened on the first page to read the page count.
            first_url = entry["link"].replace(
                "PAGE_NUMBER", str(current_page))
            listing = category.CategoryList(site_uid, first_url)
            page_count = listing.total_pages

            for _ in range(page_count):
                page_url = entry["link"].replace(
                    "PAGE_NUMBER", str(current_page))
                # Advance by the configured step before loading the page.
                current_page = current_page + selectors['iterator_page']
                listing.go_to_page(page_url)

                for item in listing.products_list:
                    scraped = product.ProductPage(
                        site_uid, item, entry["category"])
                    print(scraped.product)

    enterprises = Enterprise.objects.all()
    return render(request, 'enterprises.html', {'enterprises': enterprises})
 def _get_discount(self):
     """Return the integer found in the node matched by the ``discount`` selector.

     The text of the first selected node is passed through ``str_slug`` and
     the first run of decimal digits in the result is converted to ``int``.

     Returns:
         The parsed integer, or None when no node is selected or when the
         processed text contains no digits.
     """
     nodes = self._select_node(self._selectors['discount'])
     if not nodes:
         return None

     text = str_slug(nodes[0].text)
     # re.search returns None when the text has no digits; return None in
     # that case instead of failing with AttributeError on ``.group()``.
     match = re.search(r'\d+', text)
     if match is None:
         return None
     # NOTE(review): only the first digit run is used. If str_slug keeps a
     # separator between digit groups the value is truncated - confirm.
     return int(match.group())
 def _get_normal_price(self):
     """Return the integer found in the node matched by the ``normal_price`` selector.

     The text of the first selected node is passed through ``str_slug`` and
     the first run of decimal digits in the result is converted to ``int``.

     Returns:
         The parsed integer, or None when no node is selected or when the
         processed text contains no digits.
     """
     nodes = self._select_node(self._selectors['normal_price'])
     if not nodes:
         return None

     text = str_slug(nodes[0].text)
     # re.search returns None when the text has no digits; return None in
     # that case instead of failing with AttributeError on ``.group()``.
     match = re.search(r'\d+', text)
     if match is None:
         return None
     # NOTE(review): only the first digit run is used. If str_slug keeps a
     # separator between digit groups the value is truncated - confirm.
     return int(match.group())
 def _get_offer_price(self):
     """Return the integer found in the node matched by the ``offer_price`` selector.

     The text of the first selected node is passed through ``str_slug`` and
     the first run of decimal digits in the result is converted to ``int``.

     Returns:
         The parsed integer, or None when no node is selected or when the
         processed text contains no digits.
     """
     nodes = self._select_node(self._selectors['offer_price'])
     if not nodes:
         return None

     text = str_slug(nodes[0].text)
     # re.search returns None when the text has no digits; return None in
     # that case instead of failing with AttributeError on ``.group()``.
     match = re.search(r'\d+', text)
     if match is None:
         return None
     # NOTE(review): only the first digit run is used. If str_slug keeps a
     # separator between digit groups the value is truncated - confirm.
     return int(match.group())
 def _get_internet_price(self):
     """Return the integer found in the node matched by the ``internet_price`` selector.

     The text of the first selected node is passed through ``str_slug`` and
     the first run of decimal digits in the result is converted to ``int``.

     Returns:
         The parsed integer, or None when no node is selected or when the
         processed text contains no digits.
     """
     nodes = self._select_node(self._selectors['internet_price'])
     if not nodes:
         return None

     text = str_slug(nodes[0].text)
     # re.search returns None when the text has no digits; return None in
     # that case instead of failing with AttributeError on ``.group()``.
     match = re.search(r'\d+', text)
     if match is None:
         return None
     # NOTE(review): only the first digit run is used. If str_slug keeps a
     # separator between digit groups the value is truncated - confirm.
     return int(match.group())