예제 #1
0
    def crawl_listing(self, url, ctx='', **kwargs):
        """Crawl one Nordstrom listing page (sorted by sale) and follow its pager.

        The page is fetched with ``sort=sale``. Every ``div.fashion-item`` that
        carries both a regular and a sale price is upserted into ``Product``;
        it is saved and announced through ``common_saved`` only when
        ``Picker(site='nordstrom')`` selects it. Problems with a single product
        are reported through ``common_failed`` and the product is skipped.

        Pages without a usable product grid are handed to
        ``self.crawl_listing_of_no_leaf`` (brand index pages are ignored).

        Crawling of the page (and of all following pages) stops once three
        products in a row have no discount, since the listing is ordered by
        sale.

        :param url: listing page URL; ``http://blogs.nordstrom.com`` URLs are
            ignored.
        :param ctx: sender passed to the ``common_failed`` / ``common_saved``
            signals.
        :param kwargs: forwarded to the recursive / delegated calls; ``key``
            is used to look up the ``Category``.
        :raises requests.exceptions.HTTPError: on a non-2xx response
            (connection errors are swallowed and end the crawl silently).
        """
        if url.startswith('http://blogs.nordstrom.com'):
            return
        try:
            res = requests.get(url, params={'sort': 'sale'})
        except requests.exceptions.ConnectionError:
            return

        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)
        listing_nodes = tree.cssselect('div.fashion-results')

        if not listing_nodes:
            if tree.cssselect('div#brandsIndex'):
                return
            self.crawl_listing_of_no_leaf(tree, ctx=ctx, **kwargs)
            return

        product_nodes = listing_nodes[0].cssselect('div.row div.fashion-item')
        if not product_nodes:
            self.crawl_listing_of_no_leaf(tree, ctx=ctx, **kwargs)
            return

        category = Category.objects(key=kwargs.get('key')).first()
        if not category:
            # Without this guard the first discounted product would crash on
            # ``category.cats``.
            common_failed.send(sender=ctx, url=url,
                               reason='category %s not found in db' % kwargs.get('key'))
            return

        # Number of consecutive products without a discount. A non-discounted
        # product sometimes shows up between discounted ones even though the
        # listing is ordered by sale, so a few misses in a row are tolerated.
        no_discount_num = 0
        for product_node in product_nodes:
            key = product_node.get('id')
            if not key:
                common_failed.send(sender=ctx, url=url, reason='listing product has no id')
                continue

            try:
                info_node = product_node.cssselect('div.info')[0]
                a_node = info_node.cssselect('a')[0]
                # ``text`` is None for an anchor without direct text content.
                title = (a_node.text or '').strip()

                price = None
                listprice = None
                for price_node in info_node.cssselect('.price'):
                    css_class = price_node.get('class') or ''
                    if 'regular' in css_class:
                        listprice = price_node.text
                    elif 'sale' in css_class:
                        price = price_node.text

                if price is None or listprice is None:
                    no_discount_num += 1
                    if no_discount_num < 3:
                        continue
                    # Three misses in a row: the discounted part of the
                    # listing is over, stop crawling (including next pages).
                    return
                # A discounted product ends the current run of misses.
                no_discount_num = 0

                combine_url = a_node.get('href')
                if not combine_url:
                    common_failed.send(sender=ctx, url=url, reason='listing product %s.%s cannot crawl combine_url' % (key, title))
                    continue

                if not re.search(r'https?://.+', combine_url):
                    combine_url = 'http://shop.nordstrom.com%s' % (combine_url)

            except IndexError:
                trace = traceback.format_exc()
                print(trace)
                common_failed.send(sender=ctx, url=url, reason='listing product %s -> %s' % (key, trace))
                continue

            is_new = False
            is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            if combine_url and combine_url != product.combine_url:
                product.combine_url = combine_url
                is_updated = True

            if title and title != product.title:
                product.title = title
                is_updated = True

            if price and price != product.price:
                product.price = price
                is_updated = True

            if listprice and listprice != product.listprice:
                product.listprice = listprice
                is_updated = True

            # ``product.dept`` may be unset; treat it as an empty list.
            current_dept = product.dept or []
            if category.cats and set(category.cats).difference(current_dept):
                product.dept = list(set(category.cats) | set(current_dept))
                is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            if is_updated:
                product.list_update_time = datetime.utcnow()

            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            selected = Picker(site='nordstrom').pick(product)
            if not selected:
                continue

            product.hit_time = datetime.utcnow()
            product.save()

            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
                              is_new=is_new, is_updated=((not is_new) and is_updated))

        # Go to the next page to keep on crawling.
        try:
            arrow_node = tree.cssselect('div.fashion-results-header div.fashion-results-pager ul.arrows li.next')[0]
            if 'disabled' in (arrow_node.get('class') or ''):
                next_page = None
            else:
                # Kept inside the try: an enabled arrow without an anchor
                # must be reported, not crash the crawl.
                next_page = arrow_node.cssselect('a')[0].get('href')
        except IndexError:
            common_failed.send(sender=ctx, url=url, reason=traceback.format_exc())
            return

        if next_page:
            print(next_page)
            self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
예제 #2
0
    def crawl_listing(self, url, ctx='', **kwargs):
        """Crawl one 6pm listing page and follow its pagination.

        Every anchor under ``div#searchResults`` that carries both a price
        (``.price-6pm``) and a list price (``.discount``) is upserted into
        ``Product``; it is saved and announced through ``common_saved`` only
        when ``Picker(site='6pm')`` selects it. Anchors lacking either price
        are skipped silently; anchors lacking a product id, link, brand or
        title are reported through ``common_failed`` and skipped.

        :param url: listing page URL.
        :param ctx: sender passed to the ``common_failed`` / ``common_saved``
            signals.
        :param kwargs: forwarded to the recursive call; ``key`` is used to
            look up the ``Category``.
        :raises requests.exceptions.HTTPError: on a non-2xx response.
        """
        res = requests.get(url)
        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)

        category = Category.objects(key=kwargs.get('key')).first()
        if not category:
            common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
            return

        for product_node in tree.cssselect('div#searchResults a'):
            price_nodes = product_node.cssselect('.price-6pm')
            listprice_nodes = product_node.cssselect('.discount')
            price = price_nodes[0].text if price_nodes else None
            listprice = ''.join(listprice_nodes[0].xpath('text()')) if listprice_nodes else None

            # Eliminate products without a discount (no sale price or no list price).
            if price is None or listprice is None:
                continue

            key = product_node.get('data-product-id')
            if not key:
                common_failed.send(sender=ctx, url=url, reason='listing product has no key')
                continue

            combine_url = product_node.get('href')
            if not combine_url:
                common_failed.send(sender=ctx, url=url, reason='listing product %s cannot crawl combine_url' % key)
                continue

            # The product key combines the product id with the last path
            # segment of the product link.
            key = '%s_%s' % (key, combine_url.split('/')[-1])
            if not re.search(r'https?://.+', combine_url):
                combine_url = '%s%s' % (HOST, combine_url)

            brand_nodes = product_node.cssselect('.brandName')
            title_nodes = product_node.cssselect('.productName')
            if not brand_nodes or not title_nodes:
                common_failed.send(sender=ctx, url=url, reason='listing product %s has no brand or title node' % key)
                continue
            # ``text`` is None for an element without direct text content.
            brand = (brand_nodes[0].text or '').strip()
            title = (title_nodes[0].text or '').strip()

            is_new = False
            is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            if title and title != product.title:
                product.title = title
                is_updated = True

            if brand and brand != product.brand:
                product.brand = brand
                is_updated = True

            if combine_url and combine_url != product.combine_url:
                product.combine_url = combine_url
                is_updated = True

            if price and price != product.price:
                product.price = price
                is_updated = True

            if listprice and listprice != product.listprice:
                product.listprice = listprice
                is_updated = True

            # ``product.dept`` may be unset; treat it as an empty list.
            current_dept = product.dept or []
            if category.cats and set(category.cats).difference(current_dept):
                product.dept = list(set(category.cats) | set(current_dept))
                is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            if is_updated:
                product.list_update_time = datetime.utcnow()

            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            selected = Picker(site='6pm').pick(product)
            if not selected:
                continue

            product.hit_time = datetime.utcnow()
            product.save()

            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
                              is_new=is_new, is_updated=((not is_new) and is_updated))

            # Progress output for every saved product.
            print(product.key)
            print(product.brand)
            print(product.title)
            print('%s  /  %s' % (product.price, product.listprice))
            print(product.combine_url)
            print(product.dept)
            print('')

        # Go to the next page to keep on crawling.
        page_nodes = tree.cssselect('div.pagination')
        if not page_nodes:
            return
        pagination = page_nodes[0]

        # The last anchor of the pager is only followed while the pager
        # contains an element with class ``last``.
        # NOTE(review): presumably ``.last`` is absent on the final page -- confirm.
        if not pagination.cssselect('.last'):
            return

        links = pagination.cssselect('a')
        next_page = links[-1].get('href') if links else None
        if not next_page:
            return

        if not re.search(r'https?://.+', next_page):
            next_page = '%s%s' % (HOST, next_page)
        print(next_page)
        self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
예제 #3
0
    def crawl_listing(self, url, ctx='', **kwargs):
        """Crawl one Saks Fifth Avenue listing page and follow its "next" link.

        The page is fetched with ``Ns=P_sale_flag|1``. Every ``div`` under
        ``div#product-container`` whose id contains ``product`` and that
        carries both a list price and a sale price is upserted into
        ``Product``; it is saved and announced through ``common_saved`` only
        when ``Picker(site='saksfifthavenue')`` selects it. Products whose
        markup lacks the link, brand or title node are reported through
        ``common_failed`` and skipped.

        Crawling of the page (and of all following pages) stops once three
        products in a row have no discount.

        :param url: listing page URL.
        :param ctx: sender passed to the ``common_failed`` / ``common_saved``
            signals.
        :param kwargs: forwarded to the recursive call; ``key`` is used to
            look up the ``Category``.
        :raises requests.exceptions.HTTPError: on a non-2xx response.
        """
        res = requests.get(url, params={'Ns': 'P_sale_flag|1'})
        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)

        category = Category.objects(key=kwargs.get('key')).first()
        if not category:
            print('Category does not exist')
            common_failed.send(sender=ctx, url=url, reason='Category does not exist -> {0} .'.format(kwargs))
            return

        # Number of consecutive products without a discount. A non-discounted
        # product sometimes shows up between discounted ones even though the
        # listing is ordered by sale, so a few misses in a row are tolerated.
        no_discount_num = 0

        for product_node in tree.cssselect('div#product-container div'):
            key = product_node.get('id')
            if not key or 'product' not in key.lower():
                continue

            info_nodes = product_node.cssselect('div.product-text a')
            if not info_nodes:
                common_failed.send(sender=ctx, url=url, reason='listing product %s has no info node' % key)
                continue
            info_node = info_nodes[0]

            price = None
            listprice = None
            listprice_nodes = info_node.cssselect('span.product-price')
            price_nodes = info_node.cssselect('span.product-sale-price')
            if listprice_nodes:
                listprice = ''.join(listprice_nodes[0].xpath('.//text()')).strip()
            if price_nodes:
                price = ''.join(price_nodes[0].xpath('.//text()')).strip()

            if price is None or listprice is None:
                no_discount_num += 1
                if no_discount_num < 3:
                    continue
                # Three misses in a row: stop crawling (including next pages).
                return
            no_discount_num = 0

            brand_nodes = info_node.cssselect('p span.product-designer-name')
            title_nodes = info_node.cssselect('p.product-description')
            if not brand_nodes or not title_nodes:
                common_failed.send(sender=ctx, url=url, reason='listing product %s has no brand or title node' % key)
                continue
            # ``text`` is None for an element without direct text content.
            brand = (brand_nodes[0].text or '').strip()
            title = (title_nodes[0].text or '').strip()
            combine_url = info_node.get('href')

            is_new = False
            is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            if title and title != product.title:
                product.title = title
                is_updated = True
                product.update_history['title'] = datetime.utcnow()

            if brand and brand != product.brand:
                product.brand = brand
                is_updated = True

            if combine_url and combine_url != product.combine_url:
                product.combine_url = combine_url
                is_updated = True
                product.update_history['combine_url'] = datetime.utcnow()

            if price and price != product.price:
                product.price = price
                is_updated = True

            if listprice and listprice != product.listprice:
                product.listprice = listprice
                is_updated = True

            # ``product.dept`` may be unset; treat it as an empty list.
            current_dept = product.dept or []
            if category.cats and set(category.cats).difference(current_dept):
                product.dept = list(set(category.cats) | set(current_dept))
                is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            if is_updated:
                product.list_update_time = datetime.utcnow()

            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            selected = Picker(site='saksfifthavenue').pick(product)
            if not selected:
                continue

            product.hit_time = datetime.utcnow()
            product.save()

            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
                              is_new=is_new, is_updated=((not is_new) and is_updated))

        # Go to the next page to keep on crawling.
        next_page = None
        for page_node in tree.cssselect('div.pagination-container ol.pa-page-number li a'):
            if page_node.get('class') != 'next':
                continue
            href = page_node.get('href')
            if href:
                if re.search(r'https?://.+', href):
                    next_page = href
                else:
                    # Join with exactly one slash so a root-relative href
                    # does not produce "host//path".
                    next_page = '{0}/{1}'.format(HOST.rstrip('/'), href.lstrip('/'))
            # Only the first "next" link is considered.
            break

        if next_page:
            print(next_page)
            self.crawl_listing(url=next_page, ctx=ctx, **kwargs)