Example No. 1
    def start_requests(self):
        """Generate Requests from the SEARCH_URL and the search terms."""
        for st in self.searchterms:
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=urllib.quote_plus(st.encode('utf-8')),
                ),
                self._parse_all_cat,
                meta={'search_term': st, 'remaining': self.quantity},
            )

        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            prod['url'] = self.product_url
            prod['search_term'] = ''
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta={'product': prod})

        if self.products_url:
            urls = self.products_url.split('||||')
            for url in urls:
                prod = SiteProductItem()
                prod['url'] = url
                prod['search_term'] = ''
                yield Request(url,
                              self._parse_single_product,
                              meta={'product': prod})
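
All of the examples on this page are methods excerpted from Scrapy spiders that share a common base class, so helpers such as `self.url_formatter` are used but never defined here. Judging from the call sites (a `format(template, **kwargs)` method plus a mutable `defaults` dict, as in Examples 14 and 16), a minimal compatible sketch of that helper could look like this; the class name and implementation are assumptions, not the project's actual code:

    class UrlFormatter(object):
        """Hypothetical stand-in for the url_formatter helper assumed
        by these excerpts."""

        def __init__(self, **defaults):
            self.defaults = defaults

        def format(self, template, **kwargs):
            # Per-call keyword arguments override the stored defaults.
            values = dict(self.defaults)
            values.update(kwargs)
            return template.format(**values)

For instance, `UrlFormatter(page=1).format(SEARCH_URL, search_term='laptop')` would fill both placeholders in a `SEARCH_URL` template.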
Example No. 2
    def _scrape_product_links(self, response):
        def full_url(url):
            return urlparse.urljoin(response.url, url)

        self.total_matched = 0

        links = response.xpath(
            "//div[@class='result-row']"
            "/article/a[@class='product-link']/@href").extract()

        if not links:
            no_results = response.xpath(
                "//div[@class='mod-important']/h1/text()").re(r'No results.*')
            if not no_results:
# Extract links from the brand page
                links = response.xpath(
                    "//div[@id='content']/div/div/div/section/section"
                    "/div/ul/li/a/@href").extract()

                if links:
                    url = full_url(links.pop(0))
                    new_meta = response.meta.copy()
                    new_meta['pages_wlinks'] = links
                    new_meta['ranking'] = 1
                    new_meta['count'] = 0
                    new_meta['links'] = []
                    yield Request(url, self._scrape_brand_links,
                                  meta=new_meta), SiteProductItem()
                    return
        if not links:
            self.log("Found no product links.", DEBUG)

        for link in links:
            yield full_url(link), SiteProductItem()
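
Note that `_scrape_product_links` yields `(link, item)` pairs rather than Scrapy requests; a method in the shared base class presumably turns each pair into a product-page request. A rough, hypothetical sketch of such a consumer, assuming the base class passes the item along in `meta['product']` (the method name and details are illustrative, not the project's actual code):

    from scrapy import Request

    def _scrape_results(self, response):
        """Hypothetical base-class consumer for the (link, item) pairs."""
        for link, item in self._scrape_product_links(response):
            if isinstance(link, Request):
                # Some spiders yield a ready-made Request for the link.
                link.meta['product'] = item
                yield link
            elif link is not None:
                yield Request(link, callback=self.parse_product,
                              meta={'product': item})
            else:
                # (None, product) pairs carry fully populated items that
                # need no further page fetch.
                yield item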
Example No. 3
    def _parse_single_product(self, response):
        productdata = "[" + is_empty(
            response.xpath('//meta[@name="productdata"]/@content').extract(),
            "")[:-1].replace("|", ",") + "]"
        productdata = is_empty(json.loads(productdata))
        product = SiteProductItem()
        if productdata:
            product["title"] = productdata["name"]
            product["is_out_of_stock"] = not productdata["available"]
            product["url"] = "http://www.tesco.com/groceries/product/details/" \
                "?id=" + str(productdata["productId"])
            regex = r"id=([A-Z0-9\-]+)"
            reseller_id = re.findall(regex, product.get('url', ''))
            reseller_id = reseller_id[0] if reseller_id else None
            cond_set_value(product, "reseller_id", reseller_id)
            try:
                product["price"] = Price(price=productdata["price"],
                                         priceCurrency="GBP")
            except (KeyError, ValueError):  # price missing or malformed
                pass
            product["image_url"] = productdata["mediumImage"]
            product["search_term"] = ""
            product["brand"] = is_empty(self.brand_from_title(
                product["title"]))
            product["site"] = is_empty(self.allowed_domains)
        if self.product_url:
            product['is_single_result'] = True
            if "search_term" in product:
                del product['search_term']
        return product
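
`is_empty`, used heavily above, is also defined elsewhere in the project. From its call sites (it receives a list and an optional default, and returns a single value), it behaves like a first-or-default helper; a minimal sketch consistent with that usage:

    def is_empty(sequence, default=None):
        # Return the first element, or `default` for an empty sequence,
        # e.g. is_empty(response.xpath(...).extract(), '').
        return sequence[0] if sequence else default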
Example No. 4
    def start_requests(self):
        for st in self.searchterms:
            form_data = self.FORM_DATA.copy()
            form_data['searchTerm'] = st
            form_data['orderBy'] = self.SORT
            self.pages[st] = 0
            # send request just to count number of total results
            yield Request(url=self.url_formatter.format(self.FIRST_URL,
                                                        search_term=quote(st)),
                          callback=self.parse_total_and_start_search,
                          meta={
                              'form_data': form_data,
                              'search_term': st,
                              'remaining': self.quantity
                          })
        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            prod['url'] = self.product_url
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta={'product': prod})

        if self.products_url:
            urls = self.products_url.split('||||')
            for url in urls:
                prod = SiteProductItem()
                prod['url'] = url
                prod['search_term'] = ''
                yield Request(url,
                              self._parse_single_product,
                              meta={'product': prod})
Example No. 5
    def start_requests(self):
        for st in self.searchterms:
            url = self.url_formatter.format(
                self.SEARCH_URL,
                search_term=urllib.quote_plus(st.encode('utf-8')),
                page=''  # don't set for first request, or results will differ
            )
            self.pages[st] = 2
            yield Request(url,
                          meta=dict(search_term=st, remaining=self.quantity))

        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            prod['url'] = self.product_url
            prod['search_term'] = ''
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta={'product': prod})

        if self.products_url:
            urls = self.products_url.split('||||')
            for url in urls:
                prod = SiteProductItem()
                prod['url'] = url
                prod['search_term'] = ''
                yield Request(url,
                              self._parse_single_product,
                              meta={'product': prod})
Example No. 6
    def _scrape_product_links(self, response):
        """
        Scraping product links from search page
        """

        items = response.xpath(
            '//ul[@id="prod-list"]/li[contains(@class, "product-list-item")]')

        if items:
            for item in items:
                link = is_empty(
                    item.xpath('./span[@class="product-name-header"]/'
                               'a/@href').extract())
                res_item = SiteProductItem()
                yield link, res_item
        else:
            links = re.findall(
                r'<a href=\\"(\/p\/\d+)\\"',
                response.body_as_unicode().replace('\u003c',
                                                   '<').replace('\u003e', '>'))
            if links:
                links = list(set(links))
                for link in links:
                    res_item = SiteProductItem()
                    yield link, res_item
            else:
                self.log("Found no product links.".format(response.url), INFO)
Example No. 7
    def _scrape_product_links(self, response):
        data = WaitroseProductsSpider._get_data(response)
        for product_data in data['products']:
            product = SiteProductItem()

            for product_key, data_key in self._PRODUCT_TO_DATA_KEYS.items():
                value = product_data.get(data_key, 'null')
                if value != 'null':
                    product[product_key] = product_data[data_key]

            image_url = product.get('image_url')
            if image_url:
                # Normalise scheme-relative URLs such as "//host/img.jpg".
                product['image_url'] = urlparse.urljoin('http://', image_url)

            # This one is not in the mapping since it requires transformation.
            #product['upc'] = int(product_data['productid'])

            if product.get('price', None):
                price = product['price']
                # Pounds arrive as "&pound;1.89" and pence as "89p"; map
                # the pound entity onto the same "p" marker so both cases
                # share one code path below.
                price = price.replace('&pound;', 'p')
                price = re.findall(r'(p? *[\d ,.]+ *p?) *', price)
                price = price[0] if price else ''
                if price.endswith('p'):
                    # Pence-only price, e.g. "89p" -> "0.89p".
                    price = '0.' + price.strip()
                if 'p' in price:
                    price = re.sub('[p ,]', '', price)
                    product['price'] = Price(priceCurrency='GBP', price=price)
                else:
                    self.log('Unknown price format at %s' % response.url)

            if not product.get('url', '').startswith('http'):
                product['url'] = urlparse.urljoin('http://www.waitrose.com',
                                                  product['url'])

            yield product['url'], product
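
The pence handling in this example is easy to misread: prices arrive either in pounds ("&pound;1.89") or in pence ("89p"), and both are funnelled through the same "p" marker before the digits are isolated. The same normalisation as a standalone, hypothetical helper (not part of the original spider) may make the two cases clearer:

    import re
    from decimal import Decimal


    def parse_gbp(price_text):
        """Normalise "&pound;1.89" or "89p" style strings to a Decimal
        amount in pounds; return None for anything unrecognised."""
        text = price_text.strip()
        if text.startswith('&pound;'):
            # Pound-denominated: "&pound;1.89" -> Decimal("1.89")
            match = re.search(r'([\d,]+(?:\.\d+)?)', text)
            return Decimal(match.group(1).replace(',', '')) if match else None
        match = re.search(r'([\d,]+(?:\.\d+)?)\s*p\b', text)
        if match:
            # Pence-denominated: "89p" -> Decimal("0.89")
            return Decimal(match.group(1).replace(',', '')) / 100
        return None

So `parse_gbp('&pound;1.89')` yields `Decimal('1.89')` and `parse_gbp('89p')` yields `Decimal('0.89')`.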
Example No. 8
    def _scrape_product_links(self, response):
        links = response.xpath(
            "//div[@class='productSearchResults']"
            "/div[@id='ProductViewListGrid']"
            "/div[contains(@class,'product_item')]"
            "/*/*/div[@class='pl_productName']/h5/a/@href").extract()

        no_results = response.xpath(
            "//div[@class='searchResultsSummary']"
            "/h1/text()").re(r'.*We\'re sorry.*could not find.*')

        if no_results:
            links = []

        if not links:
            menu_links = response.xpath(
                "//div[@class='narrowResults']/div/ul/li/a/@href").extract()
            if not menu_links:
                self.log("Found no product links.", ERROR)
                return
            url = menu_links.pop(0)
            new_meta = response.meta.copy()
            new_meta['pages_wlinks'] = menu_links
            new_meta['count'] = 0
            yield Request(url, self._scrape_brand_links,
                          meta=new_meta), SiteProductItem()
            return

        for link in links:
            yield link, SiteProductItem()
Example No. 9
    def _parse_single_product(self, response):
        product = response.meta["product"]
        result = self._scrape_product_links(response)

        for p in result:
            for p2 in p:
                if isinstance(p2, SiteProductItem):
                    if "search_term" in p2:
                        del p2["search_term"]
                    product = SiteProductItem(
                        dict(p2.items() + product.items()))

        try:
            data = json.loads(response.body_as_unicode())
            item = data['items'][0]
            if item.get("images", {}).get("largeImage"):
                product["image_url"] = item.get("images").get("largeImage")
            product['upc'] = item['upcNumbers'][0]['upcNumber']
        except (IndexError, KeyError, ValueError):
            pass

        product_id = re.findall(r'itemid=(\d+)', response.url)
        if product_id:
            url = self.REVIEW_URL % product_id[0]
            meta = {'product': product}
            return Request(url=url, meta=meta, callback=self._parse_review)

        return product
Example No. 10
    def set_zip_code(self, response):
        zip_code_stage = response.meta.get('zip_code_stage')
        self.log("zip code stage: %s" % zip_code_stage, DEBUG)
        if zip_code_stage == 1:
            new_meta = response.meta.copy()
            new_meta['zip_code_stage'] = 2
            request = Request(
                url=self.STORES_JSON.format(zip_code=self.zip_code),
                callback=self.set_zip_code,
                headers={'X-Crawlera-Cookies': 'disable'},
                meta=new_meta)
            yield request

        elif zip_code_stage == 2:
            stores_json = json.loads(response.body)
            near_store = stores_json['Location'][0]
            new_meta = response.meta.copy()
            new_meta['zip_code_stage'] = 3
            request = Request(
                url=self.SELECT_STORE.format(key=near_store['KEY']),
                headers={'X-Crawlera-Cookies': 'disable'},
                callback=self.set_zip_code,
                meta=new_meta)
            yield request

        else:
            for st in self.searchterms:
                yield Request(
                    self.url_formatter.format(
                        self.SEARCH_URL,
                        search_term=urllib.quote_plus(st.encode('utf-8')),
                    ),
                    headers={'X-Crawlera-Cookies': 'disable'},
                    meta={
                        'search_term': st,
                        'remaining': self.quantity
                    },
                )

            if self.product_url:
                prod = SiteProductItem()
                prod['is_single_result'] = True
                prod['url'] = self.product_url
                prod['search_term'] = ''
                yield Request(self.product_url,
                              self._parse_single_product,
                              headers={'X-Crawlera-Cookies': 'disable'},
                              meta={'product': prod})

            if self.products_url:
                urls = self.products_url.split('||||')
                for url in urls:
                    prod = SiteProductItem()
                    prod['url'] = url
                    prod['search_term'] = ''
                    yield Request(url,
                                  self._parse_single_product,
                                  headers={'X-Crawlera-Cookies': 'disable'},
                                  meta={'product': prod})
Example No. 11
    def _scrape_product_links(self, response):
        # To populate the description, fetching the product page is necessary.

        if self.user_agent_key not in ["desktop", "default"]:
            links = response.xpath(
                '//section[contains(@class,"product_listed")]'
                '//div[contains(@class,"product_info")]//a/@href').extract()

            if not links:
                self.log(
                    "[Mobile] Found no product data on: %s" % response.url,
                    ERROR)

            for link in links:
                yield urlparse.urljoin(response.url, link), SiteProductItem()
        else:
            url = response.url

            # This will contain everything except for the URL and description.
            product_jsons = response.xpath(
                '//meta[@name="productdata"]/@content').extract()

            if not product_jsons:
                self.log("Found no product data on: %s" % url, ERROR)
                return

            product_links = response.css(
                ".product > .desc > h2 > a ::attr('href')").extract()
            if not product_links:
                self.log("Found no product links on: %s" % url, ERROR)

            for product_json, product_link in zip(product_jsons[0].split('|'),
                                                  product_links):
                prod = SiteProductItem()
                cond_set_value(prod, 'url',
                               urlparse.urljoin(url, product_link))

                product_data = json.loads(product_json)

                cond_set_value(prod, 'price', product_data.get('price'))
                cond_set_value(prod, 'image_url',
                               product_data.get('mediumImage'))

                #prod['upc'] = product_data.get('productId')
                if prod.get('price', None):
                    prod['price'] = Price(price=str(prod['price']).replace(
                        ',', '').strip(),
                                          priceCurrency='GBP')

                try:
                    brand, title = self.brand_from_title(product_data['name'])
                    cond_set_value(prod, 'brand', brand)
                    cond_set_value(prod, 'title', title)
                except KeyError:
                    raise AssertionError(
                        "Did not find title or brand from JS for product: %s" %
                        product_link)

                yield None, prod
Example No. 12
    def _scrape_product_links(self, response):
        for box in self._fetch_product_boxes(response):
            url = urlparse.urljoin(response.url, self._link_from_box(box))
            product = SiteProductItem()
            self._populate_from_box(response, box, product)
            if not product.get('brand', None):
                dump_url_to_file(response.url)
            meta = response.meta.copy()
            meta['product'] = product
            # Rotate through USER_AGENT_LIST round-robin: take the first
            # agent and push it to the back for the next request.
            user_agent = USER_AGENT_LIST.pop(0)
            USER_AGENT_LIST.append(user_agent)
            request = Request(url, callback=self.parse_product, meta=meta)
            request.headers.setdefault('User-Agent', user_agent)
            yield request, product
Example No. 13
    def _scrape_product_links(self, response):
        products = response.xpath('//ol[contains(@class, "search-results")]'
                                  '//div[contains(@class, "sc_result_list")]')
        if not products:
            self.log("Found no product links.", ERROR)

        for product in products:
            prod_links = product.xpath(
                './/div[contains(@class, "sc_result_title")]//a/@href'
            ).extract()
            if not prod_links:
                self.log(
                    "Failed to extract product link for item: %r" %
                    (product.extract(), ), ERROR)
                continue
            prod_link = prod_links[0]

            item = SiteProductItem()

            cond_set(
                item,
                'title',
                product.css('div.sc_result_title a::text').extract(),
                conv=string.strip,
            )

            cond_set(
                item,
                'price',
                product.css('div.sc_result_price::text').re(r'(\d.+)'),
            )
            if item.get('price', None):
                if '€' not in item['price']:
                    self.log('Unknown currency at %s' % response.url)
                else:
                    item['price'] = Price(price=item['price'].replace(
                        ',', '').replace('€', '').strip(),
                                          priceCurrency='EUR')

            cond_set(
                item,
                'locale',
                response.xpath('//html/@lang').extract(),
                conv=string.strip,
            )
            cond_set_value(item, 'locale', 'fr-FR')  # Default.

            yield prod_link, item
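
`cond_set` and `cond_set_value` are likewise assumed rather than defined in these excerpts. Their usage suggests conditional setters that never overwrite a field that is already populated, which is why the 'fr-FR' default on the last line only applies when the `//html/@lang` extraction produced nothing. Minimal sketches matching that behaviour (signatures inferred, not copied from the project):

    def cond_set(item, key, values, conv=lambda v: v):
        # Set item[key] from the first candidate value, if any,
        # unless the field is already set.
        if values and item.get(key) is None:
            item[key] = conv(values[0])


    def cond_set_value(item, key, value):
        # Set item[key] to a single value unless the field is already
        # set or the value is None.
        if value is not None and item.get(key) is None:
            item[key] = value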
Example No. 14
    def start_requests(self):  # Stolen from walmart
        """Generate Requests from the SEARCH_URL and the search terms."""
        #settings.overrides['CRAWLERA_ENABLED'] = True
        self.url_formatter.defaults['page'] = 1
        for st in self.searchterms:
            yield Request(
                self.url_formatter.format(self.SEARCH_URL,
                                          search_term=urllib.quote_plus(
                                              st.encode('utf-8')),
                                          sort_mode=self.sort_mode),
                meta={
                    'search_term': st,
                    'remaining': self.quantity,
                    # 'dont_redirect': True, 'handle_httpstatus_list': [302],
                    'page': 1,
                },
                headers={"User-Agent": self.user_agent},
                dont_filter=True,
                cookies={
                    'shippingCountry': 'US',
                    'currency': 'USD'
                })

        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            prod['url'] = self.product_url
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta={'product': prod},
                          dont_filter=True)
Example No. 15
    def _scrape_product_links(self, response):
        urls = response.xpath(
            "//ul[contains(@class,'products-grid')]/li//a[contains(@class, 'product-image')]/@href"
        ).extract()
        urls = [
            urlparse.urljoin(response.url, x) if x.startswith('/') else x
            for x in urls
        ]

        if not urls:
            self.log("Found no product links.", DEBUG)

        # parse shelf category
        shelf_categories = response.xpath(
            '//div[@class="card_container"]//div[contains(@class, "no_gutter")]//a/@href'
        ).extract()
        shelf_categories = [category.strip() for category in shelf_categories]
        shelf_categories = filter(None, shelf_categories)
        shelf_name = None
        try:
            shelf_name = response.xpath(
                '//meta[@name="og:title"]/@content').extract()[0].strip()
        except IndexError:
            pass
        for url in urls:
            item = SiteProductItem()
            if shelf_categories:
                item['shelf_name'] = shelf_name
                item['shelf_path'] = shelf_categories[1:]
            yield url, item
Example No. 16
    def start_requests(self):
        """Generate Requests from the SEARCH_URL and the search terms."""
        self.url_formatter.defaults['page_no'] = 1
        for st in self.searchterms:
            search_term, search_term_upper = self.generate_search_terms(st)
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=search_term,
                    search_term_upper=search_term_upper,
                ),
                meta={
                    'search_term': st,
                    'remaining': self.quantity,
                    'page': 1
                },
                dont_filter=True,
            )

        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            prod['url'] = self.product_url
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta={'product': prod},
                          dont_filter=True)
Example No. 17
    def _get_json_data(self, item):
        product = SiteProductItem()
        item = item['productInfo']

        # salePrice wins over regularPrice when both are present.
        price_string = item['priceInfo'].get('salePrice') \
            or item['priceInfo'].get('regularPrice')
        if price_string:
            price = re.findall(r'(\d+\.\d+)', price_string)
            if price:
                # Use the last number in the string as the effective price.
                product['price'] = Price(price=float(price[-1]),
                                         priceCurrency='USD')

        messages = item.get('channelAvailability', [])
        for mes in messages:
            if 'displayText' in mes:
                if 'Not sold online' in mes['displayText']:
                    product['is_in_store_only'] = True
                if 'Out of stock online' in mes['displayText']:
                    product['is_out_of_stock'] = True

        upc = item.get('upc')
        cond_set_value(product, 'upc', upc)

        return product
Example No. 18
    def _scrape_product_links(self, response):
        if self.force_tires:
            links = response.xpath(
                "//ul[@id='productList']/li/div[@class='productImage']/a/@href"
            ).extract()
        else:
            text = response.body_as_unicode()
            m = re.match(r'^SRPInitialLoad\((.*)\)', text, re.DOTALL)
            links = []
            if m:
                jstext = m.group(1)
                try:
                    jsdata = json.loads(jstext)
                except ValueError:
                    jsdata = {}

                self.jsdata = jsdata
                results = self.jsdata.get('results') or []

                def full_ct_url(url):
                    return urlparse.urljoin('http://www.canadiantire.ca/', url)

                links = [
                    full_ct_url(x['field']['short-pdp-url'] + '.html')
                    for x in results
                ]

        if not links:
            self.log("Found no product links.", ERROR)

        for link in links:
            yield link, SiteProductItem()
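
The non-tires branch above strips a JSONP-style `SRPInitialLoad(...)` wrapper before parsing the payload as JSON. A self-contained illustration of that unwrap, with a made-up payload (the field names mirror the excerpt; the data is invented):

    import json
    import re

    text = ('SRPInitialLoad({"results": [{"field": '
            '{"short-pdp-url": "/pdp/claw-hammer-0575556p"}}]})')

    m = re.match(r'^SRPInitialLoad\((.*)\)', text, re.DOTALL)
    data = json.loads(m.group(1)) if m else {}
    for result in data.get('results', []):
        print(result['field']['short-pdp-url'] + '.html')
    # -> /pdp/claw-hammer-0575556p.html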
Example No. 19
    def _get_products(self, response):
        product_list = None
        try:
            body = json.loads(response.body)
            for lane in body['_embedded']['lanes']:
                if lane.get('type') != 'SearchLane':
                    continue
                product_list = lane['_embedded']['items']
                break
            if not product_list:
                self.log('No products were found.', DEBUG)
                return
            for product_info in product_list:
                product = self.__parse_product(
                    SiteProductItem(), product_info['_embedded']['product'])
                if not product:
                    continue
                product['url'] = \
                    self.BASE_URL + product_info['navItem']['link']['href']
                product['reseller_id'] = self._parse_reseller_id(
                    product.get('url', ''))
                yield product
        except Exception as e:
            self.log('Can\'t parse product list body. ERROR: %s.' % str(e),
                     ERROR)
            return
Example No. 20
    def parse_product(self, response):
        product = response.meta.get('product') or SiteProductItem()
        cond_set_value(product, 'shelf_path', response.meta.get('shelf_path'))
        cond_set_value(product, 'shelf_name', response.meta.get('shelf_name'))
        title = response.xpath('//h1/span/text()').extract()[0].strip()
        cond_set_value(product, 'title', title)
        data_body = response.xpath('//script[contains(text(), '
                                   '"merchantID")]/text()').extract()
        try:
            asin = re.findall(r'"ASIN" : "(\w+)"', data_body[0])[0]
        except IndexError:
            asin = re.findall(r'/([A-Z0-9]{10})', response.url)[0]
        cond_set_value(product, 'asin', asin)
        cond_set_value(product, 'url', response.url)
        cond_set_value(product, 'ranking', response.meta.get('ranking'))
        brand = self._parse_brand(response)
        cond_set_value(product, 'brand', brand)
        price = self._parse_price(response)
        cond_set_value(product, 'price', price)

        if self.match_target or self.match_walmart:
            req = Request(url='http://asintoupc.com', callback=self.get_payload, dont_filter=True)
            req.meta['product'] = product
            yield req
        else:
            yield product
Example No. 21
    def _scrape_product_links(self, response):
        urls = response.xpath(
            '//li[contains(@class,"productDisplay")]//div[@class="productDisplay_image"]/a/@href'
        ).extract()

        try:
            products = re.findall(
                'var\s?filterResults\s?=\s?jq\.parseJSON\([\'\"](\{.+?\})[\'\"]\);', response.body, re.MULTILINE)[0].decode(
                'string-escape')
            products = json.loads(products).get('organicZoneInfo').get('records')

            urls += [product.get('pdpUrl') for product in products]
        except Exception as e:
            self.log('Error loading JSON: %s at URL: %s' % (str(e), response.url), WARNING)
            self.log('Extracted urls using xpath: %s' % (len(urls)), WARNING)

        urls = [urlparse.urljoin(response.url, url) for url in urls]

        shelf_categories = response.xpath(
            '//*[contains(@data-anid, "breadcrumbIndex_")]/text()').extract()
        shelf_category = shelf_categories[-1] if shelf_categories else None

        for url in urls:
            item = SiteProductItem()
            if shelf_category:
                item['shelf_name'] = shelf_category
            if shelf_categories:
                item['shelf_path'] = shelf_categories
            yield url, item
Example No. 22
    def _scrape_product_links(self, response):
        links = response.xpath(
            '//ul[contains(@class,"search-result-items")]/li/a/@href'
        ).extract()
        if not links:
            links = response.xpath('//a[@class="name-link"]/@href').extract()
        if links:
            for i in range(len(links)):
                if self.ROOT_URL not in links[i]:
                    links[i] = 'http://' + self.ROOT_URL + links[i]
        cats = response.xpath('.//link[@rel="canonical"]/@href').extract()
        shelf_categories = []
        shelf_category = ''
        if cats:
            shelf_categories = [
                c.strip() for c in cats[0].split('/') if len(c.strip()) > 1
            ]
            shelf_category = shelf_categories[-1] if shelf_categories else None
        for item_url in links:
            item = SiteProductItem()
            if shelf_category:
                item['shelf_name'] = shelf_category
            if shelf_categories:
                item['shelf_path'] = shelf_categories
            yield item_url, item
Example No. 23
    def _scrape_product_links(self, response):
        boxes = self._fetch_product_boxes(response)
        for box in boxes:

            # Fetch product url
            try:
                url = self._link_from_box(box)
            except IndexError:  # a missing link is the most likely failure
                self.log('IndexError on %s' % response.url, ERROR)
                url = None
            if self.REQUIRE_PRODUCT_PAGE and url is None:
                self.log('No link found for product on %s' % response.url,
                         DEBUG)

            product = SiteProductItem()
            meta = self._populate_from_box(response, box, product)
            self._populate_hardcoded_fields(product)
            self._get_model_from_title(product)

            new_meta = response.meta.copy() if hasattr(response, 'meta') \
                else {}
            if meta and url:
                new_meta.update(meta)
            if url:
                new_meta['product'] = product
                yield Request(urlparse.urljoin(response.url, url),
                              self.parse_product,
                              meta=new_meta,
                              errback=self._handle_product_page_error), product
            else:
                yield None, product
Example No. 24
    def _scrape_product_links(self, response):
        boxes = response.css('.product-description')
        for box in boxes:
            product = SiteProductItem()
            url = box.xpath('h3/a/@href').extract()
            cond_set(product, 'brand', box.xpath('p/text()').extract())
            if url:
                yield url[0], product
Example No. 25
    def _scrape_product_links(self, response):
        urls = response.xpath(
            "//div[contains(@class,'product') "
            "and contains(@class,'plp-grid')]"
            "//descendant::a[contains(@class, 'item_description')]/@href"
        ).extract()
        urls = [
            urlparse.urljoin(response.url, x) if x.startswith('/') else x
            for x in urls
        ]

        if not urls:
            self.log("Found no product links.", DEBUG)

        # parse shelf category
        shelf_categories = response.xpath(
            '//ul[@id="headerCrumb"]/li//text()').extract()
        shelf_categories = [category.strip() for category in shelf_categories]
        shelf_categories = filter(None, shelf_categories)
        shelf_name = None
        try:
            shelf_name = response.xpath(
                '//h1[@class="page-title"]/text()').extract()[0].strip()
        except IndexError:
            pass
        for url in urls:
            if url in self.product_filter:
                continue
            self.product_filter.append(url)
            item = SiteProductItem()
            if shelf_categories:
                item['shelf_name'] = shelf_name
                item['shelf_path'] = shelf_categories[1:]
            yield url, item
Example No. 26
    def _scrape_product_links(self, response):
        """
        Scrape product links from the search page.
        """
        shelf_categories = response.meta.get('shelf_categories')
        links = response.xpath(
            './/a[contains(@class, "product")]/@href'
        ).extract()
        if links:
            if not shelf_categories:
                shelf_categories = self._get_shelf_path(response)
            shelf_category = shelf_categories[-1] if shelf_categories else None
            for link in links:
                # sometimes there is link to category instead of a product like here:
                # https://www.microsoftstore.com/store/msusa/en_US/cat/Microsoft-Lumia/categoryID.66852000?icid=en_US_Homepage_whatsnew_5_TEST_EDU_160525
                if '/pdp/' not in link:
                    self.log("Found shelf link instead of product link {url}".format(url=link), INFO)
                else:
                    item = SiteProductItem()
                    if shelf_category:
                        item['shelf_name'] = shelf_category
                    if shelf_categories:
                        item['shelf_path'] = shelf_categories
                    yield urlparse.urljoin(response.url, link), item

        else:
            self.log("Found no product links in {url}".format(url=response.url), INFO)
Example No. 27
    def _scrape_product_links(self, response):
        links = response.xpath('//h4[@class="productTitle"]/a/@href').extract()
        if not links:
            self.log("Found no product links.", ERROR)

        for link in links:
            yield link, SiteProductItem()
Example No. 28
    def start_requests(self):
        for st in self.searchterms:
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=urllib.quote_plus(st.encode('utf-8')),
                ),
                meta={
                    'search_term': st,
                    'remaining': self.quantity
                },
            )

        if self.product_url:
            pId = is_empty(re.findall(r"product/.*/(\d+)", self.product_url))
            if pId:
                url = "http://groceries.asda.com/api/items/view?" \
                    "itemid=" + pId + "&responsegroup=extended" \
                    "&cacheable=true&shipdate=currentDate" \
                    "&requestorigin=gi"

                prod = SiteProductItem()
                prod['is_single_result'] = True
                prod["url"] = self.product_url
                prod["reseller_id"] = pId
                yield Request(url,
                              self._parse_single_product,
                              meta={'product': prod})
Example No. 29
    def start_requests(self):
        cookies = {'pageTemplate': 'new'}
        for st in self.searchterms:
            url = self.url_formatter.format(self.SEARCH_URL,
                                            search_term=urllib.quote_plus(
                                                st.encode('utf-8')),
                                            start=0,
                                            sort_mode=self.SORTING or '')
            yield Request(url,
                          meta={
                              'search_term': st,
                              'remaining': self.quantity
                          },
                          cookies=cookies)

        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta={
                              'product': prod,
                              'handle_httpstatus_list': [404]
                          },
                          cookies=cookies)
Example No. 30
    def _scrape_product_links(self, response):
        urls = response.xpath(
            '//a[contains(@property, "url")]/@href').extract()
        if not urls:
            urls = response.xpath(
                './/div[@class="product-info"]/a[contains(@class, "product-title")]/@href'
            ).extract()
        if not urls:
            urls = response.xpath(
                '//a[@class="product-title scTrack pfm"]/@href').extract()
        urls = [urlparse.urljoin(response.url, x) for x in urls]
        shelf_category = response.xpath('//h1/text()').extract()
        if shelf_category:
            shelf_category = shelf_category[0].strip(' \t\n')
        shelf_path = response.xpath(
            '//div[contains(@class, "stp--breadcrumbs")]/ul/li/a/text()'
            ' | //div[contains(@class, "stp--breadcrumbs")]/ul/li[@class="last"]/text()'
        ).extract()

        for url in urls:
            item = SiteProductItem()
            if shelf_category:
                item['shelf_name'] = shelf_category
            if shelf_path:
                item['shelf_path'] = shelf_path
            yield url, item