def parse_product(self, response):
    """Parse a product listing page and yield matching Product items.

    Each listed product's manufacturer id (the first ``<nobr>`` text)
    is compared against ``response.meta['mfrgid']``; only products
    whose ids agree are yielded.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body)
    # Product rows are the grandparents of the ProductDetail links;
    # a set collapses rows that contain several such links.
    products = soup.findAll('a', href=re.compile('ProductDetail'))
    products = {product.parent.parent for product in products}
    for product in products:
        product_loader = ProductLoader(item=Product(), response=response)
        name = product.findAll('font')[1].text
        price = product.find('nobr', text=re.compile(r'\$'))
        url = product.find('a', href=re.compile('ProductDetail'))
        if url:
            url = urljoin_rfc(get_base_url(response), url['href'])
        else:
            url = response.url
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        # Fixed: 'url' was added twice, duplicating the value in the item.
        product_loader.add_value('url', url)
        product_loader.add_value('sku', response.meta['sku'])
        site_mfrgid = product.find('nobr').text
        if site_mfrgid:
            site_mfrgid = site_mfrgid.strip().lower()
            mfrgid = response.meta['mfrgid'].strip().lower()
            if site_mfrgid == mfrgid:
                yield product_loader.load_item()
def collect_price(self, hxs, response):
    """Extract the product price from the buy form.

    Tries BeautifulSoup first; if that raises (e.g. the ``handleBuy``
    form is missing, making ``soup_form`` None), falls back to a chain
    of XPath selectors. Returns the extracted price or None.
    """
    soup = BeautifulSoup(response.body)
    try:
        soup_form = soup.find(id='handleBuy')
        price = soup_form.find('b', 'priceLarge')
        if not price:
            price = soup_form.find('span', 'priceLarge')
        if not price:
            price = soup_form.find('span', 'price')
        if not price:
            price = soup_form.find('span', 'pa_price')
        if price:
            price = self.extract_price(price.text)
        else:
            price = None
    except Exception:
        # Fixed: bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        price = hxs.select('//div[@id="price"]//td[text()="Price:"]'
                           '/following-sibling::td/span/text()').extract()
        if not price:
            price = hxs.select(
                '//span[@id="priceblock_saleprice"]/text()').extract()
        if not price:
            price = hxs.select(
                '//span[@id="priceblock_ourprice"]/text()').extract()
        if not price:
            price = hxs.select(
                '//span[@id="actualPriceValue"]/*[@class="priceLarge"]/text()'
            ).extract()
        if price:
            price = self.extract_price(price[0])
        else:
            price = None
    return price
def parse(self, response):
    """Parse a search results page.

    Queues the first result for ``parse_product`` and carries the rest
    in ``meta['next_results']``; when no products are found, advances
    to the next search URL in ``meta['search_urls']``.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body)
    container = soup.find('div', id='atfResults')
    # Fixed: an 'atfResults' div with no matching result rows used to
    # raise IndexError on meta['next_results'][0]; an empty list now
    # takes the no-products branch instead.
    products = (container.findAll('div', id=re.compile(r'result_\d+$'))
                if container else [])
    if products:
        meta = response.meta
        meta['next_results'] = []
        # Pagination deliberately disabled
        # (was: //a[@class="pagnNext"]/@href).
        next_page = []
        if next_page:
            next_page = urljoin_rfc(get_base_url(response), next_page[0])
            meta['next_page'] = next_page
        for product in products:
            url = product.find('a')['href']
            url = urljoin_rfc(get_base_url(response), url)
            meta['next_results'].append(url)
        first_url = meta['next_results'][0]
        meta['next_results'] = meta['next_results'][1:]
        yield Request(first_url, callback=self.parse_product, meta=meta,
                      dont_filter=True)
    else:
        log.msg('No products.')
        meta = response.meta
        if meta.get('search_urls'):
            search_url = meta['search_urls'][0]
            meta['search_urls'] = meta['search_urls'][1:]
            yield Request(search_url % {'q': meta['sku']}, meta=meta)
def parse(self, response):
    """Queue a parse_brand request for every brand link.

    The XPath pass runs first; the BeautifulSoup pass repeats the
    extraction for markup lxml could not handle (Scrapy's duplicate
    filter collapses any overlap).
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body)
    for anchor in hxs.select("//section[@id='All']//ul[@class='BrandCol1']//a"):
        href = anchor.select("./@href").extract()[0].replace('//', '/')
        link = urljoin_rfc(base_url, href)
        log.msg(link)
        name = anchor.select("./text()").extract()[0]
        yield Request(url=link, meta={'brand': name},
                      callback=self.parse_brand, priority=10)
    section = soup.find('section', id='All')
    if section:
        for tag in section.findAll('a'):
            yield Request(urljoin_rfc(base_url, tag['href'].replace('//', '/')),
                          meta={'brand': tag.text},
                          callback=self.parse_brand, priority=10)
def parse(self, response):
    """Scan the search results and yield only the cheapest product
    whose price passes ``valid_price``."""
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        soup = BeautifulSoup(result.extract())
        heading = soup.find('h3', attrs={'class': 'newaps'})
        loader.add_value('name', heading.findAll('span')[0].string)
        loader.add_value('url', heading.findAll('a')[0]['href'])
        loader.add_value('price',
                         soup.find('ul', attrs={'class': 'rsltL'})
                             .findAll('span')[0].string)
        price = loader.get_output_value('price')
        if not price:
            continue
        if best is not None and best.get_output_value('price') <= price:
            continue
        if valid_price(response.meta['price'], price):
            best = loader
    if best:
        yield best.load_item()
def parse_product(self, response):
    """Parse the product listing table.

    Uses XPath when lxml parsed the table (20+ rows), otherwise falls
    back to BeautifulSoup for the broken markup.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//table[@class="productListing"]/tr')[1:]
    if len(products) < 20:
        # lxml failed to parse the listing; fall back to BeautifulSoup.
        soup = BeautifulSoup(response.body)
        products = soup.find('table',
                             {'class': 'productListing'}).findAll('tr')[1:]
        for product in products:
            cells = product.findAll('td')
            name = cells[1].find('a').contents
            url = cells[1].find('a')['href']
            price = re.findall(r'[0-9.]+', cells[2].text)
            if not price:
                # Fixed: rows without a numeric price used to raise
                # IndexError on price[0].
                continue
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', name)
            product_loader.add_value('url', url)
            product_loader.add_value('price', price[0])
            yield product_loader.load_item()
    else:
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath('name', './td[position()=2]/a/text()')
            product_loader.add_xpath('url', './td[position()=2]/a/@href')
            product_loader.add_xpath('price', './td[position()=3]',
                                     re='\xa3(.*[0-9])')
            yield product_loader.load_item()
def parse_product(self, response):
    """Parse the product listing table, skipping zero-priced rows.

    Uses XPath when lxml parsed the table (20+ rows), otherwise falls
    back to BeautifulSoup for the broken markup.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//table[@class="productListing"]/tr')[1:]
    if len(products) < 20:
        # lxml failed to parse the listing; fall back to BeautifulSoup.
        soup = BeautifulSoup(response.body)
        products = soup.find('table',
                             {'class': 'productListing'}).findAll('tr')[1:]
        for product in products:
            cells = product.findAll('td')
            name = cells[1].find('a').contents
            url = cells[1].find('a')['href']
            price = re.findall(r'[0-9.]+', cells[2].text)
            # Fixed: ``price[0] > 0`` compared str to int — always True
            # on Python 2 (TypeError on Python 3), so the zero-price
            # filter never worked; also guard against rows with no
            # numeric price at all.
            if price and float(price[0]) > 0:
                product_loader = ProductLoader(item=Product(),
                                               response=response)
                product_loader.add_value('name', name)
                product_loader.add_value('url', url)
                product_loader.add_value('price', price[0])
                yield product_loader.load_item()
    else:
        for product in products:
            price = product.select('./td[position()=3]').re('\xa3(.*[0-9])')
            # Same str-vs-int comparison fix as above.
            if price and float(price[0]) > 0:
                product_loader = ProductLoader(item=Product(),
                                               selector=product)
                product_loader.add_xpath('name', './td[position()=2]/a/text()')
                product_loader.add_xpath('url', './td[position()=2]/a/@href')
                product_loader.add_xpath('price', './td[position()=3]',
                                         re='\xa3(.*[0-9])')
                yield product_loader.load_item()
def parse(self, response):
    """Follow the "Next" pagination link (XPath first, BeautifulSoup
    fallback) and delegate product extraction to parse_product."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    # Pagination: prefer the XPath match, fall back to a soup search.
    found = hxs.select(
        u'//div[@class="pager"]/a[contains(text(),"Next")]/@href').extract()
    if found:
        target = urljoin_rfc(get_base_url(response), found[0])
        if '127.0.0.1' in target:
            target = target.replace('127.0.0.1', 'argonautliquor.com')
        yield Request(target, dont_filter=True)
    else:
        anchor = soup.find(lambda tag: tag.name == 'a' and 'Next' in tag.text
                           and tag.findParent('div', 'pager'))
        if anchor:
            target = urljoin_rfc(get_base_url(response), anchor['href'])
            if '127.0.0.1' in target:
                target = target.replace('127.0.0.1', 'argonautliquor.com')
            yield Request(target, dont_filter=True)
    # Products on the current page.
    for product in self.parse_product(response):
        yield product
def parse_items(self, response):
    """Follow product links on a listing page.

    When the server returns a page whose number differs from
    ``meta['cur']``, retries the same URL up to 5 times before giving
    up and parsing whatever was returned.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    cur_page = hxs.select('//span[@class="currentPage"]/text()').extract()
    if cur_page and (int(cur_page[0]) != response.meta['cur']) and (
            response.meta['attempt'] < 5):
        log.msg('WRONG PAGE! ONE MORE ATTEMPT to ' + response.url)
        yield Request(response.url + '&at=' + str(response.meta['attempt']),
                      meta={
                          'cur': response.meta['cur'],
                          'attempt': response.meta['attempt'] + 1
                      },
                      dont_filter=True,
                      callback=self.parse_items)
        return
    soup = BeautifulSoup(response.body)
    # Product links are bold anchors inside two-column table cells.
    products = [
        a['href'] for a in soup.findAll(
            lambda tag: tag.name == 'a' and tag.findChild('b') and
            tag.findParent('td', {'colspan': 2}))
    ]
    for url in products:
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url, callback=self.parse_product)
    # Fixed: removed a dangling, unterminated triple-quoted fragment of
    # commented-out XPath code ('"""trs = hxs.select(...') that left an
    # unclosed string literal at the end of the function.
def parse_product(self, response):
    """Build a Product item from a product detail page, with soup
    fallbacks for fields the XPath selectors miss."""
    if not isinstance(response, HtmlResponse):
        return
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=selector)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        # The image container sometimes defeats lxml; retry with soup.
        soup = BeautifulSoup(response.body)
        img = soup.find(lambda tag: tag.name == u'img' and
                        tag.findParent(u'tr', id=u'prodImageContainer'))
        if img:
            loader.add_value('image_url', img.get(u'src'))
    loader.add_xpath('brand', u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')
    # Price: try the bold sale price first, then the plainer variants.
    loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="price"]/text()')
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'].lower())
    yield loader.load_item()
def parse(self, response):
    """Paginate search results and queue every result for parse_options.

    Both the soup-based and the XPath-based extraction run; Scrapy's
    duplicate filter collapses any overlap between the two passes.
    """
    soup = BeautifulSoup(response.body)
    next_page = soup.find('a', 'pagnNext')
    if next_page:
        next_page = urljoin_rfc(get_base_url(response), next_page['href'])
        yield Request(next_page, meta=response.meta)
    hxs = HtmlXPathSelector(response)
    next_page = hxs.select('//a[@id="pagnNextLink"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                      meta=response.meta)
    for product in soup.findAll('div', id=re.compile(u'^result_.*')):
        heading = product.find('h3', 'newaps')
        url = heading.find('a') if heading else ''
        if url:
            url = urljoin_rfc(get_base_url(response), url['href'])
            yield Request(url, meta=response.meta,
                          callback=self.parse_options)
    for result in hxs.select(
            u'//div[@id="atfResults" or @id="btfResults"]//div[starts-with(@id, "result_")]'
    ):
        urls = result.select(u'.//h3/a/@href').extract()
        if not urls:
            # Fixed: replaced a bare ``except:`` that guarded
            # ``extract()[0]`` with an explicit emptiness check.
            continue
        yield Request(urls[0], meta=response.meta,
                      callback=self.parse_options)
def parse_brands(self, response):
    """Queue a request for every producer ("brand") link found in the
    page's #main cell."""
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    main_cell = soup.find('td', id='main')
    for anchor in main_cell.findAll('a'):
        href = anchor['href']
        if 'producers' not in href:
            continue
        url = urljoin_rfc(get_base_url(response), href)
        # Some links point at the crawler host; rewrite to the live site.
        if '127.0.0.1' in url:
            url = url.replace('127.0.0.1', 'argonautliquor.com')
        yield Request(url, dont_filter=True)
def parse_category(self, response):
    """Follow product links on a category page; if the page has none
    but embeds a product_id, treat it as a product page itself."""
    soup = BeautifulSoup(response.body)
    links = soup.findAll('a', {'class': 'products-list__item'})
    for link in links:
        yield Request(link['href'], callback=self.parse_product)
    has_product_id = re.search('"product_id":"([^"]*)"',
                               response.body_as_unicode())
    if has_product_id and not links:
        for item in self.parse_product(response):
            yield item
def parse(self, response):
    """Open the first search result with parse_product."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body)
    container = soup.find('div', id='atfResults')
    if container:
        products = container.findAll('div', id=re.compile(r'result_\d+$'))
        if not products:
            # Fixed: an 'atfResults' div with no matching result rows
            # used to raise IndexError on products[0].
            return
        url = urljoin_rfc(get_base_url(response),
                          products[0].find('a')['href'])
        yield Request(url, meta=response.meta, callback=self.parse_product)
def parse_review(self, response):
    """Collect reviews into product['metadata']['reviews'].

    Follows "Next" pagination until exhausted, then yields the product
    carried in ``response.meta``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    soup = BeautifulSoup(response.body)
    product = response.meta['product']
    reviews = hxs.select(
        u'//table[@id="productReviews"]//div[@style="margin-left:0.5em;"]')
    if not reviews:
        yield product
        return
    for review in reviews:
        loader = ReviewLoader(item=Review(), selector=hxs,
                              date_format=u'%d/%m/%Y')
        date = review.select(u'.//nobr/text()')[0].extract()
        res = None
        # The site renders dates in several locales; try each format.
        for fmt in (u'%B %d, %Y', u'%d %b %Y', u'%d %B %Y'):
            try:
                res = time.strptime(date, fmt)
            except ValueError:
                pass
            if res:
                break
        if res:
            # Fixed: an unparseable date used to crash in
            # time.strftime(fmt, None); the date field is now simply
            # omitted for such reviews.
            date = time.strftime(u'%d/%m/%Y', res)
            loader.add_value('date', date)
        rating = review.select(u'.//text()').re(
            u'([\d\.]+) out of 5 stars')[0]
        loader.add_value('rating', int(float(rating)))
        loader.add_value('url', response.url)
        title = review.select(u'.//b/text()')[0].extract()
        text = ''.join([
            s.strip() for s in review.select(
                u'div[@class="reviewText"]/text()').extract()
        ])
        loader.add_value('full_text', u'%s\n%s' % (title, text))
        product['metadata']['reviews'].append(loader.load_item())
    next_page = soup.find('a', text=re.compile('Next'))
    if next_page and next_page.parent.get('href'):
        yield Request(urljoin_rfc(base_url, next_page.parent['href']),
                      meta=response.meta,
                      callback=self.parse_review)
    else:
        yield product
def parse_product(self, response):
    """Build a Product item from a detail page.

    Shipping costs 9.98 for orders priced at 59 or below, 5.98 above.
    """
    loader = ProductLoader(item=Product(), response=response)
    soup = BeautifulSoup(response.body)
    try:
        price = soup.find('span', {'class': 'price ours'}).text
    except AttributeError:
        self.log('price not found {}'.format(response.url))
        return
    image_url = soup.find('img', itemprop='image')['src']
    form = soup.find('form', id='product_addtocart_form')
    # The product id is the path segment right after 'product/'.
    identifier = form['action'].split('product/')[-1].split('/')[0]
    loader.add_value('image_url', image_url)
    loader.add_value('price', price)
    loader.add_value('name', soup.find('h1', itemprop='name').text.strip())
    loader.add_value('category', response.meta.get('category', ''))
    brand = soup.find('span', itemprop='manufacturer').text.replace(
        ' ', '').split('by', 1)[1].strip()
    loader.add_value('brand', brand)
    loader.add_value('url', response.url)
    sku = soup.find('input', id='eye')
    loader.add_value('identifier', identifier)
    if sku:
        loader.add_value('sku', sku['value'])
    if loader.get_output_value('price') <= Decimal(59):
        shipping_cost = '9.98'
    else:
        shipping_cost = '5.98'
    loader.add_value('shipping_cost', shipping_cost)
    yield loader.load_item()
def parse(self, response):
    """Queue category pages found via XPath and via BeautifulSoup."""
    soup = BeautifulSoup(response.body)
    hrefs = response.xpath(
        '//div[contains(@class, "menu")]/@data-href').extract()
    hrefs += response.xpath(
        '//ul[contains(@class, "menu")]//a/@href').extract()
    for href in hrefs:
        yield Request(response.urljoin(href), callback=self.parse_category)
    # Soup pass for links lxml may have missed in broken markup.
    for anchor in soup.findAll('a', {'class': 'link'}):
        yield Request(response.urljoin(anchor['href']),
                      callback=self.parse_category)
def parse_brand(self, response):
    """Parse a brand listing page.

    Queues product pages (carrying any promotion text in meta) from
    both an XPath pass and a soup pass, then follows the numbered
    pagination links.
    """
    hxs = HtmlXPathSelector(response)
    # If nothing was found, try to reload the page.
    if hxs.select('//div[@class="detailPageTitle"][text()="Viewing 0"]'):
        req = self.retry(response)
        if req:
            yield req
        return
    soup = BeautifulSoup(response.body)
    for p in hxs.select('//ul[@class="stockthumbwrapper"]'):
        url = p.xpath(
            './/li[@class="productThumbName"]/a/@href')[0].extract()
        meta = response.meta.copy()
        promo = p.xpath(
            './/li[@class="productThumbImage"]//img[contains(@class,"cornerImgFormat2 discount")]/@alt'
        ).extract()
        meta['promotions'] = promo[0] if promo else ''
        # Fixed: the request used to pass response.meta, silently
        # discarding the meta copy that carries the promotion text.
        yield Request(urljoin(get_base_url(response), url),
                      callback=self.parse_product, meta=meta)
    for p in soup.findAll('ul', 'stockthumbwrapper'):
        url = p.find('li', 'productThumbName').find('a')['href']
        meta = response.meta.copy()
        promo = p.find('li', 'productThumbImage').find(
            'img', attrs={'class': re.compile('cornerImgFormat2 discount')})
        meta['promotions'] = promo['alt'] if promo else ''
        yield Request(urljoin(get_base_url(response), url),
                      callback=self.parse_product, meta=meta)
    pages = soup.findAll('div', id='pagenumber')
    if pages:
        for page in set(pages[0].findAll('a')):
            # Fixed: urljoin was handed the <a> Tag itself instead of
            # its href attribute.
            yield Request(response.urljoin(page['href']),
                          meta=response.meta, callback=self.parse_brand)
    for page in set(
            hxs.select('//div[@id="pagenumber"][1]/a/@href').extract()):
        yield Request(response.urljoin(page), meta=response.meta,
                      callback=self.parse_brand)
def parse_product(self, response):
    """Yield the product if its manufacturer part number matches the
    searched part/SKU and the seller is not Towequipe; otherwise
    advance to the next queued result or the next results page.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        # The image container sometimes defeats lxml; retry with soup.
        soup = BeautifulSoup(response.body)
        image_url = soup.find(lambda tag: tag.name == u'img' and
                              tag.findParent(u'tr', id=u'prodImageContainer'))
        if image_url:
            loader.add_value('image_url', image_url.get(u'src'))
    loader.add_xpath('brand', u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="price"]/text()')
    partn = hxs.select(u'//span[@class="tsLabel" and contains(text(),"Manufacturer Part Number")]/following-sibling::span/text()').extract()
    if not partn:
        partn = hxs.select(u'//tr/td[contains(text(),"Manufacturer Part Number")]/following-sibling::td/text()').extract()
    # Fixed: pages with no part number at all used to raise IndexError
    # on partn[0]; an empty string now simply fails the match below.
    partn = partn[0].strip() if partn else u''
    log.msg('PARTN: [%s == %s]' % (partn.lower(),
                                   response.meta['partn'].lower()))
    log.msg('SKU: [%s == %s]' % (partn.lower(),
                                 response.meta['sku'].lower()))
    sold_by = hxs.select(
        u'//div[contains(text(),"Sold by")]/b/text()').extract()
    sold_by = sold_by[0].strip() if sold_by else u''
    log.msg(u'Sold by: %s' % sold_by)
    if (partn.lower() == response.meta['partn'].lower() or
            partn.lower() == response.meta['sku'].lower()) and \
            sold_by != u'Towequipe':
        loader.add_value('sku', response.meta['partn'])
        loader.add_value('identifier', response.meta['partn'].lower())
        yield loader.load_item()
    else:
        meta = response.meta
        next_results = meta['next_results']
        if next_results:
            next_result = next_results[0]
            meta['next_results'] = next_results[1:]
            yield Request(next_result, callback=self.parse_product,
                          meta=response.meta)
        elif meta.get('next_page'):
            yield Request(meta['next_page'], meta=response.meta)
def parse(self, response):
    """Follow "Next" pagination and hand the page to parse_product."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)

    def fix_host(url):
        # Some links point at the crawler host; rewrite to the live site.
        if '127.0.0.1' in url:
            return url.replace('127.0.0.1', 'argonautliquor.com')
        return url

    hits = hxs.select(
        u'//div[@class="pager"]/a[contains(text(),"Next")]/@href').extract()
    if hits:
        yield Request(fix_host(urljoin_rfc(get_base_url(response), hits[0])),
                      dont_filter=True)
    else:
        # lxml missed the pager; retry with a lenient soup search.
        tag = soup.find(lambda t: t.name == 'a' and 'Next' in t.text and
                        t.findParent('div', 'pager'))
        if tag:
            yield Request(
                fix_host(urljoin_rfc(get_base_url(response), tag['href'])),
                dont_filter=True)
    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Yield the cheapest search result whose price passes valid_price,
    tagged with the searched SKU."""
    hxs = HtmlXPathSelector(response)
    cheapest = None
    for product in hxs.select(
            '//div[@id="atfResults"]//div[starts-with(@id, "result_")]'):
        loader = ProductLoader(item=Product(), selector=product)
        soup = BeautifulSoup(product.extract())
        heading = soup.find('h3', attrs={'class': 'newaps'})
        loader.add_value('name', heading.findAll('span')[0].string)
        loader.add_value('url', heading.findAll('a')[0]['href'])
        loader.add_value('price',
                         soup.find('ul', attrs={'class': 'rsltL'})
                             .findAll('span')[0].string)
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        price = loader.get_output_value('price')
        if price and (cheapest is None or
                      cheapest.get_output_value('price') > price):
            if valid_price(response.meta['price'], price):
                cheapest = loader
    if cheapest:
        yield cheapest.load_item()
def parse(self, response):
    """Follow "Next" pagination, then extract products from this page."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    hits = hxs.select('//a[contains(text(), "Next")]/@href').extract()
    if hits:
        yield Request(urljoin_rfc(get_base_url(response), hits[0]))
    else:
        # lxml missed the link; try a lenient soup-based text match.
        soup = BeautifulSoup(response.body)
        anchor = soup.find('a', text=re.compile('.*Next.*'))
        if anchor:
            yield Request(
                urljoin_rfc(get_base_url(response), anchor.parent['href']))
    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Parse a search results page: collect result URLs sorted by
    price (cheapest first) into meta['next_results'] and open the
    cheapest one with parse_product.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    products = hxs.select(u'//div[@id="atfResults"]//div[starts-with(@id,"result_")]')
    if products:
        meta = response.meta
        meta['next_results'] = []
        next_page = hxs.select(u'//a[@class="pagnNext"]/@href').extract()
        if next_page:
            next_page = urljoin_rfc(get_base_url(response), next_page[0])
            meta['next_page'] = next_page
        for product in products:
            # Result link: old-style 'title' anchor first, then the
            # newer 'newaps' heading layout.
            url = product.select(u'.//a[@class="title"]/@href')
            if not url:
                url = product.select(u'.//h3[@class="newaps"]/a/@href')
            if url:
                url = url[0].extract()
            else:
                continue
            url = urljoin_rfc(get_base_url(response), url)
            soup = BeautifulSoup(product.extract())
            # Price fallback chain: first span in the 'rsltL' list,
            # then 'price addon', then plain 'price'.
            price = soup.find('ul', attrs={'class': 'rsltL'})
            if price:
                price = price.findAll('span')[0]
            if not price:
                price = soup.find('span', 'price addon')
            if not price:
                price = soup.find('span', 'price')
            if price:
                # Drop the leading currency symbol.
                price = price.string.strip()[1:]
            if not price:
                # Sentinel so unpriced results sort last.
                price = '1000.00'
            meta['next_results'].append({'price': float(price), 'url': url})
        # Cheapest first; only the URLs are carried forward.
        meta['next_results'].sort(key=lambda elem: elem.get('price'))
        meta['next_results'] = [elem['url'] for elem in meta['next_results']]
        first_url = meta['next_results'][0]
        meta['next_results'] = meta['next_results'][1:]
        yield Request(first_url, callback=self.parse_product, meta=meta,
                      dont_filter=True)
def parse_product(self, response):
    """Extract a Product from a product page.

    Listing pages are delegated to parse_category; discontinued
    products are skipped. XPath is tried first for each field, with
    BeautifulSoup fallbacks for broken markup.
    """
    soup = BeautifulSoup(response.body)
    # A page carrying listing links is a category page in disguise.
    if soup.findAll('a', {'class': 'products-list__item'}):
        for r in self.parse_category(response):
            yield r
        return
    discontinued = response.xpath(
        "//div[contains(@class, 'discontinued')]")
    if not discontinued:
        discontinued = 'Discontinued Product' in response.body
    if discontinued:
        return
    name = response.xpath("//h1[@itemprop='name']/text()").extract()
    if not name:
        name = soup.find('h1', {'itemprop': 'name'}).text
    price = re.findall(
        '"per_box_price_formated":"<span class=\\\\"price\\\\">\\\\u[\da-f]{4}([\d\.]*)<\\\\/span>",',
        response.body_as_unicode())[0]
    stock = None
    brand = response.xpath('//span[@itemprop="manufacturer"]/text()').re(
        'by (.*)')
    if not brand:
        brand = soup.find('span', {
            'itemprop': 'manufacturer'
        }).text.split('by ')[-1].strip()
    sku = re.search('"sku":"([^"]*)","product_id"',
                    response.body_as_unicode()).group(1)
    identifier = re.search('"product_id":"([^"]*)"',
                           response.body_as_unicode()).group(1)
    image_url = response.xpath("//img[@class='prod-image']/@src").extract()
    if not image_url:
        image_url = soup.find('img', {'itemprop': 'image'})['src']
    # Breadcrumb trail, excluding the home and current-page entries.
    cats = [''.join(el.xpath('.//text()').extract()).strip()
            for el in response.xpath(
                "//ul[@class='gl3-breadcrumbs']/li")[1:-1]]
    shipping_cost = '2.98' if float(price) < 49 else '0'
    loader = ProductLoaderWithNameStrip(Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('stock', stock)
    loader.add_value('url', response.url)
    loader.add_value('brand', brand)
    loader.add_value('sku', sku)
    loader.add_value('identifier', identifier)
    loader.add_value('image_url', image_url)
    loader.add_value('category', cats)
    loader.add_value('shipping_cost', shipping_cost)
    yield loader.load_item()
def parse(self, response):
    """Scan the search results and yield only the lowest-priced one
    that passes valid_price."""
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    winner = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        soup = BeautifulSoup(result.extract())
        header = soup.find("h3", attrs={"class": "newaps"})
        loader.add_value("name", header.findAll("span")[0].string)
        loader.add_value("url", header.findAll("a")[0]["href"])
        loader.add_value("price",
                         soup.find("ul", attrs={"class": "rsltL"})
                             .findAll("span")[0].string)
        current = loader.get_output_value("price")
        if not current:
            continue
        beats_winner = (winner is None or
                        winner.get_output_value("price") > current)
        if beats_winner and valid_price(response.meta["price"], current):
            winner = loader
    if winner:
        yield winner.load_item()
def parse_items(self, response):
    """Follow product links on a listing page.

    When the server returns a page whose number differs from
    ``meta['cur']``, retries the same URL up to 5 times before giving
    up and parsing whatever was returned.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    cur_page = hxs.select('//span[@class="currentPage"]/text()').extract()
    if cur_page and (int(cur_page[0]) != response.meta["cur"]) and (
            response.meta["attempt"] < 5):
        log.msg("WRONG PAGE! ONE MORE ATTEMPT to " + response.url)
        yield Request(
            response.url + "&at=" + str(response.meta["attempt"]),
            meta={"cur": response.meta["cur"],
                  "attempt": response.meta["attempt"] + 1},
            dont_filter=True,
            callback=self.parse_items,
        )
        return
    soup = BeautifulSoup(response.body)
    # Product links are bold anchors inside two-column table cells.
    products = [
        a["href"]
        for a in soup.findAll(
            lambda tag: tag.name == "a" and tag.findChild("b") and
            tag.findParent("td", {"colspan": 2})
        )
    ]
    for url in products:
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url, callback=self.parse_product)
    # Fixed: removed a dangling, unterminated triple-quoted fragment of
    # commented-out XPath code ('"""trs = hxs.select(...') that left an
    # unclosed string literal at the end of the function.
def parse_product(self, response):
    """Build a Lego Product item from a product page.

    BeautifulSoup is used throughout because XPath does not work on
    this site's markup.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    soup = BeautifulSoup(response.body)
    try:
        name = soup.find(attrs={'itemprop': 'name'}).text
    except AttributeError:
        # Fixed: narrowed a bare ``except:`` — only a missing name node
        # (soup.find returning None) should abort the parse.
        return
    loader.add_value(
        'identifier',
        soup.find('div', {'class': 'clearfix'}).find('a')['title'])
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    # Price uses European formatting: drop thousands dots, comma -> point.
    loader.add_value(
        'price',
        extract_price(
            soup.find(attrs={'itemprop': 'price'}).text
                .replace('.', '').replace(',', '.')))
    try:
        loader.add_value('sku', re.search('(\d{4}\d*)', name).groups()[0])
    except AttributeError:
        # Fixed: narrowed a bare ``except:`` — re.search returns None
        # when the name carries no 4+ digit set number.
        self.log('Product without SKU: %s' % (response.url))
    loader.add_value('category', 'Lego')
    img = soup.find(attrs={'itemprop': 'image'}).find('img')
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img['src']))
    loader.add_value('brand', 'lego')
    loader.add_value('shipping_cost', '1.99')
    yield loader.load_item()
def parse(self, response):
    """Queue a parse_product request for every product link.

    BeautifulSoup is used because the HTML is broken and cannot be
    parsed with lxml.
    """
    soup = BeautifulSoup(response.body)
    for anchor in soup.findAll('a', {'class': 'products-list__item'}):
        yield Request(anchor['href'], callback=self.parse_product,
                      meta=response.meta)
def parse_product(self, response):
    """Parse wine listing rows into Product items.

    Rows without a size dropdown yield one item per price cell
    (brand + title + vintage, optionally suffixed with bottle size);
    rows with a ``mv_order_item`` dropdown yield one item per option,
    whose text carries both the size label and the price.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body, convertEntities=BeautifulSoup.HTML_ENTITIES)
    # products = hxs.select(u'//div[@class="itemResultsRow"]')
    try:
        products = soup.findAll('div', attrs={'class': 'itemResultsRow'})
    except AttributeError:
        products = []
    for product in products:
        # url = product.select(u'.//div[@class="itemTitle"]/a/@href').extract()[0]
        url = product.find('div', attrs={'class': 'itemTitle'}).find('a')['href']
        url = urljoin_rfc(get_base_url(response), url)
        # dropdown = product.select(u'.//select[@name="mv_order_item"]')
        dropdown = product.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            # No size dropdown: one item per price cell in the row.
            try:
                # brand = product.select(u'.//div[@class="itemTitle"]/a/span[@class="brand"]/text()').extract()[0].strip()
                brand = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'brand'}).text.strip()
            except AttributeError:
                brand = u''
            # title = product.select(u'.//div[@class="itemTitle"]/a/span[@class="title"]/text()').extract()[0].strip()
            title = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'title'}).text.strip()
            try:
                # vintage_age = product.select(u'.//div[@class="itemTitle"]/a/span[@class="vintageAge"]/text()').extract()[0].strip()
                vintage_age = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'vintageAge'}).text.strip()
            except AttributeError:
                vintage_age = u''
            # multiple_prices = product.select(u'.//td[@class="priceCell"]')
            multiple_prices = product.findAll('td', attrs={'class': 'priceCell'})
            for option in multiple_prices:
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                # Sale price is preferred; missing sale markup raises
                # AttributeError and falls back to the retail price.
                # price = option.select(u'.//p[@class="priceCellP salePriceP"]/span[@class="priceRetail"]/text()')
                try:
                    price = option.find('p', attrs={'class': 'priceCellP salePriceP'}).find('span', attrs={'class': 'priceSale'}).text.strip()
                except AttributeError:
                    price = option.find('p', attrs={'class': 'priceCellP'}).find('span', attrs={'class': 'priceRetail'}).text.strip()
                # if not price:
                #     price = option.select(u'.//p[@class="priceCellP"]/span[@class="priceSale"]/text()')
                #     price = price[0].extract().strip()
                # bottle_size = option.select(u'.//p[@class="priceCellP priceUnit"]/text()').extract()
                bottle_size = option.find('p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    # bottle_size = option.select(u'.//p[@class="priceCellP"]/span[@class="priceUnit"]/text()').extract()
                    bottle_size = option.find(lambda tag: tag.name == 'span' and tag.get('class', '') == 'priceUnit' and tag.findParent('p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()
                loader.add_value('name', name)
                loader.add_value('price', price)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            # Dropdown present: each option text is "<price> (<size>)".
            # dropdown = dropdown[0]
            # brand = product.select(u'.//div[@class="itemTitle"]/a/span[@class="brand"]/text()').extract()[0].strip()
            # title = product.select(u'.//div[@class="itemTitle"]/a/span[@class="title"]/text()').extract()[0].strip()
            brand = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'brand'}).text.strip()
            title = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'title'}).text.strip()
            # for option in dropdown.select(u'./option/text()').extract():
            for option in [option.text for option in dropdown.findAll('option')]:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                name = u'%s %s' % (brand, title)
                option = re.search(u'(.*?) \((.*)\)', option).groups()
                name += u' %s' % option[1]
                loader.add_value('name', name)
                loader.add_value('price', option[0])
                if loader.get_output_value('price'):
                    yield loader.load_item()
def parse_product(self, response):
    """Parse an ``itemResultsRow`` product listing page.

    NOTE(review): this is a token-for-token duplicate of another
    ``parse_product`` in this file (likely copied between spiders) —
    consider extracting a shared base class.  Yields one Product per
    price cell, or per ``mv_order_item`` dropdown option.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    # Decode HTML entities so extracted text is plain unicode.
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    try:
        products = soup.findAll('div', attrs={'class': 'itemResultsRow'})
    except AttributeError:
        products = []
    for product in products:
        url = product.find('div', attrs={
            'class': 'itemTitle'
        }).find('a')['href']
        url = urljoin_rfc(get_base_url(response), url)
        # Items sold in multiple units expose an order dropdown instead of
        # individual price cells.
        dropdown = product.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            try:
                brand = product.find('div', attrs={
                    'class': 'itemTitle'
                }).find('a').find('span', attrs={
                    'class': 'brand'
                }).text.strip()
            except AttributeError:
                brand = u''
            title = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'title'
            }).text.strip()
            try:
                vintage_age = product.find('div', attrs={
                    'class': 'itemTitle'
                }).find('a').find(
                    'span', attrs={
                        'class': 'vintageAge'
                    }).text.strip()
            except AttributeError:
                vintage_age = u''
            multiple_prices = product.findAll('td',
                                              attrs={'class': 'priceCell'})
            for option in multiple_prices:
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                # Sale price first; AttributeError (missing sale markup)
                # falls back to the retail price.
                try:
                    price = option.find('p', attrs={
                        'class': 'priceCellP salePriceP'
                    }).find('span', attrs={
                        'class': 'priceSale'
                    }).text.strip()
                except AttributeError:
                    price = option.find('p', attrs={
                        'class': 'priceCellP'
                    }).find('span', attrs={
                        'class': 'priceRetail'
                    }).text.strip()
                # Bottle size: dedicated <p>, else a nested priceUnit <span>.
                bottle_size = option.find(
                    'p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    bottle_size = option.find(
                        lambda tag: tag.name == 'span' and tag.get(
                            'class', '') == 'priceUnit' and tag.findParent(
                                'p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()
                loader.add_value('name', name)
                loader.add_value('price', price)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            brand = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'brand'
            }).text.strip()
            title = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'title'
            }).text.strip()
            for option in [
                    option.text for option in dropdown.findAll('option')
            ]:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                name = u'%s %s' % (brand, title)
                # Option text format: "<price> (<variant>)".
                option = re.search(u'(.*?) \((.*)\)', option).groups()
                name += u' %s' % option[1]
                loader.add_value('name', name)
                loader.add_value('price', option[0])
                if loader.get_output_value('price'):
                    yield loader.load_item()
def parse_product(self, response):
    """Parse an argonautliquor.com product page.

    On 404/302 the product URL is stale, so we re-issue the request as a
    site search on the URL slug.  Otherwise yields one Product per price
    cell or per ``mv_order_item`` dropdown option, with sku/identifier
    and Open Graph image attached.
    """
    hxs = HtmlXPathSelector(response)
    if response.status == 404 or response.status == 302:
        # Dead product link: fall back to searching by the URL slug.
        search_url = 'http://www.argonautliquor.com/results?term=' + response.url.split('products/')[-1]
        yield Request(search_url, callback=self.parse_product)
        return
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    url = response.url
    image_url = soup.find('meta', attrs={'property': 'og:image'})
    # 'http:' guards against an og:image tag whose content is an empty
    # scheme-only URL — presumably how the site serializes "no image".
    image_url = image_url.get('content') if image_url and image_url.get('content') != 'http:' else ''
    try:
        brand = soup.find('h1', attrs={'class': 'itemTitle'}).find('span', attrs={'class': 'brand'}).text.strip()
    except AttributeError:
        brand = u''
    title = soup.find('h1', attrs={'class': 'itemTitle'}).find('span', attrs={'class': 'title'}).text.strip()
    try:
        vintage_age = soup.find('h1', attrs={'class': 'itemTitle'}).find('span', attrs={'class': 'vintageAge'}).text.strip()
    except AttributeError:
        vintage_age = u''
    dropdown = soup.find('select', attrs={'name': 'mv_order_item'})
    if not dropdown:
        multiple_prices = soup.find('div', attrs={'class': 'priceArea'}).findAll('td', attrs={'class': 'priceCell'})
        for option in multiple_prices:
            name = u'%s %s %s' % (brand, title, vintage_age)
            loader = ProductLoader(item=Product(), selector=option)
            loader.add_value('url', url)
            # Sale price preferred; AttributeError falls back to retail.
            try:
                price = option.find('p', attrs={'class': 'priceCellP salePriceP'}).find('span', attrs={'class': 'priceSale'}).text.strip()
            except AttributeError:
                price = option.find('p', attrs={'class': 'priceCellP'}).find('span', attrs={'class': 'priceRetail'}).text.strip()
            # The SKU has appeared under several markup variants over time;
            # try each historical location in turn, newest first.
            try:
                sku = option.find('p', attrs={'class': 'priceCellP itemid'}).text.strip()
            except AttributeError:
                try:
                    sku = option.find('p', attrs={'class': 'priceCellP sku'}).text.strip()
                except AttributeError:
                    try:
                        sku = option.find('p', attrs={'class': 'sku'}).text.strip()
                    except AttributeError:
                        try:
                            sku = option.find('span', attrs={'class': 'sku'}).text.strip()
                        except AttributeError:
                            sku = ''
            # Drop the literal "SKU" label from the extracted text.
            sku = sku.replace('SKU', '').strip()
            bottle_size = option.find('p', attrs={'class': 'priceCellP priceUnit'})
            if not bottle_size:
                bottle_size = option.find(lambda tag: tag.name == 'span' and tag.get('class', '') == 'priceUnit' and tag.findParent('p', attrs={'class': 'priceCellP'}))
            if bottle_size:
                name += u' %s' % bottle_size.text.strip()
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('brand', brand)
            loader.add_value('sku', sku)
            loader.add_value('identifier', sku)
            loader.add_value('image_url', image_url)
            if loader.get_output_value('price'):
                yield loader.load_item()
    else:
        for option in dropdown.findAll('option'):
            name = u'%s %s %s' % (brand, title, vintage_age)
            # Option text format: "$<price> (<size>) SKU <sku>".
            option = re.search(r'(\$[\d\.]*) \(([^)]*)\) (.*)$', option.text).groups()
            price = option[0]
            name += u' %s' % option[1].strip()
            sku = option[2].replace('SKU', '').strip()
            loader = ProductLoader(item=Product(), selector=option)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('brand', brand)
            loader.add_value('sku', sku)
            loader.add_value('image_url', image_url)
            loader.add_value('identifier', sku)
            if loader.get_output_value('price'):
                yield loader.load_item()
def parse_product(self, response):
    """Parse an Amazon-style product page into a Product item.

    Extracts name (+ selected variation label), price, brand, SKU, ASIN
    identifier, image and category; deduplicates by ASIN via ``self.ids``
    and chases the customer-reviews page when one is linked.

    Fix: the two bare ``except:`` clauses were narrowed to
    ``except Exception:`` — a bare except also swallows
    ``KeyboardInterrupt``/``SystemExit``; the guarded lookups only fail
    with ``AttributeError``/``IndexError``, which ``Exception`` covers.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Label of the currently selected variation (size/colour), appended
    # to the product name so variants get distinct names.
    option_label = ' '.join(
        hxs.select('//div[@class="variationSelected"]'
                   '/*[@class="variationLabel"]/text()').extract())
    loader = ProductLoader(item=Product(), selector=hxs)
    soup = BeautifulSoup(response.body)
    # Old page template exposes the title in span#btAsinTitle; fall back
    # to the newer h1#title markup when that node is missing.
    try:
        name = ' '.join(
            [soup.find('span', id='btAsinTitle').text, option_label]).strip()
    except Exception:
        name = ' '.join([
            hxs.select('//h1[@id="title"]/text()').extract()[0].strip(),
            option_label
        ]).strip()
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    no_price_ = False
    # Try the buy-box price in several historical locations; if the soup
    # lookup blows up (missing #handleBuy), fall back to the price table.
    try:
        soup_form = soup.find(id='handleBuy')
        price = soup_form.find('b', 'priceLarge')
        if not price:
            price = soup_form.find('span', 'price')
        if not price:
            price = soup_form.find('span', 'pa_price')
        if not price:
            no_price_ = True
        else:
            loader.add_value('price', price.text)
    except Exception:
        price = hxs.select('//div[@id="price"]//td[text()="Price:"]'
                           '/following-sibling::td/span/text()').extract()
        if not price:
            no_price_ = True
        else:
            loader.add_value('price', price[0])
    if no_price_:
        self.log('ERROR: no price found! URL:{}'.format(response.url))
        return
    # Link to the reviews listing (excluding the "create review" link).
    reviews_url = hxs.select(
        u'//a[contains(text(),"customer review") and contains(@href, "product-reviews") '
        u'and not(contains(@href, "create-review"))]/@href').extract()
    loader.add_value('brand', response.meta['brand'].strip().lower())
    sku = hxs.select(
        '//span[@class="tsLabel" and contains(text(), "Part Number")]/../span[2]/text()'
    ).extract()
    if not sku:
        sku = hxs.select(
            '//b[contains(text(), "model number")]/../text()').extract()
    if sku:
        loader.add_value('sku', sku[0].strip().lower())
    else:
        self.log('ERROR: no SKU found! URL:{}'.format(response.url))
    # The ASIN from the buy form is the canonical identifier; without it
    # the item cannot be deduplicated, so bail out.
    identifier = hxs.select('//form/input[@name="ASIN"]/@value').extract()
    if not identifier:
        self.log('ERROR: no identifier found! URL:{}'.format(response.url))
        return
    else:
        loader.add_value('identifier', identifier)
    product_image = hxs.select(
        '//*[@id="main-image" or @id="prodImage"]/@src').extract()
    if not product_image:
        self.log('ERROR: no product Image found!')
    else:
        image = urljoin_rfc(get_base_url(response), product_image[0].strip())
        loader.add_value('image_url', image)
    category = hxs.select('//*[@id="nav-subnav"]/li[1]/a/text()').extract()
    if not category:
        self.log("ERROR: category not found")
    else:
        loader.add_value('category', category[0].strip())
    product = loader.load_item()
    # Emit each ASIN only once per crawl.
    if product['identifier'] not in self.ids:
        self.ids.append(product['identifier'])
        metadata = KeterMeta()
        metadata['brand'] = response.meta['brand'].strip().lower()
        metadata['reviews'] = []
        product['metadata'] = metadata
        if reviews_url:
            # Collect reviews first; parse_review yields the product later.
            yield Request(urljoin_rfc(base_url, reviews_url[0]),
                          meta={'product': product},
                          callback=self.parse_review)
        else:
            yield product
def parse_product(self, response):
    """Parse either a listing page (``itemResultsRow`` rows) or a single
    product page.

    When no result rows are found the page is treated as a single
    product and the same price/sku extraction runs against the whole
    document (``soup``) instead of each row.  AttributeError is used as
    control flow for optional elements throughout.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    try:
        products = soup.findAll('div', attrs={'class': 'itemResultsRow'})
    except AttributeError:
        products = []
    # No result rows => this response is a single product page.
    if not products:
        single_product = True
    else:
        single_product = False
    for product in products:
        url = product.find('div', attrs={
            'class': 'itemTitle'
        }).find('a')['href']
        url = urljoin_rfc(get_base_url(response), url)
        try:
            brand = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'brand'
            }).text.strip()
        except AttributeError:
            brand = u''
        title = product.find('div', attrs={
            'class': 'itemTitle'
        }).find('a').find('span', attrs={
            'class': 'title'
        }).text.strip()
        try:
            vintage_age = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'vintageAge'
            }).text.strip()
        except AttributeError:
            vintage_age = u''
        dropdown = product.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            multiple_prices = product.findAll('td',
                                              attrs={'class': 'priceCell'})
            for option in multiple_prices:
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                # Sale price preferred; fall back to retail on missing markup.
                try:
                    price = option.find('p', attrs={
                        'class': 'priceCellP salePriceP'
                    }).find('span', attrs={
                        'class': 'priceSale'
                    }).text.strip()
                except AttributeError:
                    price = option.find('p', attrs={
                        'class': 'priceCellP'
                    }).find('span', attrs={
                        'class': 'priceRetail'
                    }).text.strip()
                try:
                    sku = option.find('p', attrs={
                        'class': 'priceCellP itemid'
                    }).text.strip()
                except AttributeError:
                    sku = ''
                bottle_size = option.find(
                    'p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    bottle_size = option.find(
                        lambda tag: tag.name == 'span' and tag.get(
                            'class', '') == 'priceUnit' and tag.findParent(
                                'p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()
                loader.add_value('name', name)
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            for option in dropdown.findAll('option'):
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                name = u'%s %s' % (brand, title)
                # Option text format: "$<price> (<size>) <sku>".
                option = re.search(r'(\$[\d\.]*) \(([^)]*)\) (.*)$',
                                   option.text).groups()
                name += u' %s' % option[1]
                loader.add_value('name', name)
                loader.add_value('price', option[0])
                loader.add_value('sku', option[2])
                if loader.get_output_value('price'):
                    yield loader.load_item()
    if single_product:
        # Same extraction as above, run over the whole page; titles live in
        # a div.itemTitle without the wrapping <a>.
        url = response.url
        try:
            brand = soup.find('div', attrs={
                'class': 'itemTitle'
            }).find('span', attrs={
                'class': 'brand'
            }).text.strip()
        except AttributeError:
            brand = u''
        title = soup.find('div', attrs={
            'class': 'itemTitle'
        }).find('span', attrs={
            'class': 'title'
        }).text.strip()
        try:
            vintage_age = soup.find('div', attrs={
                'class': 'itemTitle'
            }).find('span', attrs={
                'class': 'vintageAge'
            }).text.strip()
        except AttributeError:
            vintage_age = u''
        dropdown = soup.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            multiple_prices = soup.find('div', attrs={
                'class': 'priceArea'
            }).findAll('td', attrs={'class': 'priceCell'})
            for option in multiple_prices:
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                try:
                    price = option.find('p', attrs={
                        'class': 'priceCellP salePriceP'
                    }).find('span', attrs={
                        'class': 'priceSale'
                    }).text.strip()
                except AttributeError:
                    price = option.find('p', attrs={
                        'class': 'priceCellP'
                    }).find('span', attrs={
                        'class': 'priceRetail'
                    }).text.strip()
                try:
                    sku = option.find('p', attrs={
                        'class': 'priceCellP itemid'
                    }).text.strip()
                except AttributeError:
                    sku = ''
                bottle_size = option.find(
                    'p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    bottle_size = option.find(
                        lambda tag: tag.name == 'span' and tag.get(
                            'class', '') == 'priceUnit' and tag.findParent(
                                'p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()
                loader.add_value('name', name)
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            for option in dropdown.findAll('option'):
                name = u'%s %s %s' % (brand, title, vintage_age)
                option = re.search(r'(\$[\d\.]*) \(([^)]*)\) (.*)$',
                                   option.text).groups()
                price = option[0]
                name += u' %s' % option[1].strip()
                sku = option[2]
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                loader.add_value('name', name)
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                if loader.get_output_value('price'):
                    yield loader.load_item()
def parse_product(self, response):
    """Parse a product page carrying Nosto tracking markup.

    Reads brand/id/availability from the hidden ``div.nosto_product``
    block.  Pages without a ``#sku-table`` yield one item; pages with
    one yield an item per table row, building a composite identifier
    ``main_id:sec_id[:volts][:pack_of][:option_id]`` that is made unique
    per page via ``option_ids``.  Any ``IndexError`` during parsing is
    treated as a transient bad page and retried up to 10 times.
    """
    try:
        # fall back to Beautiful Soup
        soup = BeautifulSoup(response.body)
        hxs = HtmlXPathSelector(response)
        container = soup.find('div', attrs={'class': 'nosto_product'})
        brand = container.find('span', attrs={'class': 'brand'}).text
        # Breadcrumb links, skipping the leading "home" link.
        cat_names = [el.text for el in soup.find("div", id='bct').findAll('a')][1:]
        main_id = container.find('span', attrs={'class': 'product_id'}).text
        availability = container.find('span', attrs={'class': 'availability'}).text
        image_url = soup.find('img', id='main-image').attrMap['src']
        options = soup.find('table', id='sku-table')
        if not options:
            # Single-variant product.
            name = soup.find('div', id='product-page-info').find('h1').text
            price = container.find('span', attrs={'class': 'price'}).text
            loader = ProductLoaderWithNameStrip(Product(), selector=hxs)
            loader.add_value('brand', brand)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('name', name)
            loader.add_value('identifier', main_id)
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            loader.add_value('sku', main_id)
            loader.add_value('image_url', image_url)
            if availability.lower() == 'outofstock':
                loader.add_value('stock', 0)
            yield loader.load_item()
        else:
            # One item per SKU-table row; track composite ids already
            # emitted so colliding variants get the form option id added.
            option_ids = []
            for opt in options.findAll('tr'):
                sec_id = opt.findAll('td')[1].find('small').text
                name = opt.findAll('td')[1].text.replace(sec_id, '')
                sec_id = sec_id.strip('(').strip(')')
                identifier = main_id + ':' + sec_id
                volts = get_volts_from_name(name)
                if volts is not None:
                    identifier = identifier + ':' + volts
                pack_of = get_pack_of_from_name(name)
                if pack_of is not None:
                    identifier = identifier + ':' + pack_of
                if identifier in option_ids:
                    option_id = opt.find('input', attrs={'name': 'ID'}).get('value')
                    identifier = identifier + ':' + option_id
                option_ids.append(identifier)
                # Strip the pound sign in both its unicode and byte forms.
                price = opt.find('td', attrs={'class': 'price'}).text.strip(u'\xa3').strip('£')
                loader = ProductLoaderWithNameStrip(Product(), response=response)
                loader.add_value('brand', brand)
                for cat_name in cat_names:
                    loader.add_value('category', cat_name)
                loader.add_value('name', name)
                loader.add_value('identifier', identifier)
                loader.add_value('price', price)
                loader.add_value('url', response.url)
                loader.add_value('sku', main_id)
                loader.add_value('image_url', image_url)
                if availability.lower() == 'outofstock':
                    loader.add_value('stock', 0)
                yield loader.load_item()
    except IndexError as e:
        # try loading page again
        tries = response.meta.get('try', 0)
        if tries < 10:
            yield Request(response.url,
                          callback=self.parse_product,
                          dont_filter=True,
                          meta={'try': tries + 1})
        else:
            # Retries exhausted: record the failure and propagate.
            self.errors.append("Error scraping page %s: %s" % (response.url, str(e)))
            raise
def parse_product(self, response):
    """Parse a product page that lists variants either in a
    ``#responsive-table`` (one row per variant) or as ``div.option``
    dropdowns (items built from the cartesian product of options).

    Yields Product items with a price-band shipping cost attached.
    Pages without a ``div.product`` node are retried via ``_retry_page``
    or logged as parse failures.

    Fix: the price-band shipping-cost ladder was duplicated verbatim in
    both branches; it is now a single nested helper, so the bands cannot
    drift apart.
    """

    def _shipping_cost_for(real_price):
        # Price-band shipping table; None means "no flat rate applies".
        if real_price < 15:
            return 3
        elif real_price < 40:
            return 4
        elif real_price < 130:
            return 7
        return None

    soup = BeautifulSoup(response.body)
    if not soup.find('div', attrs={'class': 'product'}):
        retry_request = _retry_page(response)
        if retry_request:
            yield retry_request
        else:
            self.log(
                "Error parsing page, couldn't extract product name: %s" %
                response.url)
        return
    main_name = soup.find('div', attrs={'class': 'product'}).h1.text
    main_name = remove_entities(main_name)
    # Brand sits in the <td> following the one whose text mentions "brand".
    brand_el = soup.find(
        lambda tag: tag.name == 'td' and 'brand' in tag.text.lower())
    brand = brand_el.findNextSibling('td').text.strip() if brand_el else ''
    # Breadcrumb categories, skipping the first two (home/root) entries.
    cat_names = [
        span.a.text for span in soup.find('div', attrs={
            'class': 'breadcrumbtrail'
        }).span.findAll('span') if span.a
    ][2:]
    image_url = soup.find('img', {'itemprop': 'image'})
    image_url = image_url['src'] if image_url else None
    table = soup.find('table', id='responsive-table')
    options = soup.findAll('div', attrs={'class': 'option'})
    if table:
        for row in table.findAll('tr'):
            # Skip head row
            if not row.td:
                continue
            name = row.find('span', attrs={'class': 'name'}).text
            name = remove_entities(name)
            if not _main_name_in_opt_name(main_name, name):
                name = main_name + ' ' + name
            identifier = row.find('span', attrs={'class': 'codenumber'})
            if not identifier:
                self.errors.append(
                    "Identifier not found for products on page: %s" %
                    response.url)
                continue
            identifier = identifier.text
            price = row.find(_is_price_tag).text
            shipping_cost = _shipping_cost_for(extract_price(price))
            loader = ProductLoaderWithNameStrip(Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            loader.add_value('price', price)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('image_url', image_url)
            yield loader.load_item()
    elif options:
        # Product id is embedded in the URL as "...p-<id>.<ext>".
        main_id = response.url.split('.')[-2].split('p-')[-1]
        price = soup.find('span', attrs={'class': 'inctax'}).span.text
        shipping_cost = _shipping_cost_for(extract_price(price))
        # Collect the selectable values of each option dropdown.
        results = {}
        for opt in options:
            opt_name = opt.label.span.text
            results[opt_name] = []
            for subopt in opt.select.findAll('option'):
                subopt_name = subopt.text
                subopt_value = _soup_el_get_attr(subopt, 'value')
                # value "0" is the placeholder ("please select") entry.
                if subopt_value == '0':
                    continue
                results[opt_name].append({
                    'id': remove_entities(subopt_name).replace('"', ''),
                    'name': opt_name + ': ' + subopt_name
                })
        # One item per combination of option values.
        for opt_tuple in product(*results.values()):
            name = _build_opt_name(main_name, opt_tuple)
            identifier = _build_opt_id(main_id, opt_tuple)
            loader = ProductLoaderWithNameStrip(Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            loader.add_value('price', price)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('image_url', image_url)
            yield loader.load_item()
def parse_product(self, response):
    """Parse a product page, preferring XPath extraction and falling back
    to BeautifulSoup when the name cannot be extracted (markup the XPath
    selector chokes on); retries the page if even the fallback finds no
    name.  Items without an identifier are dropped silently.
    """
    hxs = HtmlXPathSelector(text=response.body_as_unicode())
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', response.url)
    identifier = hxs.select('//input[@id="catentryId"]/@value').extract()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    price = ''.join(
        hxs.select('//div[@itemprop="price"]//span[@class="price"]//text()'
                   ).extract()).strip()
    loader.add_value('price', price)
    # Breadcrumbs minus the home entry, then skip one more leading crumb.
    categories = hxs.select(
        '//ul[@class="breadcrumbs"]//li[not(@class="home")]/a/span/text()'
    ).extract()[1:]
    loader.add_value('category', categories)
    image_url = hxs.select('//img[@id="productMainImage"]/@src').extract()
    if image_url:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image_url[0]))
    brand = hxs.select(
        '//li[contains(text(), "BRAND")]/span/text()').extract()
    loader.add_value('brand', brand)
    item = loader.load_item()
    if not item.get('name'):
        # XPath path failed to find a name: redo extraction with soup.
        log.msg('Using BeautifulSoup: ' + response.url)
        loader = ProductLoader(response=response, item=Product())
        soup = BeautifulSoup(response.body)
        loader.add_value('url', response.url)
        identifier = soup.find('input', attrs={'id': 'catentryId'})
        identifier = _soup_el_get_attr(identifier, 'value')
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        name = soup.find('h1', attrs={'itemprop': 'name'}).text
        loader.add_value('name', name)
        categories = [
            li.a.span.text for li in soup.find('ul', attrs={
                'class': 'breadcrumbs'
            }).findAll('li') if li.a
        ][2:]
        loader.add_value('category', categories)
        price = soup.find('div', attrs={
            'itemprop': 'price'
        }).find('span', attrs={
            'class': 'price'
        }).text
        loader.add_value('price', price)
        image_url = soup.find('img', attrs={'id': 'productMainImage'})
        if image_url:
            image_url = _soup_el_get_attr(image_url, 'src')
            loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), image_url))
        # Brand: first <li> mentioning BRAND (case-insensitive).
        brand = ''
        for li in soup.findAll('li'):
            if 'BRAND' in li.text.upper():
                brand = li.span.text
                break
        loader.add_value('brand', brand)
        item = loader.load_item()
        if item['identifier']:
            yield item
    else:
        if item['identifier']:
            yield item
    # NOTE(review): reconstructed from a whitespace-mangled source — this
    # final retry guard appears to sit at function level so it fires when
    # the soup fallback also produced no name; confirm against history.
    if not item.get('name'):
        request = self.retry(response,
                             "No name for product: " + response.url)
        if request:
            yield request
        return