Example #1
    def parse_item(self, response):
        self._logger.info('JomashopSpider#parse_item...')
        item = JomashopItem()
        sel = Selector(response)
        self._enrich_base_data(item, response, is_update=False)
        self._enrich_same_part(item, response)
        item['shipping_availability'] = format_html_string(''.join(
            sel.xpath(
                '//*[@id="product_addtocart_form"]//li[@class="pdp-shipping-availability"]/span/text()'
            ).extract()))
        MagicToolboxContainer_string = ''.join(
            sel.xpath(
                '//div[@class="MagicToolboxContainer "]//span[@style="margin-top:8px;"]/text()'
            ).extract())
        item['image_urls'] = re.findall(r'data-href="(.*?)"',
                                        MagicToolboxContainer_string,
                                        re.MULTILINE | re.DOTALL)
        item['details'] = format_html_string(''.join(
            sel.xpath('//dd[@id="tab-container-details"]').extract()))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid'])

        return item
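All of these examples lean on shared helpers (format_html_string, re_search, safely_json_loads, and others) whose definitions are not included. A minimal sketch of the first two, inferred from their call sites (an assumption, not the project's actual code):

    import re

    def format_html_string(html_string):
        # Hypothetical sketch: call sites pass in joined XPath extracts and
        # store the result directly on the item, which suggests the helper
        # normalizes an HTML fragment into a compact single-line string.
        return re.sub(r'\s+', ' ', html_string).strip()

    def re_search(pattern, text, flags=re.S):
        # Hypothetical sketch: call sites such as re_search(r"popUp\('(.*?)'", s)
        # expect the first capture group back, and an empty string rather
        # than None when nothing matches.
        match = re.search(pattern, text, flags)
        return match.group(1) if match else ''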
Example #2
    def parse_item(self, response):
        print("FinishlineSpider#parse_item ...")
        self._logger.debug("FinishlineSpider#parse_item ...")
        item = DrugstoreItem()
        self._enrich_base_data(item, response, is_update=False)
        self._enrich_same_part(item, response)
        sel = Selector(response)
        item['title'] = ' '.join(
            sel.xpath('//*[@id="divCaption"]/h1//text()').extract()).strip()
        item['product_details'] = format_html_string(''.join(
            sel.xpath('//*[@id="divPromosPDetail"]').extract()).strip())
        ingredients = ''.join(
            sel.xpath('//*[@id="TblProdForkFactsCntr"]').extract()).strip()
        if len(ingredients) == 0:
            ingredients = ''.join(
                sel.xpath(
                    '//*[@id="TblProdForkIngredients"]').extract()).strip()

        item['ingredients'] = format_html_string(ingredients)
        s = ''.join(
            sel.xpath('//*[@id="largeProdImageLink"]/a/@href').extract())
        relative_image_url = re_search(r"popUp\(\'(.*?)\'", s)
        full_image_url = urljoin(response.url, relative_image_url)

        image_urls = []
        # Walk the image pager: each page shows one product image plus a
        # "next" arrow; a greyed-out arrow (alt="no image") marks the end.
        # NOTE: these blocking urllib2 calls stall Scrapy's event loop.
        while True:
            request = urllib2.Request(full_image_url)
            response_image = urllib2.urlopen(request)
            image_html_str = response_image.read()
            node = lxml.html.fromstring(image_html_str)
            image_url = ''.join(node.xpath('//*[@id="productImage"]/img/@src'))
            image_urls.append(image_url)
            no_next_image = node.xpath(
                '//img[contains(@src,"right_arrow_grey.gif") and @alt="no image"]'
            )
            if no_next_image:
                break
            else:
                full_image_url = urljoin(
                    full_image_url, ''.join(
                        node.xpath('//img[@alt="see next image"]/../@href')))
                if not full_image_url:
                    break

        item['image_urls'] = image_urls
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid'])

        return item
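The while loop in this example fetches the image pager with blocking urllib2 calls from inside the spider, which stalls Scrapy's reactor while each page downloads. A non-blocking alternative (a sketch, not this project's actual code) would stash the half-built item in meta, yield a Request for the first image page, and walk the pager through a callback:

    def parse_image_page(self, response):
        # Sketch under stated assumptions: parse_item has put the half-built
        # item into meta['item_half'] and yielded a Request for the first
        # image page instead of looping over urllib2.
        urls = response.meta.setdefault('image_urls', [])
        urls.append(''.join(response.xpath('//*[@id="productImage"]/img/@src').extract()))

        no_next = response.xpath('//img[contains(@src, "right_arrow_grey.gif") and @alt="no image"]')
        next_href = ''.join(response.xpath('//img[@alt="see next image"]/../@href').extract())
        if no_next or not next_href:
            item = response.meta['item_half']
            item['image_urls'] = urls
            yield item
        else:
            yield Request(urljoin(response.url, next_href),
                          meta=response.meta,
                          callback=self.parse_image_page,
                          dont_filter=True)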
Example #3
 def parse_chinese_detail(self, response):
     self._logger.info('AshfordSpider#parse_chinese_detail...')
     sel = Selector(response)
     item = response.meta['item_half']
     item['chinese_detail'] = format_html_string(''.join(sel.xpath('//div[@id="tab1_info"]').extract()).strip())
     return item
Example #4
    def parse_item(self, response):
        self._logger.info('AshfordSpider#parse_item...')
        item = AshfordItem()
        sel = Selector(response)
        self._enrich_base_data(item, response, is_update=False)
        self._enrich_same_part(item, response)
        item['prodName'] = ''.join(sel.xpath('//*[@id="prodName"]/a/text()').extract()).strip()
        item['prod_desc'] = ''.join(sel.xpath('//*[@id="fstCont"]/h3/text()').extract()).strip()
        item['detail'] = format_html_string(''.join(sel.xpath('//div[@id="tab1_info"]').extract()).strip())
        item['Brand'] = ''.join(sel.xpath('//h1[@id="prodName"]/a[@id="sameBrandProduct"]/text()[1]').extract()).strip()
        item['product_images'] = list(set(sel.xpath('//a[contains(@href,"/images/catalog/") and contains(@href,"XA.jpg")]/@href').extract()))
        item['image_urls'] = [urljoin(response.url, i) for i in item['product_images']]
        # The Chinese-language version of the page lives on the zh. subdomain.
        chinese_url = response.url.replace('www.', 'zh.')

        response.meta['item_half'] = item
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )

        yield Request(
            url=chinese_url,
            meta=response.meta,
            callback=self.parse_chinese_detail,
            dont_filter=True
            )
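Every parse method starts with _enrich_base_data, whose body none of these examples include. Purely as a hypothetical reconstruction (field names assumed), it presumably stamps the crawl bookkeeping that the stats calls also read from response.meta:

    import time

    def _enrich_base_data(self, item, response, is_update):
        # Hypothetical reconstruction: the real method is not shown in the
        # examples, and these field names are assumptions. Scrapy items only
        # accept declared fields, so the real item classes would declare them.
        item['url'] = response.url
        item['crawlid'] = response.meta['crawlid']
        item['spiderid'] = response.meta['spiderid']
        item['appid'] = response.meta['appid']
        item['timestamp'] = time.time()
        item['is_update'] = is_update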
Example #5
 def _enrich_same_part(self, item, response):
     sel = Selector(response)
     item['brand_name'] = format_html_string(''.join(
         sel.xpath(
             '//form[@id="product_addtocart_form"]//span[@class="brand-name"]/text()'
         ).extract()))
     item['product_name'] = format_html_string(''.join(
         sel.xpath(
             '//form[@id="product_addtocart_form"]//span[@class="product-name"]/text()'
         ).extract()))
     item['product_ids'] = format_html_string(''.join(
         sel.xpath(
             '//form[@id="product_addtocart_form"]//span[@class="product-ids"]/text()'
         ).extract()))
     item['final_price'] = format_html_string(''.join(
         sel.xpath(
             '//*[@id="product_addtocart_form"]//p[@class="final-price"]/meta[@itemprop="price"]/@content'
         ).extract()))
     item['retail_price'] = format_html_string(''.join(
         sel.xpath(
             '//*[@id="product_addtocart_form"]//li[@class="pdp-retail-price"]/span/text()'
         ).extract()))
     item['savings'] = format_html_string(''.join(
         sel.xpath(
             '//*[@id="product_addtocart_form"]//li[@class="pdp-savings"]/span/text()'
         ).extract()))
     item['shipping'] = format_html_string(''.join(
         sel.xpath(
             '//*[@id="product_addtocart_form"]//li[@class="pdp-shipping"]/span/text()'
         ).extract()))
Example #6
    def _enrich_same_part(self, item, response):
        sel = Selector(response)
        item['title'] = ' '.join(sel.xpath('//*[@id="prdImage"]/h1/*//text()').extract()).strip()
        if len(item['title']) < 2:
            item['title'] = ' '.join(sel.xpath('//*[@id="productStage"]/h1/*/text()').extract()).strip()

        item['productDescription'] = format_html_string(''.join(sel.xpath('//div[@id="prdInfoText"]').extract()).strip())
        if len(item['productDescription']) == 0:
            item['productDescription'] = format_html_string(''.join(sel.xpath('//div[@id="productDescription"]').extract()).strip())

        # These values are embedded as inline JavaScript ("var stockJSON = ...;");
        # capture the right-hand side with a regex and decode it as JSON.
        item['stockJSON'] = json.loads(''.join(sel.re(r'var stockJSON =(.*);')).strip().replace('&nbsp;', ''))
        item['dimensions'] = json.loads(''.join(sel.re(r'var dimensions =(.*);')).strip().replace('&nbsp;', ''))
        item['dimToUnitToValJSON'] = json.loads(''.join(sel.re(r'var dimToUnitToValJSON =(.*);')).strip().replace('&nbsp;', ''))
        item['dimensionIdToNameJson'] = json.loads(''.join(sel.re(r'var dimensionIdToNameJson =(.*);')).strip().replace('&nbsp;', ''))
        item['valueIdToNameJSON'] = json.loads(''.join(sel.re(r'var valueIdToNameJSON =(.*);')).strip().replace('&nbsp;', ''))
        item['colorNames'] = json.loads(re_search(r'var colorNames =(.*?);', response.body))
        item['colorPrices'] = json.loads(re_search(r'var colorPrices =(.*?);', response.body))
        item['styleIds'] = json.loads(re_search(r'var styleIds =(.*?);', response.body))
        item['colorIds'] = json.loads(re_search(r'var colorIds =(.*?);', response.body))
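The first five assignments repeat one pattern: pull the right-hand side of an inline "var X = ...;" JavaScript statement and decode it as JSON. A small helper (a sketch that reuses exactly what the code above does, with a guard for a missing variable) would factor that out:

    import json

    def extract_js_var(sel, name):
        # Same regex as the code above: capture everything between
        # "var <name> =" and the trailing semicolon, drop &nbsp; noise,
        # and decode; return None when the variable is absent.
        raw = ''.join(sel.re(r'var %s =(.*);' % name)).strip().replace('&nbsp;', '')
        return json.loads(raw) if raw else None

    # e.g. item['stockJSON'] = extract_js_var(sel, 'stockJSON')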
Example #7
    def parse_item(self, response):
        print("FinishlineSpider#parse_item ...")
        self._logger.debug("FinishlineSpider#parse_item ...")
        sel = Selector(response)
        item = FinishlineItem()
        self._enrich_base_data(item, response, is_update=False)
        self._enrich_same_part(item, response)

        item['title'] = ''.join(
            sel.xpath('//h1[@id="title"]/text()').extract()).strip()
        list_size = []
        sizes = sel.xpath('//div[@id="productSizes"]/div[@class="size"]')
        for size in sizes:
            list_size.append([
                ''.join(size.xpath('@id').extract()),
                ''.join(size.xpath('text()').extract())
            ])
        item['size'] = list_size
        item['productDescription'] = format_html_string(''.join(
            sel.xpath('//div[@id="productDescription"]').extract()))
        item['product_images'] = json.loads(''.join(
            sel.re(r"JSON.parse\(\'(.*?)\'")).strip())
        item['links'] = ''.join(sel.re(r"links: \'(.*?)\'")).split(';')
        item['product_color'] = ''.join(
            sel.re(r'"product_color" : \["(.*?)\"'))
        item['style_color_ids'] = ''.join(
            sel.xpath(
                '//div[@id="styleColors"]/span[@class="styleColorIds"]/text()'
            ).extract())

        colorid = ''.join(
            sel.xpath('//h1[@id="title"]/@data-colorid').extract())

        styleid = ''.join(
            sel.xpath('//h1[@id="title"]/@data-styleid').extract())

        imageset_url = 'http://www.finishline.com/store/api/scene7/imageset/?colorId=%s&styleId=%s' % (
            colorid, styleid)

        meta = response.meta
        meta['item-half'] = item
        req = Request(url=imageset_url,
                      meta=meta,
                      callback=self.parse_images,
                      dont_filter=response.request.dont_filter)

        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid'])
        yield req
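The parse_images callback that receives the imageset response is not part of this example. A hypothetical sketch, assuming the scene7 endpoint returns the image URLs somewhere in its body and that the half-built item rides along in meta['item-half']:

    import re

    def parse_images(self, response):
        # Hypothetical callback: recover the item stashed by parse_item,
        # attach whatever image URLs the endpoint returned, and emit it.
        # The real response format of the imageset API is not documented
        # in these examples, so this URL scrape is only illustrative.
        item = response.meta['item-half']
        item['image_urls'] = re.findall(r'https?://[^\s"\']+', response.body)
        return item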
Example #8
    def parse_item(self, response):
        self._logger.info("start response in parse_item -> response type:%s" %type(response).__name__)
        sel = Selector(response)
        item = AmazonItem()
        self._enrich_base_data(item, response, is_update=False)
        
        node_id_re = re.compile(r'node=(?P<node_id>\w+)')
        # breadcrumb
        node_id_hrefs = sel.xpath('//div[@id="wayfinding-breadcrumbs_feature_div"]//a/@href').extract()
        item['node_ids'] = [node_id_re.search(x).group('node_id') for x in node_id_hrefs if node_id_re.search(x)]
        # Look for Similar Items by Category
        similar_node_id_links = [x.xpath('a/@href').extract() for x in sel.xpath('//div[@id="browse_feature_div"]/div/p')]
        item['similar_node_ids'] = [[m.group('node_id') for m in map(node_id_re.search, links) if m] for links in similar_node_id_links]
        item['parent_asin'] = ''.join(sel.re(r'"parent_asin":"(.*?)"')).strip()
        if len(item['parent_asin']) == 0:
            item['parent_asin'] = ''.join(sel.xpath('//form[@id="addToCart"]/input[@id="ASIN"]/@value').extract()).strip()
        item['title'] = ''.join(sel.xpath('//span[@id="productTitle"]/text()').extract()).strip()
        item['product_specifications'] = format_html_string(''.join(sel.xpath('//div[@id="technicalSpecifications_feature_div"]//table').extract()).strip())
        item['product_description'] = format_html_string(''.join(sel.xpath('//div[@id="productDescription"]//p/text()').extract()).strip())
        brand_href = ''.join(sel.xpath('//a[@id="brand"]/@href').extract()).strip()
        brand_re = re.compile(r'^/(?P<brand>.*)/b/')
        m = brand_re.search(brand_href)
        if m:
            brand = m.group('brand')
        else:
            brand = ''.join(sel.xpath('//a[@id="brand"]/text()').extract()).strip()
        item['brand'] = brand
        item['feature'] = format_html_string(''.join(sel.xpath('//div[@id="feature-bullets"]').extract()).strip())
        item['dimensions_display'] = safely_json_loads(format_html_string(''.join(sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
        item['variations_data'] = safely_json_loads(''.join(sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
        enrich_color_images(item, sel)

        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )

        return item
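safely_json_loads and enrich_color_images come from the project's shared helpers and are not shown. A plausible sketch of the former, assuming its only job is to shield callers from malformed scraped JSON:

    import json

    def safely_json_loads(json_string, default=None):
        # Hypothetical sketch: regex captures like the ones above can come
        # back empty or truncated, so return a default instead of raising.
        try:
            return json.loads(json_string)
        except (TypeError, ValueError):
            return default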
Example #9
    def parse_item(self, response):
        self._logger.info('JacobtimeSpider#parse_item...')
        item = JacobtimeItem()
        sel = Selector(response)
        self._enrich_base_data(item, response, is_update=False)
        self._enrich_same_part(item, response)
        item['details'] = format_html_string(''.join(sel.xpath('//div[@id="tab1"]').extract()))
        item['image_urls'] = [urljoin(response.url, i) for i in sel.xpath('//a[@class="lightbox"]/@href').extract()]
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )

        return item
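The lightbox hrefs on this page are site-relative, so urljoin resolves each one against the page URL before it goes into image_urls. Illustratively (URLs made up):

    from urlparse import urljoin  # urllib.parse on Python 3

    urljoin('http://www.jacobtime.com/watches/123.html', 'images/123_large.jpg')
    # -> 'http://www.jacobtime.com/watches/images/123_large.jpg'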
Example #10
    def parse_item_update(self, response):
        self._logger.info("start response in parse_item_update -> response type:%s" % type(response).__name__)
        item = AmazonItem()
        meta = response.meta
        self._enrich_base_data(item, response, is_update=True)

        item['asin'] = re_search(r'product/(.*)/', response.url)
        sel = Selector(response)
        asin_divs = sel.xpath('//input[@id="ASIN"]/@value').extract()
        if asin_divs:
            item['parent_asin'] = asin_divs[0].strip()
        else:
            item['parent_asin'] = ''

        item['size'] = re_search(r'\"%s\":\[(.*?)\]' % item['asin'], ''.join(sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
        item['dimensions_display'] = safely_json_loads(format_html_string(''.join(sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
        item['merchants'] = sel.xpath('//div[@id="merchant-info"]/a/text()').extract()
        item['merchant_3p'] = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/b/text()').extract()).strip()
        item['price_3p'] = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/span[contains(@class, "price3P")]/text()').extract()).strip()
        shipping_cost_3p_string = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/span[contains(@class, "shipping3P")]/text()').extract()).strip()
        item['shipping_cost_3p'] = extract_shipping_cost_price_from_shipping_cost_string(shipping_cost_3p_string)
        item['from_price'] = ''.join(sel.xpath('//div[@id="mbc"]/div[@class="a-box"]/div/span/span[@class="a-color-price"]/text()').extract()).strip()
        availability_divs = [
            ''.join(sel.xpath('//div[@id="availability"]/span/text()').extract()),
            ''.join(sel.xpath('//span[@class="availRed"]/text()').extract()),
            ''.join(sel.xpath('//span[@class="availGreen"]/text()').extract())
            ]

        availability_str = ''.join(availability_divs).strip().lower()
        merchant_info_str = ''.join(sel.xpath('//div[@id="merchant-info"]/text()').extract()).strip().lower()
        # The original condition tested len(availability_divs) <= 0, which can
        # never be true (the list literal above always has three entries);
        # testing the joined text appears to match the intent of reason 001.
        if (
                (len(availability_str) == 0) or
                availability_str.startswith('only') or
                availability_str.startswith('in stock') or
                availability_str.startswith('usually')
        ):
            item['availability'] = 'true'
            item['availability_reason'] = "001: %s" % availability_str
        elif merchant_info_str.startswith('ships from and sold by'):
            item['availability'] = 'true'
            item['availability_reason'] = "002: %s" % merchant_info_str
        elif availability_str.startswith('available from'):
            item['availability'] = 'other'
            item['availability_reason'] = "003: %s" % availability_str
        elif availability_str.startswith('currently unavailable'):
            item['availability'] = 'false'
            item['availability_reason'] = "004: %s" % availability_str
        else:
            item['availability'] = 'false'
            item['availability_reason'] = '000: _'

        if item['availability'] in ['true']:
            item['list_price'] = ''.join([
                ''.join(sel.xpath('//div[@id="price"]//tr[1]/td[2]/text()').extract()).strip(),
                ''.join(sel.xpath('//span[@id="listPriceValue"]/text()').extract()).strip()
                ])

            item['price'] = ''.join([
                ''.join(sel.xpath('//span[@id="priceblock_ourprice"]/text()').extract()).strip(),
                ''.join(sel.xpath('//span[@id="priceblock_saleprice"]/text()').extract()).strip(),
                ''.join(sel.xpath('//span[@id="priceblock_dealprice"]/text()').extract()).strip(),
                ''.join(sel.xpath('//span[@id="actualPriceValue"]/b/text()').extract()).strip()
                ])

            if (len(item['list_price']) + len(item['price'])) <= 0:
                self._logger.info("response body ILLEGAL: %s, %d, %d. Dumping ..." % (item['asin'], response.status, len(response.body)))
                dump_response_body(item['asin'], response.body)

            shipping_cost_string_ourprice = ''.join(sel.xpath('//*[@id="ourprice_shippingmessage"]/span/text()').extract()).strip()
            shipping_cost_string_saleprice = ''.join(sel.xpath('//*[@id="saleprice_shippingmessage"]/span/text()').extract()).strip()
            shipping_cost_string = shipping_cost_string_ourprice or shipping_cost_string_saleprice
            item['shipping_cost'] = extract_shipping_cost_price_from_shipping_cost_string(shipping_cost_string)
            self._logger.info("Spiderid: %s Crawlid: %s yield item in parse, asin: %s" % (response.meta['spiderid'],response.meta['crawlid'],item.get("asin", "unknow")))

            self.crawler.stats.inc_crawled_pages(
                crawlid=response.meta['crawlid'],
                spiderid=response.meta['spiderid'],
                appid=response.meta['appid']
            )
            return item
        elif item['availability'] in ['other']:
            item['price'] = ''.join([
                ''.join(sel.xpath('//*[@id="unqualifiedBuyBox"]//span[@class="a-color-price"]/text()').extract()).strip()
                ])

            new_url = ''.join(sel.xpath('//div[@id="unqualifiedBuyBox"]/div/div[1]/a/@href').extract()).strip()
            new_url = urljoin(response.url, new_url)

            meta['item_half'] = item

            req = Request(
                url=new_url,
                meta=meta,
                callback=self.parse_shipping_cost,
                dont_filter=response.request.dont_filter
            )
            self._logger.info("Spiderid: %s Crawlid: %s yield request in parse, asin: %s" % (response.meta['spiderid'],response.meta['crawlid'],req.meta.get("asin", "unknow")))
            return req
        else:
            self._logger.info("yield item in parse, asin: %s" % item.get("asin", "unknow"))
            self.crawler.stats.inc_crawled_pages(
                crawlid=response.meta['crawlid'],
                spiderid=response.meta['spiderid'],
                appid=response.meta['appid']
            )
            return item
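extract_shipping_cost_price_from_shipping_cost_string is another shared helper the examples omit. A hedged sketch, assuming it pulls the first dollar amount out of text like "+ $4.99 shipping" and maps free shipping to zero:

    import re

    def extract_shipping_cost_price_from_shipping_cost_string(shipping_cost_string):
        # Hypothetical reconstruction: the real helper is not shown.
        s = shipping_cost_string.strip().lower()
        if not s:
            return ''
        if 'free' in s:
            return '0.00'
        match = re.search(r'\$\s*([\d,]+\.?\d*)', s)
        return match.group(1).replace(',', '') if match else ''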