Example no. 1
    def start_reviews(self,
                      site_response,
                      product,
                      filter_other_sources=True,
                      sort=None,
                      dir=None,
                      extra_review_parser=None):
        self.headers['Host'] = self.bv_subdomain
        self.headers['Referer'] = site_response.url

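        # Build the per-product review URL from the configured bv_subdomain,
        # bv_site_locale and the product's source_internal_id.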
        url_params = {
            'bv_subdomain': self.bv_subdomain,
            'bv_site_locale': self.bv_site_locale,
            'source_internal_id': product['source_internal_id']
        }

        review_url = self.FORMAT_URL.format(**url_params)
        if sort:
            review_url = set_query_parameter(review_url, 'sort', sort)
        if sort and dir:
            review_url = set_query_parameter(review_url, 'dir', dir)
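        # Carry the product and the parsing options through request.meta so that
        # parse_reviews can pick them up.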
        request = Request(review_url,
                          callback=self.parse_reviews,
                          headers=self.headers)
        request.meta['product'] = product
        request.meta['filter_other_sources'] = filter_other_sources
        request.meta['extra_review_parser'] = extra_review_parser
        return request
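These snippets rely on a set_query_parameter / get_query_parameter helper pair that is not shown anywhere in the listing. Below is a minimal sketch of the assumed behaviour, inferred only from the call sites (a new URL is returned with the given parameter added or replaced); it is not the project's actual implementation.

# Hypothetical sketch of the URL helpers assumed by these examples.
try:
    from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode
except ImportError:  # Python 2
    from urlparse import urlsplit, urlunsplit, parse_qs
    from urllib import urlencode


def set_query_parameter(url, name, value):
    """Return `url` with the query parameter `name` set to `value`."""
    scheme, netloc, path, query, fragment = urlsplit(url)
    params = parse_qs(query)
    params[name] = [value]
    return urlunsplit((scheme, netloc, path,
                       urlencode(params, doseq=True), fragment))


def get_query_parameter(url, name, default=None):
    """Return the first value of the query parameter `name`, or `default`."""
    params = parse_qs(urlsplit(url).query)
    return params.get(name, [default])[0]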
Example no. 2
    def parse_cat_filters(self, response):
        category_name = response.meta['category_name']
        base_url = get_base_url(response.url)
        category = CategoryItem()
        category["category_path"] = category_name
        category["category_leaf"] = category_name
        category["category_url"] = base_url
        yield category

        response_json = json.loads(response.body_as_unicode())
        specs = response_json['specs']

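        # Only the 'Manufacturer' spec is used; each of its options becomes a
        # filtered product-listing request.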
        for spec in specs:
            if spec['f_name'] != 'Manufacturer':
                continue
            db_name = spec['db_name']

            options = None
            filter_key = None

            if 'options' in spec:
                options = spec['options']
                filter_key = 'id'

            if 'tree' in spec:
                options = spec['tree']
                filter_key = 'key'

            if not options or not filter_key:
                raise Exception("Cannot find all manufacturer values in %s" %
                                json.dumps(spec))

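            # One request per manufacturer option, filtered through the
            # _fil[0][field|operator|value] query parameters.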
            for option in options:
                manufacturer = option['title']
                filter_value = option[filter_key]
                products_url = self.products_url[category_name]

                products_url = set_query_parameter(products_url,
                                                   '_fil[0][field]', db_name)
                products_url = set_query_parameter(products_url,
                                                   '_fil[0][operator]', '=')
                products_url = set_query_parameter(products_url,
                                                   '_fil[0][value]',
                                                   filter_value)

                _headers = self.get_headers(response.url)

                request = Request(products_url,
                                  self.parse_product,
                                  headers=_headers,
                                  meta={
                                      'dont_merge_cookies': True,
                                      'dont_redirect': True
                                  },
                                  cookies={})
                request.meta['category'] = category
                request.meta['manufacturer'] = manufacturer
                yield request
            return
Example no. 3
    def parse_reviews(self, response):
        reviews_xpath = "//li[@itemprop='review']"
        pros_xpath = ".//li[contains(@class, 'review-pros-and-cons__attribute--pro')]//text()"
        cons_xpath = ".//li[contains(@class, 'review-pros-and-cons__attribute--con')]//text()"

        product = response.meta['product']

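        # First page: emit the product and its ids, then look up the newest
        # review date already stored for this product.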
        if 'last_date_db' not in response.meta:
            bol_id = response.meta['bol_id']
            ean = response.meta.get('ean', None)
            yield product
            yield bol_id
            if ean:
                yield ean

            last_review_in_db = get_latest_user_review_date(
                self.mysql_manager, self.spider_conf['source_id'],
                bol_id["ID_kind"], bol_id["ID_value"])
        else:
            last_review_in_db = response.meta['last_date_db']

        review_items = get_review_items_from_microdata(self, 'USER', response,
                                                       product, reviews_xpath,
                                                       pros_xpath, cons_xpath)

        if not review_items:
            return

        for review in review_items:
            yield review

        # Incremental scraping: compare the newest review date already in the DB
        # with the date of the last review parsed from this page.
        date = review['TestDateText']
        last_date_in_page = dateparser.parse(date, ["%Y-%m-%d"])

        if last_review_in_db > last_date_in_page:
            return

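        # Paginate by advancing the offset by the configured page size.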
        offset = get_query_parameter(response.url, 'offset')
        if not offset:
            offset = self.default_offset
        offset = int(offset) + self.limit

        next_page_url = set_query_parameter(response.url, 'offset', offset)
        next_page_url = set_query_parameter(next_page_url, 'limit', self.limit)
        request = Request(next_page_url, callback=self.parse_reviews)
        request.meta['use_proxy'] = True
        request.meta['last_date_db'] = last_review_in_db
        request.meta['product'] = product
        yield request
Example no. 4
    def parse_product(self, response):
        sii_re = r'-([^-]+)\.html'
        product = ProductItem()

        product['TestUrl'] = response.url.split('#')[0]
        product['OriginalCategoryName'] = response.meta['category'][
            'category_path']
        product['ProductName'] = self.extract(response.xpath('//h1/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//a[@itemprop="image"]/@href'))
        product['ProductManufacturer'] = self.extract(
            response.xpath('//span[@itemprop="brand"]/a/span/text()'))

        match = re.search(sii_re, response.url)
        if not match:
            return
        source_internal_id = match.group(1)
        product['source_internal_id'] = source_internal_id
        yield product

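        # Locate the review listing URL and the number of review pages from the
        # pagination markup.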
        review_xpath = "//ul[@class='pagNum']/@data-action"
        total_page_xpath = "//ul[@class='pagNum']/li[@class='next']/preceding-sibling::li[1]/text()"

        review_url = self.extract_xpath(response, review_xpath)
        total_pages = self.extract_xpath(response, total_page_xpath)
        # parse_reviews compares this against an integer page counter.
        total_pages = int(total_pages) if total_pages else 1
        latest_db_date = get_latest_user_review_date_by_sii(
            self.mysql_manager, self.spider_conf["source_id"],
            source_internal_id)
        if review_url:
            review_url = set_query_parameter(review_url, 'ReviewOrdering', '2')
            review_url = get_full_url(response, review_url)
            request = Request(url=review_url, callback=self.parse_reviews)
            request.meta['product'] = product
            request.meta['current_page'] = 1
            if total_pages:
                request.meta['total_pages'] = total_pages
            request.meta['latest_db_date'] = latest_db_date
            yield request
Example no. 5
    def parse_category(self, response):
        body = json.loads(response.body_as_unicode())
        category_path = response.meta.get('category_path', [])
        children_categories = body.get('categories', [])

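        # Recurse into child categories, extending the accumulated category path.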
        if children_categories:
            for _category in children_categories:
                _path = deepcopy(category_path)
                try:
                    if 'name' in _category:
                        _path.append(_category['name'])
                    elif 'Name' in _category:
                        _path.append(_category['Name'])
                except Exception as e:
                    print _category
                    raise e
                category_url = set_query_parameter(self.category_url, 'apikey',
                                                   self.open_api_key)
                category_url = set_query_parameter(category_url, 'ids',
                                                   _category['id'])
                request = Request(category_url, self.parse_category)
                request.meta['category_path'] = _path
                yield request
Example no. 6
    def parse_reviews(self, response):
        reviews = response.xpath('//div[contains(@class,"detRating")]')
        product = response.meta['product']
        date_xpath = './/div[@class="date"]/@content'
        rating_xpath = './/div[@class="rat"]/span[1]/text()'
        title_xpath = './/div[@class="title"]//text()'
        summary_xpath = './/div[@class="comm"]//text()'
        date = None
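        # Emit one USER ReviewItem per review block on the page.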
        for review in reviews:
            date = self.extract_xpath(review, date_xpath)
            rating = self.extract_xpath(review, rating_xpath)
            title = self.extract_xpath(review, title_xpath)
            summary = self.extract_all_xpath(review, summary_xpath)
            user_review = ReviewItem.from_product(product=product,
                                                  tp='USER',
                                                  date=date,
                                                  rating=rating,
                                                  title=title,
                                                  summary=summary)
            yield user_review

        current_page = response.meta['current_page']
        total_pages = response.meta['total_pages']
        latest_db_date = response.meta['latest_db_date']

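        # Stop paginating when no reviews were parsed, the last page was reached,
        # or the DB already holds a review newer than the last one on this page.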
        if not date:
            return
        latest_date_page = dateparser.parse(date, ["%Y-%m-%d"])

        if not total_pages:
            return

        if current_page == total_pages:
            return

        if latest_db_date:
            if latest_db_date > latest_date_page:
                return

        next_page = current_page + 1
        next_page_url = set_query_parameter(response.url, 'CurrentPage',
                                            next_page)
        print next_page_url

        request = Request(url=next_page_url, callback=self.parse_reviews)
        request.meta['product'] = product
        request.meta['current_page'] = next_page
        request.meta['total_pages'] = total_pages
        request.meta['latest_db_date'] = latest_db_date
        yield request
Example no. 7
    def parse(self, response):
        if 'page' in response.meta:
            page = response.meta['page']
        else:
            page = 1

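        # Queue a parse_review request for every article box in the stream.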
        review_selectors = response.xpath("//div[@id='stream']//div[contains(@class,'article_box_wrap')]")
        review_url_xpath = "./a/@href"

        for review_selector in review_selectors:
            review_url = self.extract_all(review_selector.xpath(review_url_xpath))
            review_url = get_full_url(response, review_url)
            if review_url:
                request = Request(review_url, callback=self.parse_review)
                yield request

        if self.continue_to_next_page(response):
            next_page = page + 1
            next_page_url = set_query_parameter(response.url, 'page', next_page)
            if next_page_url:
                request = Request(next_page_url, callback=self.parse)
                request.meta['page'] = next_page
                yield request
Example no. 8
    def parse_products(self, response):
        category = response.meta['category']
        body = json.loads(response.body_as_unicode())
        if body["totalResultSize"] == 0:
            return
        products = body.get('products', [])

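        # Prefer the DESKTOP url and the large ('L') image when several variants
        # are listed for a product.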
        for raw_product in products:
            url = ''
            pic_url = ''
            for _url in raw_product['urls']:
                url = _url['value']
                if _url['key'] == "DESKTOP":
                    break

            for _image in raw_product['images']:
                pic_url = _image['url']
                if _image['key'] == "L":
                    break
            product_name = raw_product['title']
            source_internal_id = raw_product['id']
            manufacturer = raw_product.get(
                'specsTag',
                None)  # specTags == manufacturer? YES! For reasons...
            ean_value = raw_product['ean']

            product = ProductItem.from_response(
                response,
                category=category,
                product_name=product_name,
                source_internal_id=source_internal_id,
                url=url,
                manufacturer=manufacturer,
                pic_url=pic_url)

            bol_id = self.product_id(product,
                                     kind='bolcom_id',
                                     value=source_internal_id)
            ean = None
            if ean_value:
                ean = self.product_id(product, kind='EAN', value=ean_value)

            # Go to the review page
            review_url = self.review_url % source_internal_id
            request = Request(review_url, callback=self.parse_reviews)
            request.meta['use_proxy'] = True
            request.meta['product'] = product
            request.meta['bol_id'] = bol_id
            if ean:
                request.meta['ean'] = ean
            yield request

        # Go to the "next" page
        offset = get_query_parameter(response.url, 'offset')
        offset = int(offset) + self.limit
        if offset > body["totalResultSize"]:
            return

        next_page_url = set_query_parameter(response.url, 'offset', offset)
        request = Request(next_page_url, callback=self.parse_products)
        request.meta['category'] = category
        yield request
Example no. 9
    def start_requests(self):
        category_url = set_query_parameter(self.category_url, 'apikey',
                                           self.open_api_key)

        request = Request(category_url, self.parse_category)
        return [request]
Example no. 10
                                                   self.open_api_key)
                category_url = set_query_parameter(category_url, 'ids',
                                                   _category['id'])
                request = Request(category_url, self.parse_category)
                request.meta['category_path'] = _path
                yield request
        else:
            category = CategoryItem()
            category['category_leaf'] = body['originalRequest']['category'][
                'name']
            category['category_string'] = body['originalRequest']['category'][
                'id']
            category['category_path'] = ' | '.join(category_path)
            yield category

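            # Leaf category: build the paged product-listing request for it.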
            product_url = set_query_parameter(self.product_url, 'apikey',
                                              self.open_api_key)
            product_url = set_query_parameter(product_url, 'ids',
                                              category['category_string'])
            product_url = set_query_parameter(product_url, 'limit', self.limit)
            product_url = set_query_parameter(product_url, 'offset', 0)

            if not self.should_skip_category(category):
                request = Request(product_url, callback=self.parse_products)
                request.meta['category'] = category
                yield request

    def parse_products(self, response):
        category = response.meta['category']
        body = json.loads(response.body_as_unicode())
        if body["totalResultSize"] == 0:
            return
Example no. 11
    def parse_product(self, response):
        category = response.meta['category']
        items = extruct_helper.get_microdata_extruct_items(
            response.body_as_unicode())
        ean_xpath = '//a[@data-ean]/@data-ean'
        brand_alt_xpath = "//meta[@property='product:brand']/@content"
        product = list(
            extruct_helper.get_products_microdata_extruct(
                items, response, category))
        if len(product) != 1:
            request = self._retry(response.request)
            yield request
            return

        product_dict = product[0]
        product = product_dict['product']

        if not product['ProductManufacturer']:
            product['ProductManufacturer'] = self.extract_xpath(
                response, brand_alt_xpath)

        yield product
        for product_id in product_dict['product_ids']:
            yield product_id

        ean_value = self.extract_xpath(response, ean_xpath)
        if ean_value:
            ean = self.product_id(product, kind='EAN', value=int(ean_value))
            yield ean

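        # Reviews are split into a first page plus a "remaining" block whose
        # data-* attributes describe the paging (page size, total, index, param).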
        first_page_review_xpath = "//ul[contains(@class, 'js-product-reviews-first')]/@data-href"
        next_page_review_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-href"
        reviews_per_page_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-per-page"
        total_reviews_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-all"
        initial_index_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-current-index"
        paging_parameter_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-url-param"

        first_page_review_url = self.extract_xpath(response,
                                                   first_page_review_xpath)
        if first_page_review_url:
            first_page_review_url = get_full_url(response,
                                                 first_page_review_url)
            first_page_review_url = set_query_parameter(
                first_page_review_url, 'sorting', 'LATEST')

            next_page_review_url = self.extract_xpath(response,
                                                      next_page_review_xpath)

            paging_meta = {}
            if next_page_review_url:
                last_review_db = get_latest_user_review_date_by_sii(
                    self.mysql_manager, self.spider_conf['source_id'],
                    product['source_internal_id'])
                next_page_review_url = get_full_url(response,
                                                    next_page_review_url)
                next_page_review_url = set_query_parameter(
                    next_page_review_url, 'sorting', 'LATEST')

                reviews_per_page = self.extract_xpath(response,
                                                      reviews_per_page_xpath)
                total_reviews = self.extract_xpath(response,
                                                   total_reviews_xpath)
                current_index = self.extract_xpath(response,
                                                   initial_index_xpath)
                paging_parameter = self.extract_xpath(response,
                                                      paging_parameter_xpath)
                paging_meta = {
                    'next_page_review_url': next_page_review_url,
                    'reviews_per_page': int(reviews_per_page),
                    'total_reviews': int(total_reviews),
                    'current_index': int(current_index),
                    'paging_parameter': paging_parameter,
                    'last_review_db': last_review_db
                }

            meta = {'product': product}
            headers = {
                'Referer': response.url,
                'X-Requested-With': 'XMLHttpRequest'
            }
            meta.update(paging_meta)

            request = Request(first_page_review_url,
                              meta=meta,
                              headers=headers,
                              callback=self.parse_reviews)
            yield request
Example no. 12
    def parse_reviews(self, response):
        product = response.meta['product']

        summary_xpath = ".//article/text()"
        rating_xpath = ".//meta[@itemprop='rating']/@content"
        title_xpath = ".//meta[@itemprop='summary']/@content"
        date_xpath = ".//meta[@itemprop='dtreviewed']/@content"
        author_xpath = ".//meta[@itemprop='reviewer']/@content"
        pros_xpath = ".//div[contains(@class, 'review-features') and " \
                     "contains(@class, 'review-pros')]/text()"
        cons_xpath = ".//div[contains(@class, 'review-features') and " \
                     "contains(@class, 'review-cons')]/text()"

        date = None
        review_selectors = response.xpath('//li')
        for review_selector in review_selectors:
            rating = self.extract_xpath(review_selector, rating_xpath)
            title = self.extract_xpath(review_selector, title_xpath)
            date = self.extract_xpath(review_selector, date_xpath)
            author = self.extract_xpath(review_selector, author_xpath)
            summary = self.extract_all_xpath(review_selector, summary_xpath)
            pros = self.extract_all_xpath(review_selector, pros_xpath)
            cons = self.extract_all_xpath(review_selector, cons_xpath)

            pros = re.sub(r"\s+", ' ', pros)
            cons = re.sub(r"\s+", ' ', cons)

            review = ReviewItem.from_product(product=product,
                                             title=title,
                                             rating=rating,
                                             tp='USER',
                                             scale=5,
                                             date=date,
                                             summary=summary,
                                             pros=pros,
                                             cons=cons,
                                             author=author)
            yield review

        if not date:
            return
        last_date_in_page = dateparser.parse(date, ["%Y-%m-%d"])
        next_page_url = response.meta.get('next_page_review_url', None)
        if next_page_url:
            paging_parameter = response.meta['paging_parameter']
            current_index = response.meta['current_index']
            reviews_per_page = response.meta['reviews_per_page']
            total_reviews = response.meta['total_reviews']
            last_review_db = response.meta['last_review_db']

            if current_index >= total_reviews:  #We reached the end
                return

            if last_review_db > last_date_in_page:  #reached the end of new data
                return

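            # Request the next slice of reviews and advance the paging index for
            # the follow-up request.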
            next_page_url = set_query_parameter(next_page_url,
                                                paging_parameter,
                                                current_index)

            headers = {
                'Referer': response.request.headers['Referer'],
                'X-Requested-With':
                response.request.headers['X-Requested-With']
            }

            meta = {
                'next_page_review_url': next_page_url,
                'reviews_per_page': reviews_per_page,
                'total_reviews': total_reviews,
                'current_index': current_index + reviews_per_page,
                'paging_parameter': paging_parameter,
                'last_review_db': last_review_db,
                'product': product
            }

            request = Request(next_page_url,
                              meta=meta,
                              headers=headers,
                              callback=self.parse_reviews)
            yield request
Example no. 13
    def parse_product(self, response):
        #    self._check_if_blocked(response)
        category = response.meta['category']
        manufacturer = response.meta['manufacturer']
        base_url = get_base_url(response.url)

        json_response = json.loads(response.body_as_unicode())
        data = json_response["data"]
        image_url_format = "https://s3.graphiq.com/sites/default/files" \
                           "/%s/media/images/%s"
        product_url_format = "%s/l/%s/%s"
        product_name_fields = [
            "manufacturer_model", "manfacture_model", "company_product", "mm"
        ]
        name_index = None
        amazon_asin_index = None
        upc_index = None

        app_id = data["app_id"]
        page = data["page"]
        results = data['recs']

        id_index = data["head"].index("id")

        encoded_title_index = data["head"].index("_encoded_title")

        for name_field in product_name_fields:
            try:
                index = data["head"].index(name_field)
                name_index = index
                break
            except ValueError:
                continue

        if name_index is None:
            raise Exception("Could not find product name in %s" % response.url)

        image_index = data["head"].index("_i_1")
        if 'amazon_asin' in data["head"]:
            amazon_asin_index = data["head"].index("amazon_asin")
        if 'UPC' in data["head"]:
            upc_index = data["head"].index("UPC")
        elif 'upc' in data["head"]:
            upc_index = data["head"].index("upc")

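        # Each row in data["data"] is one product; columns are addressed via the
        # indexes resolved above from data["head"].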
        for product_data in data["data"]:
            image_name = ""
            if len(product_data[image_index]) > 1:
                image_name = product_data[image_index][0]
            product = ProductItem()
            product['OriginalCategoryName'] = category['category_path']
            if image_name:
                product['PicURL'] = image_url_format % (app_id, image_name)
            product['TestUrl'] = product_url_format % (
                base_url, product_data[id_index],
                product_data[encoded_title_index])

            product['ProductManufacturer'] = manufacturer
            if name_index is not None:
                product['ProductName'] = product_data[name_index]

            yield product

            if upc_index is not None:
                if product_data[upc_index]:
                    upc = ProductIdItem()
                    upc['ProductName'] = product['ProductName']
                    upc['ID_kind'] = "UPC"
                    upc['ID_value'] = product_data[upc_index]
                    yield upc

            if amazon_asin_index is not None:
                if product_data[amazon_asin_index]:
                    asin = ProductIdItem()
                    asin['ProductName'] = product['ProductName']
                    asin['ID_kind'] = "ASIN"
                    asin['ID_value'] = product_data[amazon_asin_index]
                    yield asin

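        # Paginate while the current page number is below total results / 100.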
        number_of_pages = int(int(results) / 100)
        if page < number_of_pages:
            next_page_url = set_query_parameter(response.url, 'page', page + 1)
            _headers = self.get_headers(response.url)
            request = Request(next_page_url,
                              self.parse_product,
                              headers=_headers,
                              cookies={},
                              meta={
                                  'dont_merge_cookies': True,
                                  'dont_redirect': True
                              })
            request.meta['manufacturer'] = manufacturer
            request.meta['category'] = category
            yield request