def parse_product(self, response):
    """Scrape a product page and schedule a request for its reviews page.

    Yields, in order: the ProductItem, a 'sku' id item (only when a SKU was
    found), an 'MPN' id item (only when a productID span is present) and a
    Request for the reviews page carrying the product in ``meta['product']``.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(
        response.xpath('//h1[contains(@class, "title")]/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))
    product['ProductManufacturer'] = self.extract(
        response.xpath(self.brand_xpath))
    product['source_internal_id'] = self.extract(
        response.xpath('//span[@class="details" and text()="SKU"]/following::span/text()'))
    yield product

    if product['source_internal_id']:
        yield self.product_id(product, kind='sku',
                              value=product['source_internal_id'])

    id_value = self.extract(
        response.xpath('//span[@itemprop="productID"]/text()'))
    if id_value:
        yield self.product_id(product, kind='MPN', value=id_value)

    # The last URL segment names the page; str.split always returns at least
    # one element, so indexing [-1] is safe.
    page_name = response.url.split('/')[-1]
    # Remove only a literal ".aspx" extension. str.rstrip('.aspx') would strip
    # ANY trailing '.', 'a', 's', 'p' or 'x' characters and mangle names such
    # as "laptops.aspx" (-> "lapto").
    extension = '.aspx'
    if page_name.endswith(extension):
        page_name = page_name[:-len(extension)]

    review_url = self.review_url_prefix + page_name
    request = Request(url=get_full_url(response, review_url),
                      callback=self.parse_reviews)
    request.meta['product'] = product
    yield request
def parse_review(self, response):
    """Build a professional review and its product from an article page.

    The product name is derived from the og:title by dropping review/video
    wording and site branding. Yields the review first, then the product.
    NOTE(review): the byte-string handling of the title presumably targets
    Python 2 - confirm before porting.
    """
    xpaths = {
        "TestTitle": "//meta[@property='og:title']/@content",
        "Author": "//div[@class='meta']/a/text()",
        "TestSummary": "//meta[@name='description']/@content"
    }
    review = self.init_item_by_xpaths(response, "review", xpaths)
    product = ProductItem()

    # Fall back to the OpenGraph description when the plain one is empty.
    if not review['TestSummary']:
        review['TestSummary'] = self.extract(
            response.xpath("//meta[@property='og:description']/@content"))

    page_url = response.url
    sid = str(page_url).split('/')[4].rstrip('/')
    review['source_internal_id'] = sid
    product['source_internal_id'] = sid

    # Derive the product name from the article title.
    title = (review['TestTitle']).encode('utf-8')
    if 'review' in title:
        name = title.replace(" review", "")
    elif 'Review' in title:
        name = title.replace(" Review", "")
    elif 'Video' in title:
        name = title.replace(" Video", "").split(":")[0]
    elif ':' in title:
        name = str(title).split(":")[0]
    else:
        name = title

    # Strip site branding and leftover separators, in this exact order.
    noise = (
        " - Carryology - Exploring better ways to carry",
        " Video",
        "Drive By",
        ":",
        " |",
        " Carryology",
    )
    for fragment in noise:
        name = name.replace(fragment, "")
    review['ProductName'] = name
    product['ProductName'] = name

    rating = self.extract(
        response.xpath("//div[@class='bar']/span[@class='score']/text()"))
    if rating:
        review['SourceTestRating'] = rating
        review['SourceTestScale'] = '10'

    review['TestUrl'] = page_url
    raw_date = self.extract(
        response.xpath("//div[@class='meta']/text()[2]"))
    review['TestDateText'] = date_format(str(raw_date).lstrip(", "),
                                         "%B %d, %Y")
    review['DBaseCategoryName'] = 'PRO'

    product['TestUrl'] = page_url
    product['OriginalCategoryName'] = self.extract(
        response.xpath("//div[@class='breadcrumbs']//span/text()"))
    product['PicURL'] = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))

    yield review
    yield product
def parse_items(self, response):
    """Scrape a product page into a ProductItem plus price/EAN id items.

    Yields the product first, then a 'price' ProductIdItem and an 'EAN'
    ProductIdItem, each only when the corresponding value was found.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    og_title = self.extract(
        response.xpath('//meta[@property="og:title"]/@content'))
    product['ProductName'] = og_title.replace(" | EP:", "")
    product['PicURL'] = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))
    product['ProductManufacturer'] = self.extract(
        response.xpath("//div[@class='product-details-left']/a//@title"))
    product['source_internal_id'] = str(response.url).split("/")[5]
    yield product

    def build_id(kind, value):
        # Every id item repeats the product's identifying fields.
        id_item = ProductIdItem()
        id_item['source_internal_id'] = product["source_internal_id"]
        id_item['ProductName'] = product["ProductName"]
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        return id_item

    raw_price = self.extract(response.xpath(
        "//div/div[@class='product-details-price']//div/text()"))
    if raw_price:
        # Drop the dots, then any trailing ',' / '-' characters.
        yield build_id("price", raw_price.replace(".", "").rstrip(",-"))

    ean = self.extract(
        response.xpath("//div[@class='product-flixdata']/@data-ean"))
    if ean:
        yield build_id("EAN", ean)
def parse(self, response):
    """Scrape the product with BeautifulSoup, then its reviews via Selenium.

    Yields the ProductItem, the result of ``get_rm_kidval`` and every item
    produced by ``_parse_reviews``. When the page has a "see all reviews"
    link it is clicked in the browser before the reviews are parsed.
    """
    all_review_button_xpath = "//a[contains(@class,'seeAllReviews')]"
    soup = BeautifulSoup(response.body, "lxml")

    product = ProductItem()
    # The item id is the second-to-last segment of the URL.
    product['source_internal_id'] = response.url.split('/')[-2].strip()
    product['ProductName'] = soup.find(
        'span', {'itemprop': 'name'}).text.strip()
    product['ProductManufacturer'] = soup.find(
        'span', {'itemprop': 'manufacturer'}).text.strip()

    breadcrumb = soup.find('ul', {'class': 'Breadcrumb-list'})
    crumbs = [node.text.strip()
              for node in breadcrumb.find_all('span', {'itemprop': 'title'})]
    product['OriginalCategoryName'] = ' > '.join(crumbs)

    product['PicURL'] = soup.find(
        'img', {'class': 'js-ProductVisuals-imagePreview'})['src'].strip()
    product['TestUrl'] = response.url
    yield product
    yield self.get_rm_kidval(product, response)

    with SeleniumBrowser(self, response) as browser:
        selector = browser.get(response.url)
        # Only click through when the static page shows the button.
        if response.xpath(all_review_button_xpath):
            selector = browser.click(all_review_button_xpath)
        for review in self._parse_reviews(selector, product, browser):
            yield review
def parse_product(self, response):
    """Build a product from the page's JSON-LD data and request its reviews.

    Nothing is yielded when no JSON-LD 'Product' entry is found. Otherwise
    yields the ProductItem, a 'reevoo_internal_id' id item and a Request for
    the formatted review URL with the product in ``meta['product']``.
    """
    category = response.meta['category']
    json_ld = extruct_helper.extract_json_ld(response.text, 'Product')
    if not json_ld:
        # TODO: add fallback plan?
        return

    item = ProductItem()
    item['TestUrl'] = response.url
    item['OriginalCategoryName'] = category['category_path']
    item['ProductName'] = json_ld.get('name', '')
    item['PicURL'] = json_ld.get('image', '')

    # The last path segment of the URL is used as the internal id.
    path_parts = urlparse(response.url).path.split('/')
    if path_parts:
        item["source_internal_id"] = path_parts[-1]
    yield item

    yield self.product_id(item, kind='reevoo_internal_id',
                          value=item['source_internal_id'])

    # TODO: test if the url is valid or not?
    review_request = Request(
        self.review_url_format.format(item["source_internal_id"]),
        callback=self.parse_review)
    review_request.meta['product'] = item
    yield review_request
def parse_product(self, response):
    """Scrape a product, its inline user reviews and the expert review link.

    Pages without a review section yield nothing. Otherwise yields the
    ProductItem, one ReviewItem per user review and, when an expert review
    link exists, a Request for it with the product in ``meta['product']``.
    """
    review_section = response.xpath(
        '//section[article[contains(@class,"review")]]')
    if not review_section:
        return

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = 'Cell Phones'
    product['ProductName'] = self.extract(
        response.xpath('//meta[@itemprop="name"]/@content'))
    image = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))
    product['PicURL'] = get_full_url(response, image)
    product['ProductManufacturer'] = self.extract(
        response.xpath('//meta[@itemprop="brand"]/@content'))
    yield product

    for node in review_section.xpath('./article[@itemprop="review"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        raw_date = self.extract(node.xpath('.//span[@class="time"]/text()'))
        item['TestDateText'] = date_format(raw_date, '')
        item['SourceTestRating'] = self.extract(
            node.xpath('.//meta[@itemprop="ratingValue"]/@content'))
        item['Author'] = self.extract(
            node.xpath('.//span[@itemprop="author"]/text()'))
        item['TestPros'] = self.extract_all(
            node.xpath('.//div[contains(@class,"positives")]/text()'), '; ')
        item['TestCons'] = self.extract_all(
            node.xpath('.//div[contains(@class,"negatives")]/text()'), '; ')
        yield item

    expert_href = self.extract(review_section.xpath(
        './article[contains(@class,"expert")]/div/a/@href'))
    if expert_href:
        request = Request(url=get_full_url(response, expert_href),
                          callback=self.parse_review)
        request.meta['product'] = product
        yield request
def parse_product(self, response):
    """Scrape a product page and request its Bazaarvoice reviews.

    Yields the ProductItem, then a Request for the embedded-HTML review feed
    built from the product's SKU, with the product in ``meta['product']``.
    """
    product = ProductItem()
    # Parenthesised so the line is valid on Python 3 as well; with a single
    # argument it prints exactly what the Python 2 print statement did.
    print(response.url)
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = self.extract_all(
        response.xpath('//ol[@id="breadcrumb-list"]/li/a/text()'), "->")
    product['ProductName'] = self.extract(
        response.xpath('//div[@class="type-subhead-alt-regular"]//text()'))
    product['PicURL'] = self.extract(
        response.xpath(
            '//div[@data-slide-number="0"]/div[@class="zoomable hammer-wrapper"]/img/@data-img-path'
        ))
    product['ProductManufacturer'] = self.extract(
        response.xpath('//meta[@id="schemaorg-brand-name"]/@content'))
    product['source_internal_id'] = self.extract(
        response.xpath(
            '//span[@id="sku-value" and @itemprop="productID"]/text()'))
    yield product

    review_feed_url = ("http://bestbuy.ugc.bazaarvoice.com/3545w/" +
                       product['source_internal_id'] +
                       "/reviews.djs?format=embeddedhtml")
    request = Request(url=review_feed_url, callback=self.parse_review)
    request.meta['product'] = product
    yield request
def parse_product(self, response):
    """Scrape a product page, its EAN codes and the link to its reviews.

    Yields the ProductItem, one 'EAN' ProductIdItem per comma-separated EAN
    value and, when a reviews link exists, a Request for it with the product
    in ``meta['product']``.
    """
    manufacturer_xpath = "//strong[contains(@class,'property-name') and contains(text(),'Hersteller')]/following-sibling::span/a[1]/text()"
    review_url_xpath = "//div[@id='product-head-reviews']//a[@class='headbutton']/@href"

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(response.xpath('//h1//text()'))
    product['PicURL'] = self.extract(
        response.xpath('//div[@class="data"]/div/img/@src'))
    product['ProductManufacturer'] = self.extract(
        response.xpath(manufacturer_xpath))
    yield product

    ean_text = self.extract(response.xpath(
        '//strong[contains(text(),"EAN")]/parent::div/span/text()'))
    if ean_text:
        # Several EANs may be listed in one comma-separated string.
        for raw_ean in ean_text.split(','):
            ean_item = ProductIdItem()
            ean_item['ProductName'] = product["ProductName"]
            ean_item['ID_kind'] = "EAN"
            ean_item['ID_value'] = raw_ean.strip(' ')
            yield ean_item

    review_href = self.extract(response.xpath(review_url_xpath))
    if review_href:
        request = Request(url=get_full_url(response, review_href),
                          callback=self.parse_reviews)
        request.meta['product'] = product
        yield request
def parse_review(self, response):
    """Scrape the product and the user review from a review page.

    Yields the ProductItem first, then the ReviewItem.

    Fixes over the previous version: "hearder" is spelled "header", the
    og:* meta values are read from ``@content`` (a meta element has no text
    node), the summary XPath is syntactically valid, and the populated
    review is actually yielded instead of being discarded.
    """
    product_name_xpath = "//header[@class='gutter-top']/h1[@itemprop='name']/text()"
    pic_url_xpath = "//meta[@property='og:image']/@content"
    test_title_xpath = "//meta[@property='og:title']/@content"
    # NOTE(review): the original expression was missing the "/" and "@" around
    # the 'row' div; a direct child step is assumed - confirm against the page.
    test_summary_xpath = ("//div[@class='segment-article gutter-bottom-lg']"
                          "/div[@class='row']/div/p/text()")
    # Author and date are both read from the same byline element.
    byline_xpath = ".//span[@class='review-created-by']/text()"

    product = ProductItem()
    product['ProductName'] = self.extract(response.xpath(product_name_xpath))
    product['OriginalCategoryName'] = response.meta['category']
    product['PicURL'] = self.extract(response.xpath(pic_url_xpath))
    yield product

    review = ReviewItem()
    review["TestUrl"] = response.url
    review["DBaseCategoryName"] = "USER"
    # NOTE(review): the scale is set but no SourceTestRating is extracted; the
    # page only seems to expose the rating as an image
    # (.//span[@class='review-rating']/img/@src) - TODO confirm and parse.
    review["SourceTestScale"] = "5"
    review["ProductName"] = product["ProductName"]
    review["TestTitle"] = self.extract_all(response.xpath(test_title_xpath))
    review["TestSummary"] = self.extract_all(
        response.xpath(test_summary_xpath), " ")
    review["Author"] = self.extract(response.xpath(byline_xpath))
    review["TestDateText"] = self.extract(response.xpath(byline_xpath))
    yield review
def parse_reviews(self, response):
    """Yield a review and a product for every article on a listing page.

    A fresh ReviewItem/ProductItem pair is created for each article.
    Previously a single pair was created before the loop and mutated on every
    iteration, so items already handed to the engine were overwritten by the
    data of later articles.
    """
    articles = response.xpath('//article[@class="post-content"]')
    for article in articles:
        title = self.extract(
            article.xpath('.//div//h1[@class="post-title"]//text()'))
        test_url = self.extract(
            article.xpath('.//div//h1[@class="post-title"]//a/@href'))
        author = self.extract(
            article.xpath('.//span[@itemprop="name"]/text()'))
        date_str = self.extract_all(
            article.xpath('.//meta[@itemprop="datePublished"]/@content'))
        date = date_format(date_str, '%Y-%m-%d')
        pic = self.extract(article.xpath('.//img/@src'))
        summary = self.extract_all(
            article.xpath('.//div[@itemprop="articleBody"]//text()'))
        # The internal id is the second-to-last segment of the article URL.
        sid = test_url.split('/')[-2]

        product = ProductItem()
        product['ProductName'] = title
        product['PicURL'] = pic
        product['source_internal_id'] = sid
        product['TestUrl'] = test_url

        review = ReviewItem()
        review['ProductName'] = title
        review['TestTitle'] = title
        review['TestSummary'] = summary
        review['TestUrl'] = test_url
        review['DBaseCategoryName'] = 'pro'
        review['source_internal_id'] = sid
        review['TestDateText'] = date
        review['Author'] = author

        yield review
        yield product
def parse_product(self, response):
    """Scrape a product page plus its MPN and EAN identifiers.

    Name and category come from ``response.meta['item']``. Yields the
    ProductItem, then an 'MPN' and an 'EAN' ProductIdItem for each value that
    is present on the page.

    An identifier counts as present when, after stripping whitespace, it is
    neither empty nor the placeholder '-'. The previous check
    (``value.strip() > '-'``) was a lexicographic comparison that also
    rejected real ids starting with a character sorting below '-' such as
    '#', '(' or '+'.
    """
    item = response.meta['item']

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = item['ocn']
    product['ProductName'] = item['name']
    product['PicURL'] = get_full_url(
        response.url,
        self.extract(response.xpath('//img[@itemprop="image"]/@src')))
    product["ProductManufacturer"] = self.extract(
        response.xpath('//span[@itemprop="brand"]/text()'))
    yield product

    id_xpaths = (
        ("MPN", '//div[text()="Partnumber"]/parent::div/div[contains(@class,"value")]/text()'),
        ("EAN", '//div[text()="EAN"]/parent::div/div[contains(@class,"value")]/text()'),
    )
    # Extract both values up front, as before, then emit the usable ones.
    found = [(kind, self.extract(response.xpath(xpath)))
             for kind, xpath in id_xpaths]
    for kind, value in found:
        if value.strip() in ('', '-'):
            continue
        id_item = ProductIdItem()
        id_item['ProductName'] = item['name']
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        yield id_item
def parse_product(self, response):
    """Scrape a product page and request its Bazaarvoice reviews.

    When an item number is found it is appended to the product name and an
    'MPN' ProductIdItem is yielded right after the product. A Request for the
    reviews JSON feed follows, with the product in ``meta['product']``.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['ocn']
    base_name = self.extract(
        response.xpath('//h1[@id="productNameHeader"]/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//img[@id="_imgLarge"]/@src'))
    product['source_internal_id'] = self.extract(
        response.xpath('//span[@class="jsSwatchSku"]/text()'))
    item_number = self.extract(
        response.xpath('//p[contains(text(),"Item Number")]/span/text()'))

    product["ProductName"] = (base_name + ' ' + item_number
                              if item_number else base_name)
    yield product

    if item_number:
        mpn_item = ProductIdItem()
        mpn_item['ProductName'] = product["ProductName"]
        mpn_item['source_internal_id'] = product['source_internal_id']
        mpn_item['ID_kind'] = "MPN"
        mpn_item['ID_value'] = item_number
        yield mpn_item

    feed_url = (
        'http://api.bazaarvoice.com/data/reviews.json?apiversion=%s&passkey=%s&Filter=ProductId:s%s'
        '&Sort=SubmissionTime:desc&Limit=100'
    ) % (self.bv_version, self.bv_key, product['source_internal_id'])
    request = Request(url=feed_url, callback=self.parse_reviews)
    request.meta['product'] = product
    yield request
def parse_reviews(self, response):
    """Scrape the product and all user reviews listed on a reviews page.

    Yields the ProductItem first, then one ReviewItem per 'opinion-row'
    list element.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(response.xpath('//h1/a/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))
    product['ProductManufacturer'] = self.extract(
        response.xpath('//meta[@itemprop="brand"]/@content'))
    product['source_internal_id'] = self.extract(
        response.xpath('//@data-product-id'))
    yield product

    for row in response.xpath('//li[@class="opinion-row"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        # Fields shared with the product are copied over unchanged.
        for shared in ('ProductName', 'TestUrl', 'source_internal_id'):
            item[shared] = product[shared]
        published = self.extract(
            row.xpath('.//meta[@itemprop="datePublished"]/@content'))
        item['TestDateText'] = date_format(published, "%Y %m %d")
        item['SourceTestRating'] = self.extract(
            row.xpath('.//meta[@itemprop="ratingValue"]/@content'))
        item['Author'] = self.extract(row.xpath('.//h4/text()'))
        item['TestTitle'] = self.extract(
            row.xpath('.//div[contains(@class,"grade-text")]/text()'))
        item['TestSummary'] = self.extract_all(
            row.xpath('.//div[@itemprop="description"]/text()'))
        yield item
def parse_product_review(self, response):
    """Build a professional review and its product from an article page.

    Articles whose og:title contains one of the comparison keywords are
    skipped. Author, date, id and category come from ``response.meta``.
    Yields the review first, then the product.
    """
    page_title = response.xpath(
        '//meta[@property="og:title"]/@content').get()

    # Comparison-style articles do not describe a single product.
    skip_words = ['Headphone Battle', 'Comparison', 'Comparisons']
    if any(word in page_title for word in skip_words):
        return

    # Review item
    review = self.init_item_by_xpaths(response, "review", {
        'TestTitle': '//meta[@property="og:title"]/@content',
        'TestSummary': '//meta[@property="og:description"]/@content',
    })
    review['ProductName'] = self.get_product_name_based_on_title(
        review['TestTitle'])
    review['Author'] = response.meta.get('author')
    review['TestDateText'] = response.meta.get('date')
    review['DBaseCategoryName'] = 'PRO'
    review['source_internal_id'] = response.meta.get('sid')

    # Product item
    product = ProductItem()
    product['source_internal_id'] = review['source_internal_id']
    product['OriginalCategoryName'] = response.meta.get('cat')
    product['ProductName'] = review['ProductName']
    product['PicURL'] = response.xpath(
        '//meta[@property="og:image"]/@content').get()
    product['TestUrl'] = response.url

    yield review
    yield product
def init_item_by_xpaths(self, response, item_type, fields, selector=None):
    """Create an item and fill its fields from a mapping of XPaths.

    Args:
        response: page response; used for ``TestUrl`` and, when no selector
            is given, as the source of the Selector.
        item_type: one of 'review', 'product', 'product_id', 'category'.
        fields: mapping of item field name -> XPath expression.
        selector: optional selector to evaluate the XPaths against; a falsy
            value falls back to a Selector over the whole response.

    Returns:
        The populated item. Review and product items also get ``TestUrl``.

    Raises:
        Exception: if ``item_type`` is not one of the supported types.
    """
    if not selector:
        selector = Selector(response=response)

    item_classes = {
        'review': ReviewItem,
        'product': ProductItem,
        'product_id': ProductIdItem,
        'category': CategoryItem,
    }
    if item_type not in item_classes:
        raise Exception("Invalid item type: %s" % item_type)
    item = item_classes[item_type]()

    if item_type in ('review', 'product'):
        item["TestUrl"] = response.url

    for field in fields:
        # TODO: maybe check field.
        # Pros/cons of a review are joined with " ; ". This must be a tuple
        # of names: the previous ("TestPros, TestCons") was one string, so
        # `in` did a substring match (e.g. "Test" or "Cons" also matched).
        if item_type == "review" and field in ("TestPros", "TestCons"):
            item[field] = self.extract_all(selector.xpath(fields[field]),
                                           " ; ")
        else:
            item[field] = self.extract_all(selector.xpath(fields[field]))
    return item
def parse_product(self, response):
    """Scrape a product page, its MPN and request its reviews page.

    Yields the ProductItem and an 'MPN' id item. When the reviews link
    exposes a numeric page id, also yields a Request for the reviews page
    with the product in ``meta['product']``.
    """
    product = ProductItem()
    mpn = self.extract(response.xpath('//span[@id="lblMfgPartNo"]/text()'))
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['ocn']
    product['PicURL'] = self.extract(
        response.xpath('//meta[@itemprop="image"]/@content'))
    product['ProductManufacturer'] = self.extract(
        response.xpath('//meta[@itemprop="brand"]/@content'))
    product['ProductName'] = product['ProductManufacturer'] + ' ' + mpn
    product['source_internal_id'] = self.extract(
        response.xpath('//span[@id="lblCatalog"]/text()'))
    yield product

    product_id = self.product_id(product)
    product_id['ID_kind'] = "MPN"
    product_id['ID_value'] = mpn
    product_id['source_internal_id'] = product['source_internal_id']
    yield product_id

    # The reviews page id is embedded in the link's onclick handler.
    review_id = self.extract(
        response.xpath('//a[@name="aReviews"]/@onclick'))
    id_match = re.findall(r"','([\d]+)'", review_id)
    if not id_match:
        # No reviews link / id on this page: nothing to request. Previously
        # id_match[0] raised IndexError here.
        return

    review_url = ('http://www.buydig.com/shop/productreviews.aspx'
                  '?sku=&pageid=%s&srt=DateNew&lmt=50' % id_match[0])
    request = Request(url=review_url, callback=self.parse_reviews)
    request.meta['product'] = product
    yield request
def parse_reviews(self, response):
    """Scrape the product and all user reviews from a reviews page.

    The product id is read from ``response.meta['product_id']`` and is also
    used to build the picture URL. Yields the ProductItem first, then one
    ReviewItem per review box.
    """
    category = response.meta['category']
    pid = response.meta['product_id']

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category['category_path']
    product['ProductName'] = self.extract(
        response.xpath('//span[@class="fn"]/text()'))
    product['PicURL'] = 'http://geizhals.at/p/'+pid+'.jpg'
    product['source_internal_id'] = pid
    yield product

    for box in response.xpath('//li[contains(@class,"gh_box")]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        # Strip 'a', 'm' and space characters from both ends of the text.
        posted = self.extract(
            box.xpath('.//div[@class="userbox"]/text()')).strip('am ')
        item['TestDateText'] = date_format(posted, "%d.%m.%Y %H:%M")
        item['SourceTestRating'] = self.extract(
            box.xpath('.//span[@itemprop="rating"]/text()'))
        item['Author'] = self.extract(
            box.xpath('.//span[contains(@class,"nick")]/text()'))
        item['TestTitle'] = self.extract(box.xpath('.//h3//text()'))
        item['TestSummary'] = self.extract_all(
            box.xpath('.//div[@itemprop="description"]//text()'))
        item['source_internal_id'] = product['source_internal_id']
        yield item
def parse_product(self, response):
    """Scrape a product page, its MPN and request its comments page.

    Yields the ProductItem, an 'MPN' id item when a productID is present,
    and a Request for the comments page with the product in
    ``meta['product']``.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['ocn']
    product['ProductName'] = self.extract_all(response.xpath(
        '//div[@class="hilo-navegacion"]/descendant::span[last()]/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//a[@id="imagen-principal-1"]/@href'))
    product['ProductManufacturer'] = self.extract(
        response.xpath('//span[@itemprop="brand"]/text()'))
    product['source_internal_id'] = self.extract(
        response.xpath('//li[@id="id_articulo"]/@data-id'))
    yield product

    mpn = self.extract(
        response.xpath('//span[@itemprop="productID"]/@content'))
    if mpn:
        # Drop only a literal "mpn:" prefix. str.strip('mpn:') removes any of
        # the characters 'm', 'p', 'n', ':' from BOTH ends and so corrupted
        # part numbers that start or end with those letters.
        prefix = 'mpn:'
        if mpn.startswith(prefix):
            mpn = mpn[len(prefix):]
        product_id = self.product_id(product)
        product_id['ID_kind'] = "MPN"
        product_id['ID_value'] = mpn
        product_id['source_internal_id'] = product['source_internal_id']
        yield product_id

    review_url = ('http://www.pccomponentes.com/comentarios/inc_pagina_comentarios.php?id_articulo=%s'
                  '&orden=recientes' % product['source_internal_id'])
    request = Request(url=review_url, callback=self.parse_reviews)
    request.meta['product'] = product
    yield request
def test_product(self):
    """Check that ProductItem reports its name and stores assigned fields."""
    # (field, value) pairs, assigned and verified in this order.
    expected = [
        ('source_id', 7654321),
        ('source_internal_id', "Squibobble12387"),
        ('ProductName', "Awesome fake product #1"),
        ('OriginalCategoryName', "Fake products"),
        ('PicURL',
         "http://totes.fake.website.com/fake_products/pics/fake_product_of_awesome.jpg"),
        ('ProductManufacturer', "ACME"),
        ('TestUrl',
         "http://totes.fake.website.com/fake_products/fake_product_of_awesome.html"),
    ]

    product = ProductItem()
    for field, value in expected:
        product[field] = value

    assert product._name == "product", "ProductItem _name field incorrect"
    for field, value in expected:
        assert product[field] == value, \
            "ProductItem %s incorrectly set" % field
def parse_product(self, response):
    """Scrape a product page, its MPN and the reviews embedded in it.

    The category is read from the ``sectionValue`` assignment in an inline
    script. Yields the ProductItem, an 'MPN' id item when a product number
    is present, and one ReviewItem per embedded review.
    """
    product = ProductItem()
    product['TestUrl'] = response.url

    script_text = self.extract(response.xpath(
        '//script[@type="text/javascript"][contains(text(),"sectionValue")]/text()'))
    product['OriginalCategoryName'] = re.findall(
        r'sectionValue = "([^"]+)"', script_text)[0]
    product['ProductName'] = self.extract(
        response.xpath('//h1/span[@itemprop="name"]/text()'))

    image = self.extract(
        response.xpath('//ul/li[1]/img[@itemprop="image"]/@src'))
    if image:
        image = get_full_url(response, image)
    product['PicURL'] = image
    product['ProductManufacturer'] = 'HP'
    yield product

    part_numbers = self.extract_list(
        response.xpath('//span[@class="prodNum"]/text()'))
    if part_numbers:
        mpn_item = self.product_id(product)
        mpn_item['ID_kind'] = "MPN"
        mpn_item['ID_value'] = part_numbers[0]
        yield mpn_item

    for node in response.xpath('//div[@itemprop="review"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        item['TestDateText'] = self.extract(
            node.xpath('./meta[@itemprop="datePublished"]/@content'))
        item['SourceTestRating'] = self.extract(
            node.xpath('.//span[@itemprop="ratingValue"]/text()'))
        item['Author'] = self.extract(
            node.xpath('.//span[@itemprop="author"]/text()'))
        item['TestTitle'] = self.extract(
            node.xpath('.//span[@itemprop="name"]/text()'))
        item['TestSummary'] = self.extract_all(
            node.xpath('.//span[@itemprop="description"]//text()'))
        yield item
def parse_product_review(self, response):
    """Build a professional review and its product from an article page.

    Title and summary come from OpenGraph meta tags; date, id and category
    come from ``response.meta``. Pros and cons are joined with ';' and only
    set when present. Yields the review first, then the product.

    The author is now stored on the review; it used to be extracted into a
    local variable that was never used.
    """
    # REVIEW ITEM
    review_xpaths = {
        'TestTitle': '//meta[@property="og:title"]/@content',
        'TestSummary': '//meta[@property="og:description"]/@content',
    }
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    title = review['TestTitle']
    review['ProductName'] = self.get_product_name_based_on_title(title)

    author = response.xpath(
        '//span[@class="entry-info-author"]//text()').get()
    if author:
        review['Author'] = author

    review['TestDateText'] = response.meta.get('date')
    review['DBaseCategoryName'] = 'PRO'

    pros_xpath = '//div[@class="color-green plus-wrapper col-sm-6 '\
                 'col-xs-12"]/ul/li//text()'
    pros = response.xpath(pros_xpath).getall()
    if pros:
        review['TestPros'] = ";".join(pros)

    cons_xpath = '//div[@class="color-red minus-wrapper '\
                 'col-sm-6 col-xs-12"]/ul/li//text()'
    cons = response.xpath(cons_xpath).getall()
    if cons:
        review['TestCons'] = ";".join(cons)

    review['source_internal_id'] = response.meta.get('sid')

    # PRODUCT ITEM
    product = ProductItem()
    product['source_internal_id'] = review['source_internal_id']
    product['OriginalCategoryName'] = response.meta.get('cat')
    product['ProductName'] = review['ProductName']
    pic_url_xpath = '//meta[@property="og:image"]/@content'
    product['PicURL'] = response.xpath(pic_url_xpath).get()
    product['TestUrl'] = response.url

    yield review
    yield product
def parse_product(self, response):
    """Scrape a product page, its MPN and request its Bazaarvoice reviews.

    Yields the ProductItem, an 'MPN' id item when a part number meta tag is
    present, and a Request for the batch reviews feed with the product in
    ``meta['product']``.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = self.extract(
        response.xpath('//meta[@name="ProductName"]/@content'))
    product['ProductName'] = self.extract(response.xpath(
        '//h1[@class="bar_3-heading"]/text() |'
        '//h1[@itemprop="name"]/text()'))

    # Several page layouts exist; the first matching image is used.
    picture = self.extract_list(response.xpath(
        '(//img[contains(@class,"heroImg")]/@src) |'
        '(//div[@class="productImg"]/img/@src) |'
        '//div[@id="galleria-stage"]//@src'))
    if picture:
        picture = get_full_url(response, picture[0])
    product['PicURL'] = picture
    product['ProductManufacturer'] = 'Lenovo'
    yield product

    part_number = self.extract(
        response.xpath('//meta[@name="PartNumber"]/@content'))
    if part_number:
        mpn_item = self.product_id(product)
        mpn_item['ID_kind'] = "MPN"
        mpn_item['ID_value'] = part_number
        yield mpn_item

    category_id = self.extract(
        response.xpath('//meta[@name="metacategoryidentifier"]/@content'))
    feed_url = (
        'http://api.bazaarvoice.com/data/batch.json?passkey=%s&apiversion=%s'
        '&displaycode=%s&resource.q0=reviews&filter.q0=isratingsonly:eq:false'
        '&filter.q0=productid:eq:%s_%s'
        '&filter.q0=contentlocale:eq:en_US&sort.q0=submissiontime:desc&limit.q0=100&offset.q0=0'
    ) % (self.bv_key, self.bv_version, self.bv_code, category_id, self.bv_id)
    request = Request(url=feed_url, callback=self.parse_reviews)
    request.meta['product'] = product
    yield request
def parse_product(self, response):
    """Scrape a product page, its model ids and request its reviews page.

    Yields the ProductItem, one 'MPN' ProductIdItem per entry of the
    ``data-model_ids`` attribute, and a Request for the
    ``/reviews-ratings`` sub-page with the product in ``meta['product']``.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = self.extract(
        response.xpath('//a[contains(@class,"breadcrumb")]/text()'))
    model = self.extract(
        response.xpath('//span[@itemprop="model"]/text()'))

    image = self.extract(response.xpath(
        '//meta[@name="analytics-product-image_url"]/@content'))
    if image:
        product['PicURL'] = get_full_url(response, image)

    product['ProductManufacturer'] = 'Sony'
    product['ProductName'] = product['ProductManufacturer'] + ' ' + model
    yield product

    raw_ids = self.extract(response.xpath('//@data-model_ids'))
    if raw_ids:
        # The attribute holds a bracketed, comma-separated list.
        for model_id in raw_ids.strip('[').strip(']').split(','):
            id_item = ProductIdItem()
            id_item['ProductName'] = product["ProductName"]
            id_item['ID_kind'] = "MPN"
            id_item['ID_value'] = model_id
            yield id_item

    request = Request(url=response.url + '/reviews-ratings',
                      callback=self.parse_reviews)
    request.meta['product'] = product
    yield request
def parse_product(self, response):
    """Scrape a product page and request its Bazaarvoice reviews.

    Pages without an 'addedItemInput' value yield nothing. Otherwise yields
    the ProductItem and a Request for the batch reviews feed with the
    product in ``meta['product']``.
    """
    internal_id = self.extract(
        response.xpath('//input[@class="addedItemInput"]/@value'))
    if not internal_id:
        return

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(
        response.xpath('//h1[@itemprop="name"]/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//img[@itemprop="image"]/@src'))
    product['ProductManufacturer'] = self.extract(response.xpath(
        '//span[contains(text(),"Brand")]/following-sibling::text()'))
    product['source_internal_id'] = internal_id
    yield product

    feed_url = (
        'http://api.bazaarvoice.com/data/batch.json?passkey=%s&apiversion=%s'
        '&displaycode=%s&resource.q0=reviews&filter.q0=isratingsonly:eq:false'
        '&filter.q0=productid:eq:%s'
        '&filter.q0=contentlocale:eq:en_US&sort.q0=submissiontime:desc&limit.q0=100&offset.q0=0'
    ) % (self.bv_key, self.bv_version, self.bv_code,
         product['source_internal_id'])
    request = Request(url=feed_url, callback=self.parse_reviews)
    request.meta['product'] = product
    yield request
def parse_items(self, response):
    """Scrape price/EAN id items and the product from a product page.

    Yields a 'price' ProductIdItem and an 'EAN' ProductIdItem (each only when
    the value is present), then the ProductItem.

    Previously a single ProductIdItem was shared by both identifiers, so an
    EAN overwrote the price and an item without ID_kind/ID_value could be
    emitted; each identifier now gets its own item. Name and SKU are
    extracted once and reused.
    """
    name = self.extract(
        response.xpath('//*[@id="cart_quantity"]/div/div[2]/h1/text()'))
    sku = self.extract(response.xpath('//span[@class="sku-model"]/text()'))

    def build_id(kind, value):
        # Every id item carries the product's name and SKU.
        id_item = ProductIdItem()
        id_item['ProductName'] = name
        id_item['source_internal_id'] = sku
        id_item['ID_kind'] = kind
        id_item['ID_value'] = value
        return id_item

    price = response.xpath('//*[@id="priceCol"]/div[2]/text()').extract()
    if price:
        # NOTE(review): this parses the repr() of the extracted list and
        # takes its fifth whitespace-separated token; it depends on the exact
        # page markup and presumably on Python 2 unicode reprs - confirm.
        yield build_id('price', str(price).split()[4].replace(
            "u'\\xa0", "").replace("*", ""))

    ean = self.extract(response.xpath('//span[@class="product-ean"]/text()'))
    if ean:
        yield build_id("EAN", ean)

    product = ProductItem()
    product['source_internal_id'] = sku
    product['ProductName'] = name
    picture = response.xpath(
        '//*[@id="bImageCarousel"]/div/div[1]/a/img').extract()
    if picture:
        # NOTE(review): takes the text after the first '=' in the repr() of
        # the <img> markup and removes "alt" and quote characters; assumes
        # the image URL is the first attribute - confirm.
        product['PicURL'] = str(picture).split('=')[1].replace("alt", "").replace("\'", "").replace(" \"", "").replace("\"", "")
    product['OriginalCategoryName'] = self.extract(response.xpath(
        '//*[@id="bBreadcrumb"]/ol/li/a/span/text()'))
    product['TestUrl'] = response.url
    yield product
def parse_product(self, response):
    """Scrape a product page and the user opinions embedded in it.

    Opinions containing an "Opinia z serwisu Ceneo.pl" link are excluded by
    the XPath. Yields the ProductItem first, then one ReviewItem per
    remaining opinion.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = response.meta['category']['category_path']
    product['ProductName'] = self.extract(
        response.xpath('//h1[@itemprop="itemreviewed"]/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//div[@class="productPhotoGallery"]/div/img/@src'))
    product['ProductManufacturer'] = self.extract(response.xpath(
        '//div[@class="manufacturer"]//span[not(text()="brak")]/text()'))
    yield product

    opinions = response.xpath(
        '//div[@class="opinion"][not(descendant::a[contains(text(),"Opinia z serwisu Ceneo.pl")])]')
    for opinion in opinions:
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']
        posted = self.extract(opinion.xpath('.//span[@class="date"]/text()'))
        item['TestDateText'] = date_format(posted, "%Y-%m-%d")
        item['SourceTestRating'] = self.extract(
            opinion.xpath('.//span[@class="points"]/text()'))
        item['Author'] = self.extract_all(
            opinion.xpath('.//*[@class="profileName"]//text()'))
        item['TestSummary'] = self.extract_all(
            opinion.xpath('.//div[@class="text"]//text()'))
        item['TestPros'] = self.extract_all(
            opinion.xpath('.//ul[@class="pluses"]//span/text()'), '; ')
        item['TestCons'] = self.extract_all(
            opinion.xpath('.//ul[@class="minuses"]//span/text()'), '; ')
        yield item
def parse_product(self, response):
    """Emit the product on the page, then one USER review per ``<article>``.

    :param response: product page; ``response.meta['category']`` must carry
        a ``category_path`` entry.
    """
    category_path = response.meta['category']['category_path']

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category_path
    product['ProductName'] = self.extract(response.xpath('//h1/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//div[@class="images"]/a/img/@src'))
    product['ProductManufacturer'] = self.extract(response.xpath(
        '//span[text()="Marca"]/parent::li/span[@class="value"]/text()'))
    product['source_internal_id'] = self.extract(
        response.xpath('//input[@id="prodId"]/@value'))
    yield product

    for article in response.xpath('//article[@itemscope]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['source_internal_id'] = product['source_internal_id']
        item['TestUrl'] = product['TestUrl']

        # The date node can hold extra text, so take the first run of ten
        # digit/slash characters; the date is left unset when none is found.
        raw_date = self.extract(article.xpath('.//div[@class="date"]/text()'))
        found = re.search(r'[\d/]{10}', raw_date)
        if found:
            item['TestDateText'] = date_format(found.group(0), "%d/%m/%Y")

        item['SourceTestRating'] = self.extract(
            article.xpath('.//span[@itemprop="ratingValue"]/text()'))
        item['Author'] = self.extract(article.xpath('.//h2/a/text()'))
        item['TestTitle'] = self.extract(article.xpath('.//h3/a/text()'))
        item['TestSummary'] = self.extract_all(
            article.xpath('.//p[@itemprop="reviewBody"]/text()'))
        item['TestPros'] = self.extract_all(
            article.xpath('.//div[@class="pro"]//li/text()'), '; ')
        item['TestCons'] = self.extract_all(
            article.xpath('.//div[@class="con"]//li/text()'), '; ')
        yield item
def parse_product(self, response):
    """Emit the product (and an MPN id when available), then its reviews.

    The product name is "<brand> <mpn>" when an MPN-like value is present,
    otherwise "<brand> <page name>".  Reviews are loaded through a Selenium
    browser and delegated to ``self.parse_reviews``.

    :param response: product page; ``response.meta['category']`` must carry
        a ``category_path`` entry.
    """
    category_path = response.meta['category']['category_path']
    brand = self.extract(response.xpath('//a[@class="brand"]/text()'))

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category_path
    product["ProductManufacturer"] = brand
    product['PicURL'] = self.extract(
        response.xpath('//meta[@property="og:image"]/@content'))
    product['source_internal_id'] = self.extract(
        response.xpath('//div[@id="pdpFRdivMain"]/@data-productid'))

    # The value is read from the <dd> next to the size title, skipping any
    # <dd> whose text contains "Taille".
    mpn = self.extract(response.xpath(
        '//dt[@data-cerberus="txt_pdp_sizetitle"]/parent::dl'
        '/dd[not(contains(text(),"Taille"))]/text()'))
    if not mpn:
        page_name = self.extract(response.xpath('//h2[@itemprop="name"]/text()'))
        product['ProductName'] = brand + ' ' + page_name
    else:
        product['ProductName'] = brand + ' ' + mpn
        mpn_item = self.product_id(product)
        mpn_item['ID_kind'] = "MPN"
        mpn_item['ID_value'] = mpn
        yield mpn_item
    yield product

    reviews_href = self.extract(
        response.xpath('//a[@class="read-reviews"]/@href'))
    reviews_url = get_full_url(response, reviews_href)
    with SeleniumBrowser(self, response) as browser:
        selector = browser.get(reviews_url, timeout=10)
        # parse_reviews reads its context from response.meta.
        # NOTE(review): '_been_in_decorator' presumably tells a decorator on
        # parse_reviews that the browser is already set up - confirm.
        response.meta.update({
            'browser': browser,
            'product': product,
            '_been_in_decorator': True,
        })
        # Iterate inside the ``with`` block so the browser is still open
        # while parse_reviews produces items.
        for review_item in self.parse_reviews(response, selector,
                                              incremental=True):
            yield review_item
def parse_product(self, response):
    """Emit the product on the page, then one USER review per review entry.

    :param response: product page; ``response.meta['category']`` must carry
        a ``category_path`` entry.
    """
    category_path = response.meta['category']['category_path']
    # The image src is passed through get_full_url before being stored.
    image_src = self.extract(response.xpath(
        '//div[@class="product-carousel"]//img[@itemprop="image"][1]/@src'))

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category_path
    product['ProductName'] = self.extract(response.xpath('//h1/text()'))
    product['PicURL'] = get_full_url(response, image_src)
    product['ProductManufacturer'] = self.extract(response.xpath(
        '//td[text()="Constructeur"]/following-sibling::td/text()'))
    yield product

    for entry in response.xpath('//li[@itemprop="review"]'):
        item = ReviewItem()
        item['DBaseCategoryName'] = "USER"
        item['ProductName'] = product['ProductName']
        item['TestUrl'] = product['TestUrl']

        published = self.extract(
            entry.xpath('.//span[@itemprop="datePublished"]/text()'))
        item['TestDateText'] = date_format(published, '%d/%m/%Y')

        item['SourceTestRating'] = self.extract(
            entry.xpath('.//span[@itemprop="ratingValue"]/text()'))
        item['Author'] = self.extract(
            entry.xpath('.//span[@itemprop="author"]/text()'))
        item['TestTitle'] = self.extract(
            entry.xpath('.//div[@itemprop="name"]/text()'))
        item['TestSummary'] = self.extract_all(
            entry.xpath('.//blockquote/text()'))
        yield item
def parse_product_review(self, response):
    """Yield a PRO review and its matching product from a review page.

    The review date and the product category are not read from the page;
    they are taken from ``response.meta['date']`` and ``response.meta['cat']``
    (``None`` when absent).

    :param response: the downloaded review page.
    """
    # REVIEW ---------------------------------------------------------------
    review_xpaths = {
        'TestTitle': '//meta[@property="og:title"]/@content',
        'Author': '(//span[@class="author vcard"])[1]/text()',
        'TestSummary': '//meta[@property="og:description"]/@content',
    }
    review = self.init_item_by_xpaths(response, "review", review_xpaths)

    # The product name is the page heading without a trailing
    # " Review" / " review".
    header = self.extract(
        response.xpath("//h1[@class='title-primary']/text()"))
    review['ProductName'] = header.replace(" Review", "").replace(
        " review", "")

    review['TestDateText'] = response.meta.get('date')
    review['DBaseCategoryName'] = 'PRO'

    point_xpath = ('//div[@class="review-points__%s"]//li/'
                   'span[@class="point"]/text()')
    pros = ";".join(response.xpath(point_xpath % 'pros').getall())
    cons = ";".join(response.xpath(point_xpath % 'cons').getall())
    # Store each list independently: requiring both to be non-empty dropped
    # the pros of reviews that list no cons, and vice versa.
    if pros:
        review['TestPros'] = pros
    if cons:
        review['TestCons'] = cons

    # The last URL path segment is used as the internal id.
    review['source_internal_id'] = response.url.split('/')[-1]

    # PRODUCT --------------------------------------------------------------
    product = ProductItem()
    product['source_internal_id'] = review['source_internal_id']
    product['OriginalCategoryName'] = response.meta.get('cat')
    product['ProductName'] = review['ProductName']
    product['PicURL'] = response.xpath(
        '(//meta[@property="og:image"])[1]/@content').get()
    product['TestUrl'] = response.url

    yield review
    yield product