def process(self, html):
    """Parse an Amazon.co.jp product-listing page into a list of item dicts.

    Every ``<li>`` below the ``s-result-list-parent-container`` element
    becomes one dict.  Keys are only present when the matching node exists:
    ``asin``, ``title``, ``image``, ``width``, ``height``, ``price``,
    ``list_price``, ``seller_rating``, ``review_count``,
    ``bestseller_search_index`` and ``bestseller_browse_node_id``.

    :param html: raw page markup; an empty value or the literal string
        ``'None'`` terminates the process through ``sys.exit()``.
    :returns: list of dicts (empty when the page has no result items).
    """
    if not html or html == 'None':
        print("Can't get them html from https://www.amazon.co.jp")
        sys.exit()

    # Converters are built lazily, one Model_Processor per value, exactly as
    # the hand-written stanzas did.
    def strip(value):
        return value.strip()

    def image(value):
        return Model_Processor().formatImage(value)

    def number(value):
        return Model_Processor().formatNumber(value, "co.jp")

    def rating(value):
        return Model_Processor().formatRating(value, "co.jp")

    # (item key, XPath relative to the <li>, converter, message on failure)
    fields = (
        ('asin', "@data-asin", strip, "asin error"),
        ('title',
         "div//a[contains(@class, 's-access-detail-page')]/@title",
         strip, "title error"),
        ('image',
         "div//img[contains(@class, 's-access-image')]/@src",
         image, "image error"),
        ('width',
         "div//img[contains(@class, 's-access-image')]/@width",
         strip, "image error"),
        ('height',
         "div//img[contains(@class, 's-access-image')]/@height",
         strip, "image error"),
        ('price',
         "div//span[contains(@class, 's-price')]/text()",
         number, "price error"),
        ('list_price',
         "div//span[contains(@class, 'a-text-strike')]/text()",
         number, "list_price error"),
        ('seller_rating',
         "div//i[contains(@class, 'a-icon-star')]/span/text()",
         rating, "seller_rating error"),
        ('review_count',
         "div//div[@class='a-row a-spacing-none']/a/text()",
         number, "review_count error"),
    )

    tree = etree.HTML(html)
    data = []
    result_nodes = tree.xpath(
        "//*[contains(@class, 's-result-list-parent-container')]/ul/li")
    for node in result_nodes:
        item = {}

        # Each field is best-effort: a failure is reported and the
        # remaining fields of the item are still collected.
        for key, path, convert, failure in fields:
            try:
                found = node.xpath(path)
                if found:
                    item[key] = convert(found[0])
            except Exception:
                print(failure)

        # Bestseller badge: the href looks like
        # /gp/bestsellers/<search index>/<browse node id>/...
        try:
            links = node.xpath(
                "div//span[contains(@class, 'sx-bestseller')]/a/@href")
            if links:
                parts = links[0].replace("/gp/bestsellers/", "").split("/")
                if len(parts) >= 2:
                    item['bestseller_search_index'] = parts[0]
                if len(parts) > 2:
                    item['bestseller_browse_node_id'] = parts[1]
        except Exception:
            print("bestseller error")

        data.append(item)
    return data
def process(self, html):
    """Parse an Amazon.it seller profile page into a flat dict.

    Collected keys (each only when the matching node exists):
    ``logo_url``, ``name``, ``rating`` and, for every row of the feedback
    summary table, ``feedback_<type>_<period>`` where ``<type>`` is the
    lower-cased result of ``Model_Processor.formatType`` on the row label
    and ``<period>`` is one of ``thirty_days``, ``ninty_days``,
    ``twelve_months`` or ``lifetime``.  A cell showing ``-`` is stored as
    an empty string.

    :param html: raw page markup; an empty value or the literal string
        ``'None'`` terminates the process through ``sys.exit()``.
    :returns: the dict, or ``False`` when nothing could be extracted.
    """
    if not html or html == 'None':
        print("Can't get them html from https://www.amazon.it")
        sys.exit()

    # Table column -> key suffix.  'ninty_days' is misspelled on purpose:
    # it is the key name consumers already receive.
    periods = (
        (2, 'thirty_days'),
        (3, 'ninty_days'),
        (4, 'twelve_months'),
        (5, 'lifetime'),
    )

    tree = etree.HTML(html)
    data = {}

    # Seller logo
    try:
        logo = tree.xpath("//*[@id='sellerLogo']/@src")
        if logo:
            data['logo_url'] = logo[0]
    except Exception:
        print("sellerLogo error")

    # Seller name
    try:
        name = tree.xpath("//*[@id='sellerName']/text()")
        if name:
            data['name'] = name[0].strip()
    except Exception:
        print("sellerName error")

    # Seller rating
    try:
        rating = tree.xpath(
            "//*[@id='seller-feedback-summary']//*[@class='a-icon-alt']/text()"
        )
        if rating:
            data['rating'] = Model_Processor().formatRating(rating[0], "it")
    except Exception:
        print("seller rating error")

    # Feedback summary table
    try:
        rows = tree.xpath("//*[@id='feedback-summary-table']//tr")
        if not rows:
            print("no feedback")
        for row in rows:
            label = row.xpath("td[1]/text()")
            if not label:
                # Rows without a type label (e.g. a header row) carry no
                # data; skipping them avoids reusing the previous row's type.
                continue
            feedback_type = Model_Processor().formatType(
                label[0].strip()).lower()
            for column, period in periods:
                cell = row.xpath("td[%d]/span/text()" % column)
                if not cell:
                    continue
                value = cell[0].strip()
                key = 'feedback_' + feedback_type + '_' + period
                if value != '-':
                    data[key] = Model_Processor().formatNumber(value, "it")
                else:
                    data[key] = ""
    except Exception:
        print("feedback info error")

    if data:
        return data
    return False
def process(self, html, page_id=1):
    """Parse an Amazon search-result page (co.jp number formats) into dicts.

    The returned list contains, in order:

    * one dict per organic/sponsored result ``<li>`` that has a
      ``data-asin`` attribute (``sponsor``, ``page_id``, ``page_position``
      plus whichever of ``sponsor_position_type``, ``sponsor_position``,
      ``asin``, ``title``, ``image``, ``width``, ``height``,
      ``bestseller_node_id``, ``is_fba``, ``price``, ``list_price``,
      ``rating``, ``review_count`` could be extracted);
    * one dict per right-hand "Sponsored" ad
      (``sponsor_position_type == 'right'``);
    * a final ``{"total": <str>}`` dict when the result count is found.

    :param html: raw page markup; an empty value or the literal string
        ``'None'`` terminates the process through ``sys.exit()``.
    :param page_id: page number stored verbatim in every item.
    :returns: the list, or ``None`` when nothing at all was extracted.
    """
    if not html or html == 'None':
        print("Can't get them html from https://www.amazon.com")
        sys.exit()

    locale = "co.jp"

    def digits(raw):
        """Return formatNumber(raw) when it is all digits, else None."""
        value = Model_Processor().formatNumber(raw, locale)
        return value if value.isdigit() else None

    def first_digits(node, paths):
        """Apply digits() to the first path that matches anything.

        Only an *empty* match falls through to the next path; a match that
        is not numeric ends the search, as in the original cascade.
        """
        for path in paths:
            found = node.xpath(path)
            if found:
                return digits(found[0])
        return None

    def fill_result(node, item):
        """Best-effort extraction of the per-result fields into item."""
        # Title
        try:
            title = node.xpath(
                "div//*[contains(@class, 's-access-detail-page')]/h2/text()")
            if title:
                item['title'] = title[0]
        except Exception:
            print("title error")

        # Image and its dimensions
        try:
            image = node.xpath(
                "div//*[contains(@class, 's-access-image')]/@src")
            if image:
                item['image'] = Model_Processor().formatImage(image[0])
        except Exception:
            print("image error")
        for attribute in ('width', 'height'):
            try:
                size = node.xpath(
                    "div//*[contains(@class, 's-access-image')]/@"
                    + attribute)
                if size:
                    item[attribute] = size[0]
            except Exception:
                print(attribute + " error")

        # Bestseller badge: href is .../bestsellers/<category>/<node id>/...
        try:
            badge = node.xpath(
                "div//*[contains(@class, 'sx-badge-rectangle')]/span/text()")
            if badge and badge[0] == "Best Seller":
                links = node.xpath(
                    "div//*[contains(@class, 'sx-badge-region')]/div/a/@href")
                if links:
                    node_id = links[0].strip().split(
                        "bestsellers/")[1].strip().split("/")[1].strip()
                    if node_id.isdigit():
                        item['bestseller_node_id'] = node_id
        except Exception:
            print("bestseller error")

        # FBA flag, derived from the Prime icon label
        try:
            prime = node.xpath(
                "div//*[contains(@class, 'a-icon-prime')]/span/text()")
            if prime and prime[0].strip() == "Prime":
                item['is_fba'] = "1"
            else:
                item['is_fba'] = "0"
        except Exception:
            print("fba error")

        # Price: aria-label first, then visible text, then legacy s-price
        try:
            price = first_digits(node, (
                "div//*[contains(@class, 'a-color-base')]/@aria-label",
                "div//*[contains(@class, 'a-color-base')]/text()",
                "div//*[contains(@class, 's-price')]/text()",
            ))
            if price is not None:
                item['price'] = price
        except Exception:
            print("price error")

        # List (struck-through) price
        try:
            list_price = first_digits(node, (
                "div//*[contains(@class, 'a-text-strike')]/text()",
            ))
            if list_price is not None:
                item['list_price'] = list_price
        except Exception:
            print("list_price error")

        # Star rating
        try:
            rating = node.xpath(
                "div//*[contains(@class, 'a-icon-star')]/span/text()")
            if rating:
                item['rating'] = Model_Processor().formatRating(
                    rating[0], locale)
        except Exception:
            print("rating error")

        # Review count
        try:
            counts = node.xpath(
                "div//*[contains(@class, 'a-span5')]/div/a/text()")
            if counts:
                count = digits(counts[0])
                if count is not None:
                    item['review_count'] = count
            else:
                # Fallback layout: several links share the row, keep the
                # last one whose text is purely numeric.
                for text in node.xpath(
                        "div//*[contains(@class, 'a-row a-spacing-none')]"
                        "/a/text()"):
                    candidate = text.replace(",", "").strip()
                    if candidate.isdigit():
                        count = digits(candidate)
                        if count is not None:
                            item['review_count'] = count
        except Exception:
            print("review_count error")

    def fill_right_ad(node, item):
        """Extract the fields of one right-column sponsored ad into item."""
        # ASIN from the URL-encoded redirect link (last usable link wins).
        for href in node.xpath("div/a/@href"):
            parts = href.split("%2F")
            if len(parts) > 4:
                item['asin'] = parts[4]

        if item.get('asin'):
            review = node.xpath("div/div['a-section']/a[2]/text()")
            if review:
                item['review_count'] = Model_Processor().formatNumber(
                    review[0], locale)
        else:
            # Alternative layout: the first link of the section carries the
            # review count as text and the ASIN in its plain href.
            anchors = node.xpath("div/div['a-section']/a")
            if anchors and anchors[0].text:
                item['review_count'] = anchors[0].text
                hrefs = anchors[0].xpath("@href")
                parts = hrefs[0].split("/") if hrefs else []
                if len(parts) > 4:
                    item['asin'] = parts[4]

        try:
            image = node.xpath("div//*[@alt='Product Details']/@src")
            if image:
                item['image'] = Model_Processor().formatImage(image[0])
        except Exception:
            print("ad image error")

        try:
            title = node.xpath("div['a-section']/div/a/@title")
            if title:
                item['title'] = title[0]
        except Exception:
            print("ad title error")

        try:
            price = first_digits(node, ("div//*/@aria-label",))
            if price is not None:
                item['price'] = price
        except Exception:
            print("ad price error")

        try:
            list_price = first_digits(node, (
                "div//*[contains(@class, 'a-text-strike')]/text()",
            ))
            if list_price is not None:
                item['list_price'] = list_price
        except Exception:
            print("ad list_price error")

        try:
            rating = node.xpath(
                "div//*[contains(@class, 'a-icon-star')]/span/text()")
            if rating:
                item['rating'] = Model_Processor().formatRating(
                    rating[0], locale)
        except Exception:
            print("ad rating error")

    tree = etree.HTML(html)
    data = []
    try:
        # Counters are initialised up front so the right-column ads can be
        # numbered even when the page has no centre results.
        page_position = 1
        sponsor_position = 1
        previous_type = 'top'

        # --- centre column results -------------------------------------
        for node in tree.xpath(
                "//*[@class='a-row s-result-list-parent-container']/ul/li"):
            item = {
                'sponsor': 0,
                'page_id': page_id,
                'page_position': page_position,
            }

            sponsored = node.xpath(
                "div//*[contains(@class, 's-sponsored-list-header')]/text()"
            ) or node.xpath(
                "div//h5[contains(@class, 'a-color-tertiary')]/text()")
            if sponsored:
                item['sponsor'] = 1
                # The first four slots count as the top ad block, anything
                # later as the bottom block; numbering restarts per block.
                position_type = 'top' if page_position <= 4 else 'bottom'
                if position_type != previous_type:
                    sponsor_position = 1
                    previous_type = position_type
                item['sponsor_position_type'] = position_type
                item['sponsor_position'] = sponsor_position
                sponsor_position += 1
            page_position += 1

            try:
                asin = node.xpath("@data-asin")
                if not asin:
                    # Not a product entry; it still consumed a position.
                    continue
                item['asin'] = asin[0]
            except Exception:
                print("asin error")

            fill_result(node, item)
            data.append(item)

        # --- right column ads ------------------------------------------
        try:
            heading = tree.xpath("//*[@id='paRightContent']//h1/text()")
            if heading and heading[0].strip() == "Sponsored":
                sponsor_position = 1
                for node in tree.xpath(
                        "//*[@id='paRightContent']"
                        "//*[contains(@class, 'pa-ad-details')]"):
                    item = {
                        'sponsor': 1,
                        'sponsor_position_type': 'right',
                        'page_id': page_id,
                        'page_position': page_position,
                        'sponsor_position': sponsor_position,
                    }
                    page_position += 1
                    sponsor_position += 1
                    fill_right_ad(node, item)
                    data.append(item)
        except Exception:
            print("right ad error")

        # --- total number of results -----------------------------------
        result_count = tree.xpath("//*[@id='s-result-count']/text()")
        if result_count:
            text = result_count[0]
            words = text.split(" ")
            total = None
            if len(words) > 1:
                # Multi-result form: second word holds "<n>件中".
                total = digits(words[1].replace("件中", ""))
            if total is None:
                # Single-result form: "<n>件の結果".
                total = digits(text.replace("件の結果", ""))
            if total is not None:
                data.append({"total": total})
        else:
            no_results = tree.xpath("//*[@id='noResultsTitle']/span/text()")
            if (len(no_results) > 2 and no_results[1] == "0"
                    and no_results[2] == "検索結果"):
                print(no_results[1])
                print(no_results[2])
                data.append({"total": "0"})
    except Exception as err:
        print(err)

    if data:
        return data
def process(self, html, begin, end):
    """Parse an Amazon.it top-reviewers page into a list of reviewer dicts.

    Rows are looked up by element id ``reviewer<i>`` for every ``i`` in
    ``range(begin, end)``.  Each found row yields a dict with whichever of
    ``rank``, ``profile_image_url``, ``top_reviewer_id``, ``name``,
    ``review_count``, ``helpful_vote_count`` and ``helpful_vote_ratio``
    could be extracted; rows that yield nothing are dropped.

    :param html: raw page markup; an empty value or the literal string
        ``'None'`` terminates the process through ``sys.exit()``.
    :param begin: first reviewer index to look for (inclusive).
    :param end: last reviewer index to look for (exclusive).
    :returns: list of dicts, or ``False`` when no reviewer was extracted.
    """
    if not html or html == 'None':
        print("Can't get them html from https://www.amazon.it")
        sys.exit()

    def number(value):
        return Model_Processor().formatNumber(value, "it")

    def image(value):
        return Model_Processor().formatImage(value)

    def reviewer_id(href):
        # The id is the fifth "/"-separated component of the profile link.
        return href.split("/")[4].strip()

    def strip(value):
        return value.strip()

    # (item key, XPath relative to the reviewer row, converter)
    fields = (
        ('rank', "td[1]/text()", number),
        ('profile_image_url', "td[@class='img']/a/img/@src", image),
        ('top_reviewer_id', "td[3]/a/@href", reviewer_id),
        ('name', "td[3]/a/b/text()", strip),
        ('review_count', "td[4]/text()", number),
        ('helpful_vote_count', "td[5]/text()", number),
        ('helpful_vote_ratio', "td[6]/text()", number),
    )

    tree = etree.HTML(html)
    data = []
    for index in range(begin, end):
        rows = tree.xpath("//*[@id='reviewer" + str(index) + "']")
        if not rows:
            print("Pattern Mismatch: Dom [#reviewer" + str(index)
                  + "] not found.")
            continue

        row = rows[0]
        item = {}
        for key, path, convert in fields:
            # Best-effort per field: report the problem and keep going so
            # one malformed cell does not lose the whole reviewer.
            try:
                found = row.xpath(path)
                if found:
                    item[key] = convert(found[0])
            except Exception as err:
                print(err)
        if item:
            data.append(item)

    if data:
        return data
    print("No available top reviewer data found.")
    return False
def process(self, html, page_id=1):
    """Parse an Amazon.fr search-result page into a list of dicts.

    The returned list contains, in order:

    * one dict per organic/sponsored result ``<li>`` that has a
      ``data-asin`` attribute (``sponsor``, ``page_id``, ``page_position``
      plus whichever of ``sponsor_position_type``, ``sponsor_position``,
      ``asin``, ``title``, ``image``, ``width``, ``height``,
      ``bestseller_node_id``, ``is_fba``, ``price``, ``list_price``,
      ``rating``, ``review_count`` could be extracted);
    * one dict per right-hand "Sponsored" ad
      (``sponsor_position_type == 'right'``);
    * a final ``{"total": <str>}`` dict when the result count is found.

    :param html: raw page markup; an empty value or the literal string
        ``'None'`` terminates the process through ``sys.exit()``.
    :param page_id: page number stored verbatim in every item.
    :returns: the list, or ``None`` when nothing at all was extracted.
    """
    if not html or html == 'None':
        print("Can't get them html from https://www.amazon.fr")
        sys.exit()

    locale = "fr"

    def digits(raw):
        """Return formatNumber(raw) when it is all digits, else None."""
        value = Model_Processor().formatNumber(raw, locale)
        return value if value.isdigit() else None

    def first_digits(node, paths):
        """Apply digits() to the first path that matches anything.

        Only an *empty* match falls through to the next path; a match that
        is not numeric ends the search, as in the original cascade.
        """
        for path in paths:
            found = node.xpath(path)
            if found:
                return digits(found[0])
        return None

    def fill_result(node, item):
        """Best-effort extraction of the per-result fields into item."""
        # Title
        try:
            title = node.xpath(
                "div//*[contains(@class, 's-access-detail-page')]/h2/text()")
            if title:
                item['title'] = title[0]
        except Exception:
            print("title error")

        # Image and its dimensions
        try:
            image = node.xpath(
                "div//*[contains(@class, 's-access-image')]/@src")
            if image:
                item['image'] = Model_Processor().formatImage(image[0])
        except Exception:
            print("image error")
        for attribute in ('width', 'height'):
            try:
                size = node.xpath(
                    "div//*[contains(@class, 's-access-image')]/@"
                    + attribute)
                if size:
                    item[attribute] = size[0]
            except Exception:
                print(attribute + " error")

        # Bestseller badge: href is .../bestsellers/<category>/<node id>/...
        try:
            badge = node.xpath(
                "div//*[contains(@class, 'sx-badge-rectangle')]/span/text()")
            if badge and badge[0] == "Best Seller":
                links = node.xpath(
                    "div//*[contains(@class, 'sx-badge-region')]/div/a/@href")
                if links:
                    node_id = links[0].strip().split(
                        "bestsellers/")[1].strip().split("/")[1].strip()
                    if node_id.isdigit():
                        item['bestseller_node_id'] = node_id
        except Exception:
            print("bestseller error")

        # FBA flag, derived from the premium icon label.
        # NOTE(review): the expected label "Écran" is kept as found; confirm
        # it is the text Amazon.fr actually renders for this icon.
        try:
            premium = node.xpath(
                "div//*[contains(@class, 'a-icon-premium')]/span/text()")
            if premium and premium[0].strip() == "Écran":
                item['is_fba'] = "1"
            else:
                item['is_fba'] = "0"
        except Exception:
            print("fba error")

        # Price: aria-label first, then visible text, then legacy s-price
        try:
            price = first_digits(node, (
                "div//*[contains(@class, 'a-color-base')]/@aria-label",
                "div//*[contains(@class, 'a-color-base')]/text()",
                "div//*[contains(@class, 's-price')]/text()",
            ))
            if price is not None:
                item['price'] = price
        except Exception:
            print("price error")

        # List (struck-through) price
        try:
            list_price = first_digits(node, (
                "div//*[contains(@class, 'a-text-strike')]/text()",
            ))
            if list_price is not None:
                item['list_price'] = list_price
        except Exception:
            print("list_price error")

        # Star rating
        try:
            rating = node.xpath(
                "div//i[contains(@class, 'a-icon-star')]/span/text()")
            if rating:
                item['rating'] = Model_Processor().formatRating(
                    rating[0], locale)
        except Exception:
            print("rating error")

        # Review count, with a fallback layout
        try:
            count = first_digits(node, (
                "div//*[contains(@class, 'a-span5')]/div/a/text()",
                "div//*[contains(@class, 'a-spacing-top-mini')]/a/text()",
            ))
            if count is not None:
                item['review_count'] = count
        except Exception:
            print("review_count error")

    def fill_right_ad(node, item):
        """Extract the fields of one right-column sponsored ad into item."""
        # ASIN from the URL-encoded redirect link (last usable link wins).
        for href in node.xpath("div/a/@href"):
            parts = href.split("%2F")
            if len(parts) > 4:
                item['asin'] = parts[4]

        if item.get('asin'):
            review = node.xpath("div/div['a-section']/a[2]/text()")
            if review:
                item['review_count'] = Model_Processor().formatNumber(
                    review[0], locale)
        else:
            # Alternative layout: the first link of the section carries the
            # review count as text and the ASIN in its plain href.
            anchors = node.xpath("div/div['a-section']/a")
            if anchors and anchors[0].text:
                item['review_count'] = anchors[0].text
                hrefs = anchors[0].xpath("@href")
                parts = hrefs[0].split("/") if hrefs else []
                if len(parts) > 4:
                    item['asin'] = parts[4]

        try:
            image = node.xpath("div//*[@alt='Product Details']/@src")
            if image:
                item['image'] = Model_Processor().formatImage(image[0])
        except Exception:
            print("ad image error")

        try:
            title = node.xpath("div['a-section']/div/a/@title")
            if title:
                item['title'] = title[0]
        except Exception:
            print("ad title error")

        try:
            price = first_digits(node, ("div//*/@aria-label",))
            if price is not None:
                item['price'] = price
        except Exception:
            print("ad price error")

        try:
            list_price = first_digits(node, (
                "div//*[contains(@class, 'a-text-strike')]/text()",
            ))
            if list_price is not None:
                item['list_price'] = list_price
        except Exception:
            print("ad list_price error")

        try:
            rating = node.xpath(
                "div//*[contains(@class, 'a-icon-star')]/span/text()")
            if rating:
                item['rating'] = Model_Processor().formatRating(
                    rating[0], locale)
        except Exception:
            print("ad rating error")

    tree = etree.HTML(html)
    data = []
    try:
        # Counters are initialised up front so the right-column ads can be
        # numbered even when the page has no centre results.
        page_position = 1
        sponsor_position = 1
        previous_type = 'top'

        # --- centre column results -------------------------------------
        for node in tree.xpath(
                "//*[@class='a-row s-result-list-parent-container']/ul/li"):
            item = {
                'sponsor': 0,
                'page_id': page_id,
                'page_position': page_position,
            }

            sponsored = node.xpath(
                "div//*[contains(@class, 's-sponsored-list-header')]/text()"
            ) or node.xpath(
                "div//h5[contains(@class, 'a-color-tertiary')]/text()")
            if sponsored:
                item['sponsor'] = 1
                # The first four slots count as the top ad block, anything
                # later as the bottom block; numbering restarts per block.
                position_type = 'top' if page_position <= 4 else 'bottom'
                if position_type != previous_type:
                    sponsor_position = 1
                    previous_type = position_type
                item['sponsor_position_type'] = position_type
                item['sponsor_position'] = sponsor_position
                sponsor_position += 1
            page_position += 1

            try:
                asin = node.xpath("@data-asin")
                if not asin:
                    # Not a product entry; it still consumed a position.
                    continue
                item['asin'] = asin[0]
            except Exception:
                print("asin error")

            fill_result(node, item)
            data.append(item)

        # --- right column ads ------------------------------------------
        try:
            heading = tree.xpath("//*[@id='paRightContent']//h1/text()")
            if heading and heading[0].strip() == "Sponsored":
                sponsor_position = 1
                for node in tree.xpath(
                        "//*[@id='paRightContent']"
                        "//*[contains(@class, 'pa-ad-details')]"):
                    item = {
                        'sponsor': 1,
                        'sponsor_position_type': 'right',
                        'page_id': page_id,
                        'page_position': page_position,
                        'sponsor_position': sponsor_position,
                    }
                    page_position += 1
                    sponsor_position += 1
                    fill_right_ad(node, item)
                    data.append(item)
        except Exception:
            print("right ad error")

        # --- total number of results -----------------------------------
        result_count = tree.xpath("//*[@id='s-result-count']/text()")
        if result_count:
            text = result_count[0]
            try:
                # Multi-page form: "... sur <n> résultats ..."
                total = Model_Processor().formatNumber(
                    text.split("sur")[1].strip().split("résultats")[0]
                    .replace(" ", "").strip(), locale)
            except Exception:
                # Short form: the count is the first word.
                total = Model_Processor().formatNumber(
                    text.split(" ")[0], locale)
            total = str(total).replace("résultat", "").replace(" ", "")
            if not total.isdigit():
                total = Model_Processor().formatNumber(
                    text.split(" ")[0], locale)
            if total.isdigit():
                data.append({"total": total})
        else:
            no_results = tree.xpath("//*[@id='noResultsTitle']//span/text()")
            if (len(no_results) > 1 and no_results[0] == "0"
                    and no_results[1] == "résultats"):
                print(no_results[0])
                print(no_results[1])
                data.append({"total": "0"})
    except Exception as err:
        print(err)

    if data:
        return data