def parse_color_item(self, response):
    """Emit the color and SKU of one color page, then move to the next color.

    Reads ``index``, ``color_urls``, ``baseItem`` and ``skus`` from
    ``response.meta``.  Yields the ``Color`` item for ``color_urls[index]``,
    appends a single one-size ``SkuItem`` to ``skus`` and finally yields
    either ``baseItem`` (when this was the last color) or a ``Request`` for
    the next color URL that calls back into this method.
    """
    page = Selector(response)
    meta = response.meta
    index = meta['index']
    color_urls = meta['color_urls']
    baseItem = meta['baseItem']
    current = color_urls[index]

    def resized(url):
        # Picture dimensions are selected through the wid/hei query values.
        picture = ImageItem()
        picture['image'] = re.sub(
            r'wid=\d+&hei=\d+', 'wid=1000&hei=1000', url)
        picture['thumbnail'] = re.sub(
            r'wid=\d+&hei=\d+', 'wid=50&hei=50', url)
        return picture

    colorItem = Color()
    colorItem['type'] = 'color'
    colorItem['show_product_id'] = baseItem['show_product_id']
    colorItem['from_site'] = self.name
    colorItem['name'] = current['color_name']
    colorItem['cover'] = current['color_cover']

    main_url = page.xpath(
        '//meta[@property="og:image"]/@content').extract()[0]
    gallery = [resized(main_url)]
    swatch_url = page.xpath(
        '//div[@id="productSwatch"]/img/@src').extract()[0]
    gallery.append(resized(swatch_url))
    colorItem['images'] = gallery
    yield colorItem

    skus = meta['skus']
    price_span = '//div[@class="prodprice saleprice"]/p/span'
    skuItem = SkuItem()
    skuItem['type'] = 'sku'
    skuItem['show_product_id'] = baseItem['show_product_id']
    skuItem['from_site'] = self.name
    skuItem['current_price'] = page.xpath(
        price_span + '[@itemprop="price"]/text()').extract()[0]
    base_prices = page.xpath(
        price_span + '[@class="basePrice"]/text()').extract()
    # Without a separate base price the list price equals the current one.
    if base_prices:
        skuItem['list_price'] = base_prices[0]
    else:
        skuItem['list_price'] = skuItem['current_price']
    skuItem['is_outof_stock'] = False
    skuItem['color'] = current['color_name']
    skuItem['size'] = 'one-size'
    skuItem['id'] = baseItem['show_product_id']
    skus.append(skuItem)

    following = index + 1
    if following != len(color_urls):
        yield Request(
            color_urls[following]['url'],
            callback=self.parse_color_item,
            meta={
                'baseItem': baseItem,
                'color_urls': color_urls,
                'index': following,
                'skus': skus,
            })
        return

    # Last color handled: attach the collected SKUs and emit the product.
    baseItem['skus'] = skus
    yield baseItem
def parse_color_sku(self, response):
    """Build color and SKU items from the color/size JSON response.

    Reads ``baseItem`` and ``images`` (image URL templates) from
    ``response.meta`` and parses ``response.body`` as JSON.  Yields one
    ``Color`` item per entry of ``Colors``, then fills ``skus``, ``colors``
    and ``sizes`` on ``baseItem`` from ``ModelColorSizes`` and yields it.
    """
    baseItem = response.meta['baseItem']
    images_tmp = response.meta['images']
    payload = json.loads(response.body)

    colors = []
    for col in payload['Colors']:
        images = []
        for img in images_tmp:
            imageItem = ImageItem()
            imageItem['thumbnail'] = '%s%s_%s_%s.%s' % (
                img['base'], col['Code10'], img['thum_size'],
                img['index'], img['thum_ext'])
            imageItem['image'] = '%s%s_%s_%s.%s' % (
                img['base'], col['Code10'], img['img_size'],
                img['index'], img['img_ext'])
            images.append(imageItem)

        color = Color()
        color['type'] = 'color'
        color['from_site'] = 'thecorner'
        color['show_product_id'] = baseItem['show_product_id']
        color['images'] = images
        color['name'] = col['Description']
        color['cover_style'] = '#' + col['Rgb']
        colors.append(col['Description'])
        yield color

    skus = []
    sizes = []
    for entry in payload['ModelColorSizes']:
        color_desc = entry['Color']['Description']
        size_desc = entry['Size']['Description']

        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['show_product_id'] = baseItem['show_product_id']
        skuItem['from_site'] = 'thecorner'
        # Build the id as text.  The previous code concatenated a UTF-8
        # encoded byte string with unicode text, which makes Python 2
        # implicitly decode the bytes as ASCII and fail on any non-ASCII
        # color description.
        skuItem['id'] = u'%s*%s' % (color_desc, size_desc)
        skuItem['list_price'] = baseItem['list_price']
        skuItem['current_price'] = baseItem['current_price']
        skuItem['size'] = size_desc
        skuItem['color'] = color_desc
        skuItem['is_outof_stock'] = False
        skuItem['quantity'] = entry['Quantity']
        sizes.append(size_desc)
        skus.append(skuItem)

    baseItem['skus'] = skus
    # De-duplicate; the resulting order is not significant.
    baseItem['colors'] = list(set(colors))
    baseItem['sizes'] = list(set(sizes))
    yield baseItem
def parse_sku_item(self, response):
    """Collect one SKU per size page and emit the item once all are done.

    Reads ``item``, ``index``, ``sku_size_list`` and ``sku_item_url_list``
    from ``response.meta``.  While ``index`` is inside ``sku_size_list`` a
    ``SkuItem`` is built from the decoded response body, appended to
    ``item['skus']`` and the next URL is requested.  The item is yielded
    when ``index`` has passed the last size or when there is no further URL
    to request.
    """
    meta = response.meta
    item = meta['item']
    index = meta['index']
    sku_size_list = meta['sku_size_list']
    sku_item_url_list = meta['sku_item_url_list']

    if index >= len(sku_size_list):
        # Every size was handled; the body of this response is not needed.
        item['sizes'] = sku_size_list
        yield item
        return

    content = demjson.decode(response.body)
    newSizeData = Selector(text=content['newSizeData'])

    skuItem = SkuItem()
    skuItem['type'] = 'sku'
    skuItem['show_product_id'] = item['show_product_id']
    skuItem['color'] = u'one color'
    skuItem['size'] = sku_size_list[index]
    skuItem['id'] = content['prodId']
    skuItem['from_site'] = self.name
    if len(newSizeData.xpath(".//div[@class='outOfStockMsg']")) > 0:
        # Out-of-stock fragments carry no prices; reuse the item's prices.
        skuItem['list_price'] = item['list_price']
        skuItem['current_price'] = item['current_price']
        skuItem['is_outof_stock'] = True
    else:
        # [1:] drops the leading character of the price text
        # (presumably a currency symbol -- TODO confirm).
        skuItem['list_price'] = newSizeData.xpath(
            ".//table/tbody/tr[1]/td/text()").extract()[0].strip()[1:]
        skuItem['current_price'] = newSizeData.xpath(
            ".//td[@class='highlight']/text()").extract()[0].strip()[1:]
        skuItem['is_outof_stock'] = False
    item['skus'].append(skuItem)

    next_index = index + 1
    if next_index >= len(sku_item_url_list):
        # No URL left to fetch: finish here instead of raising IndexError
        # and losing the item.
        item['sizes'] = sku_size_list
        yield item
        return

    yield Request(
        sku_item_url_list[next_index],
        callback=self.parse_sku_item,
        meta={
            "sku_size_list": sku_size_list,
            "sku_item_url_list": sku_item_url_list,
            "item": item,
            "index": next_index,
        })
def handle_parse_item(self, response, baseItem):
    """Fill ``baseItem`` for a product with a single color and size.

    Sets ``dimensions``, ``desc`` and ``brand`` on ``baseItem`` from the
    page, yields one ``Color`` item named ``'one-color'`` holding the single
    product image, then attaches one ``'one-size'`` SKU priced like
    ``baseItem`` and yields ``baseItem``.
    """
    page = Selector(response)
    baseItem['dimensions'] = ['size', 'color']
    baseItem['desc'] = page.xpath(
        '//div[@id="aDescriptionBody"]').extract()[0]
    baseItem['brand'] = page.xpath(
        '//span[@itemprop="brand"]/text()').extract()[0]

    only_sku = SkuItem()
    only_sku['type'] = 'sku'
    only_sku['show_product_id'] = baseItem['show_product_id']
    only_sku['from_site'] = self.name
    only_sku['current_price'] = baseItem['current_price']
    only_sku['list_price'] = baseItem['list_price']
    only_sku['is_outof_stock'] = False
    only_sku['color'] = 'one-color'
    only_sku['size'] = 'one-size'
    only_sku['id'] = baseItem['show_product_id']

    # The src attribute has 'http:' prepended; the same URL serves as both
    # the full image and the thumbnail.
    picture = ImageItem()
    source = page.xpath('//img[@id="ImageUrl"]/@src').extract()[0]
    picture['image'] = 'http:' + source
    picture['thumbnail'] = picture['image']

    only_color = Color()
    only_color['type'] = 'color'
    only_color['show_product_id'] = baseItem['show_product_id']
    only_color['from_site'] = self.name
    only_color['images'] = [picture]
    only_color['name'] = 'one-color'
    yield only_color

    baseItem['skus'] = [only_sku]
    yield baseItem
def handle_color_item(self, response):
    """Emit the color and SKU of the selected color, then chain to the next.

    Reads ``item``, ``image_color_dict``, ``colorUrls`` and ``index`` from
    ``response.meta``.  Yields a ``Color`` item for the color marked as
    selected on the page, appends a one-size ``SkuItem`` to ``item['skus']``
    and then yields either ``item`` (after the last color URL) or a
    ``Request`` for the next color URL handled by this method.
    """
    sel = Selector(response)
    meta = response.meta
    item = meta['item']
    image_color_dict = meta['image_color_dict']
    colorUrls = meta['colorUrls']
    index = meta['index']

    data_colorid = sel.xpath(
        ".//li[@class='selected']/a/@data-colorid").extract()[0]
    product_id = sel.xpath(
        ".//span[@itemprop='productID']/text()").extract()[0]

    colorItem = Color()
    colorItem['images'] = image_color_dict[data_colorid]
    colorItem['type'] = 'color'
    colorItem['from_site'] = self.name
    colorItem['show_product_id'] = product_id
    colorItem['name'] = sel.xpath(
        ".//span[@class='selected-value colorValueLabel']/text()"
    ).extract()[0].strip()
    cover = sel.xpath(".//li[@class='selected']/a/@style").extract()[0]
    # Keep what is inside the parentheses of the style attribute.
    colorItem['cover'] = re.findall(r'\(.+\)', cover)[0][1:-1]
    item['colors'].append(colorItem['name'])
    yield colorItem

    skuItem = SkuItem()
    skuItem['color'] = colorItem['name']
    skuItem['type'] = 'sku'
    skuItem['show_product_id'] = product_id

    full_price = sel.xpath(
        ".//span[@data-event-label='Full Price']/@data-event-value")
    if len(full_price) > 0:
        price = full_price.extract()[0].strip()
        skuItem['list_price'] = price
        skuItem['current_price'] = price
    else:
        # No single price on the page: use the minimum/maximum pair.
        skuItem['current_price'] = sel.xpath(
            ".//span[@data-event-label='Full Minimum Price']"
            "/@data-event-value").extract()[0].strip()
        skuItem['list_price'] = sel.xpath(
            ".//span[@data-event-label='Full Maximum Price']"
            "/@data-event-value").extract()[0].strip()

    skuItem['size'] = u'one size'
    skuItem['id'] = data_colorid
    skuItem['from_site'] = self.name

    sku_quantity = sel.xpath(
        ".//select[@name='Quantity']/@data-available").extract()[0]
    # The attribute is extracted as text, so it has to be converted before
    # comparing; comparing the raw string with the integer 0 is never true.
    try:
        skuItem['is_outof_stock'] = int(sku_quantity) <= 0
    except ValueError:
        # Non-numeric value: keep treating the SKU as available.
        skuItem['is_outof_stock'] = False

    item['skus'].append(skuItem)
    item['sizes'] = [u'one size']

    index = index + 1
    if index >= len(colorUrls):
        item['colors'] = list(set(item['colors']))
        yield item
    else:
        yield Request(
            colorUrls[index],
            callback=self.handle_color_item,
            meta={
                'image_color_dict': image_color_dict,
                'colorUrls': colorUrls,
                'index': index,
                'item': item,
            })
def handle_parse_item(self, response, item):
    """Fill ``item`` from a product page and request its stock information.

    Sets id, title, brand, description, prices, colors, sizes, SKUs and
    dimensions on ``item``, yields a single ``'One Color'`` ``Color`` item
    and finally yields a ``Request`` to the ``checkQty`` endpoint whose
    callback is ``self.parse_stock`` (``item`` travels in ``meta``).
    """
    page = Selector(response)
    body = response.body

    item['show_product_id'] = re.search(
        r'(\d+)\.html', response.url).group(1)
    item['title'] = page.xpath(
        '//div[@class="product-name"]/h1/text()').extract()[0].strip()
    brand_match = re.search(r"brand: '(.+)'", body)
    # Fall back to the site name when the page declares no brand.
    item['brand'] = brand_match.group(1) if brand_match else 'pharmacyonline'

    img = re.search(r"imgUrl: '(.+)'", body).group(1)
    picture = ImageItem()
    picture['thumbnail'] = (
        img +
        '?imageMogr2/thumbnail/380x380/extent/380x380/background/d2hpdGU=')
    picture['image'] = img
    images = [picture]

    item['colors'] = ['One Color']
    color = Color()
    color['type'] = 'color'
    color['from_site'] = item['from_site']
    color['show_product_id'] = item['show_product_id']
    color['images'] = images
    color['name'] = 'One Color'
    color['cover'] = (
        images[0]['image'] +
        '?imageMogr2/thumbnail/100x100/extent/100x100/background/d2hpdGU=')
    yield color

    item['desc'] = page.xpath(
        '//div[@class="product-collateral"]').extract()[0]

    single_price = page.xpath(
        '//div[@class="DetailNoDis PriceNow last_price_sing"]/span/text()')
    if len(single_price) > 0:
        item['current_price'] = single_price.extract()[0]
        item['list_price'] = item['current_price']
    else:
        # Otherwise read the separate "now" and "was" prices.
        item['current_price'] = page.xpath(
            '//div[@class="DetailPriceContain clearfix"]'
            '//div[@class="PriceNow"]/text()').extract()[0].strip()
        item['list_price'] = page.xpath(
            '//div[@class="DetailPriceContain clearfix"]'
            '//p[@class="PriceWas"]/text()').extract()[0].strip()

    item['sizes'] = ['One Size']
    skuItem = SkuItem()
    skuItem['type'] = "sku"
    skuItem['from_site'] = item['from_site']
    sku_text = page.xpath(
        '//div[@class="DetailSku"]/text()').extract()[0].strip()
    skuItem['id'] = re.search(r'(\d+)', sku_text).group(1)
    skuItem['show_product_id'] = item['show_product_id']
    skuItem['current_price'] = item['current_price']
    skuItem['list_price'] = item['list_price']
    skuItem['size'] = 'One Size'
    skuItem['color'] = 'One Color'
    item['skus'] = [skuItem]
    item['dimensions'] = ['size']

    shown_id = item['show_product_id']
    # Ids longer than six characters lose their first character before
    # being sent to the stock endpoint.
    product_id = shown_id[1:] if len(shown_id) > 6 else shown_id
    stock_url = ('http://cn.pharmacyonline.com.au/pt_catalog/index/'
                 'checkQty?product_id=' + product_id)
    yield Request(
        stock_url,
        callback=self.parse_stock,
        meta={"item": item},
        dont_filter=True)
def handle_parse_item(self, response, item):
    """Fill ``item`` from the ``ProductDetails`` JSON embedded in the page.

    Locates the ``application/json`` script holding ``ProductDetails``,
    evaluates it through ``execjs`` and uses its first main product to set
    brand, title, id, description, prices, dimensions, colors, sizes, SKUs
    and (when enabled) the size guide URL on ``item``.  Yields one ``Color``
    item per color (or a single ``'onecolor'`` item) and then ``item``.
    Yields nothing when the script block is missing.
    """
    match = re.search(
        r'<script type\=\"application\/json\">({"ProductDetails".+?)<\/script>',
        response.body)
    # Check the match before touching it; a page without the script block
    # is skipped rather than raising AttributeError.
    if match is None:
        return
    sel = Selector(response)
    context = execjs.compile('''
        var json = %s
        function getJson(){
            return json;
        }
    ''' % match.group(1))
    product_json = context.call('getJson')
    main_product = product_json['ProductDetails']['main_products'][0]

    def lower_bound(price):
        # For a range such as "10.00 - 20.00" keep the number in front of
        # the dash; other values are returned unchanged.
        if '-' not in price:
            return price
        found = re.search(r'([\d\.]+)\s*\-', price)
        return found.group(1) if found else price

    image_prefix = ('http:' + main_product['media']['images_server_url'] +
                    main_product['media']['images_path'])

    def gallery_image(path):
        # Large and thumbnail renditions of one image path.
        imageItem = ImageItem()
        imageItem['image'] = image_prefix + path + '?wid=970&hei=1293&fmt=jpg'
        imageItem['thumbnail'] = (
            image_prefix + path + '?wid=396&hei=528&fmt=jpg')
        return imageItem

    item['brand'] = main_product['brand_name']['label']
    item['title'] = main_product['short_description']
    item['show_product_id'] = main_product['product_code']
    item['desc'] = main_product['description']
    # Range prices used to be parsed and then thrown away, leaving the
    # price fields unset; they are now stored.
    item['list_price'] = lower_bound(
        main_product['price']['list_price']['usd_currency_value'])
    item['current_price'] = lower_bound(
        main_product['price']['sale_price']['usd_currency_value'])
    item['dimensions'] = ['size']

    colors = main_product['colors']['colors']
    handle_color_map = {}
    for color in colors:
        handle_color_map[color['id']] = color['label']

    sizes = {'size': []}
    handle_size_map = {}
    if len(main_product['sizes']['sizes']) == 0:
        sizes['size'].append('onesize')
    else:
        for size in main_product['sizes']['sizes']:
            handle_size_map[size['id']] = size['value']
            sizes['size'].append(size['value'])

    common_images = main_product['media']['images']
    color_names = []

    if len(colors) == 0:
        color_names.append('onecolor')
        images = [gallery_image(path) for path in common_images]
        colorItem = Color()
        colorItem['type'] = 'color'
        colorItem['from_site'] = item['from_site']
        colorItem['show_product_id'] = item['show_product_id']
        colorItem['images'] = images
        colorItem['name'] = 'onecolor'
        colorItem['cover'] = images[0]['thumbnail']
        colorItem['version'] = '1'
        yield colorItem
    else:
        # Swatch <li> class values, in the order they are tried.
        swatch_classes = (
            'product-color-options__value',
            'product-color-options__value '
            'product-color-options__value--unavailable',
            'product-color-options__value '
            'product-color-options__value--selected',
            'product-color-options__value is-hidden',
        )
        for color in colors:
            color_name = color['label']
            color_names.append(color_name)
            # The colorized image comes first, then the shared images.
            images = [gallery_image(color['colorize_image_url'])]
            first_thumbnail = images[0]['thumbnail']
            images.extend(gallery_image(path) for path in common_images)

            colorItem = Color()
            colorItem['type'] = 'color'
            colorItem['from_site'] = item['from_site']
            colorItem['show_product_id'] = item['show_product_id']
            colorItem['images'] = images
            colorItem['name'] = color_name
            colorItem['version'] = '1'

            if not color['value']:
                colorItem['cover'] = first_thumbnail
            elif '#' in color['value']:
                # A value containing '#' is used as the cover style itself.
                colorItem['cover_style'] = color['value']
            else:
                for swatch_class in swatch_classes:
                    styles = sel.xpath(
                        '//li[@class="' + swatch_class +
                        '" and @data-colorid=' + str(color["id"]) +
                        ']/@style').extract()
                    if styles:
                        cover_img = re.search(
                            r'\((.+)\)', styles[0]).group(1)
                        colorItem['cover'] = 'http:' + cover_img
                        break
                else:
                    # No swatch element found for this color.
                    colorItem['cover'] = first_thumbnail
            yield colorItem

    item['colors'] = color_names

    skus = []
    for sku in main_product['skus']['skus']:
        sku_id = sku['sku_id']
        if sku_id == 'DUMMY':
            continue
        # An id of -1 means the SKU has no color / size of its own.
        if sku['color_id'] == -1:
            color_name = 'onecolor'
        else:
            color_name = handle_color_map[sku['color_id']]
        if sku['size_id'] == -1:
            size = 'onesize'
        else:
            size = handle_size_map[sku['size_id']]

        sale_price = sku['price']['sale_price']['usd_currency_value']
        list_price = sku['price']['list_price']['usd_currency_value']
        # SKUs lacking either price are left out.
        if len(sale_price) == 0 or len(list_price) == 0:
            continue

        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['show_product_id'] = item['show_product_id']
        skuItem['from_site'] = item['from_site']
        skuItem['id'] = sku_id
        skuItem['size'] = size
        skuItem['color'] = color_name
        skuItem['is_outof_stock'] = (
            sku['status_alias'] in ('soldout', 'waitlist'))
        skuItem['current_price'] = sale_price
        skuItem['list_price'] = list_price
        skus.append(skuItem)

    item['sizes'] = sizes
    item['skus'] = skus

    if main_product['size_guide_link']['enabled'] == True:  # noqa: E712
        # Store the size guide URL without its query string.
        item['size_info'] = (
            main_product['size_guide_link']['url'].split('?', 1)[0])
    yield item
def handle_parse_item(self, response, item):
    """Fill ``item`` from a product page and request its stock and prices.

    URLs under ``/mp_sp/`` only yield a ``Request`` for the link found in
    ``li#mp_li_cnti`` (callback ``self.parse_item``).  For other pages the
    product JSON passed to ``view('...')`` in the page source is decoded and
    used to set description, title, brand, prices, id, SKUs, sizes,
    dimensions and colors on ``item``; one ``Color`` item is yielded per
    color, followed by a ``Request`` to the stock/price API (callback
    ``self.parse_stock``).  Yields nothing when the JSON or its ``price``
    entry is missing.
    """
    if re.match(r'^http:\/\/us\.asos\.com\/mp_sp\/', response.url):
        sel = Selector(response)
        url = sel.xpath('//li[@id="mp_li_cnti"]/a/@href').extract()[0]
        yield Request(
            url,
            callback=self.parse_item,
            cookies={'asos': 'currencyid=1'},
            meta={'item': item})
        return

    sel = Selector(response)
    json_match = re.search(r"view\('(.+\})'\,", response.body)
    if not json_match:
        return
    # The JSON sits inside an escaped string literal; undo the escaping
    # first.  NOTE: the "string-escape" codec exists on Python 2 only.
    goods_detail = json.loads(json_match.group(1).decode("string-escape"))

    descs = sel.xpath('//div[@class="overflow-container"]/div/div')
    item['desc'] = ''.join(desc.extract() for desc in descs)
    item['title'] = goods_detail['name']
    if 'brandName' in goods_detail:
        item['brand'] = goods_detail['brandName']
    else:
        item['brand'] = 'asos'
    item['from_site'] = self.name

    if 'price' not in goods_detail:
        return
    price = goods_detail['price']
    item['current_price'] = price['current']
    # List price: previous price, else RRP, else the current price.
    if float(price['previous']) != 0:
        item['list_price'] = price['previous']
    elif float(price['rrp']) != 0:
        item['list_price'] = price['rrp']
    else:
        item['list_price'] = price['current']
    item['show_product_id'] = goods_detail['id']

    skus = []
    sizes = []
    colors = []
    for sku in goods_detail['variants']:
        skuItem = SkuItem()
        skuItem['type'] = "sku"
        skuItem['from_site'] = self.name
        skuItem['is_outof_stock'] = False
        skuItem['id'] = sku['variantId']
        skuItem['show_product_id'] = goods_detail['id']
        skuItem['current_price'] = item['current_price']
        skuItem['list_price'] = item['list_price']
        skuItem['size'] = sku['size']
        if sku['size'] not in sizes:
            sizes.append(sku['size'])
        skuItem['color'] = sku['colour']
        if sku['colour'] not in colors:
            colors.append(sku['colour'])
        skus.append(skuItem)

    def image_matches(image_colour, color_name):
        # Images without a colour belong to every color.  Otherwise the two
        # names must have the same length and, scaled by the similarity
        # ratio, differ by at most one character.
        if image_colour == '':
            return True
        if not (image_colour and color_name):
            return False
        if len(image_colour) != len(color_name):
            return False
        ratio = difflib.SequenceMatcher(
            None, color_name, image_colour).ratio()
        return len(color_name) - ratio * len(color_name) <= 1

    for color_name in colors:
        images = []
        for image in goods_detail['images']:
            if image_matches(image['colour'], color_name):
                imageItem = ImageItem()
                imageItem['image'] = image['url'] + '?$XXL$'
                imageItem['thumbnail'] = image['url']
                images.append(imageItem)

        color = Color()
        color['type'] = 'color'
        color['from_site'] = self.name
        color['show_product_id'] = goods_detail['id']
        color['images'] = images
        color['name'] = color_name
        # Raises IndexError when no image matched this color.
        color['cover'] = images[0]['image']
        yield color

    item['skus'] = skus
    item['sizes'] = list(set(sizes))
    item['dimensions'] = ['size']
    item['colors'] = colors

    # The query parameter is "currency"; it had been mangled to "¤cy",
    # which dropped the currency selection from both API URLs.
    related_products_url = (
        'http://us.asos.com/api/product/catalogue/v2/productgroups/ctl/' +
        str(item['show_product_id']) + '?store=US&store=US&currency=USD')
    stock_url = (
        'http://us.asos.com/api/product/catalogue/v2/stockprice'
        '?productIds=' + str(goods_detail['id']) +
        '&store=US&currency=USD')
    yield Request(
        stock_url,
        callback=self.parse_stock,
        meta={'item': item, 'related_products_url': related_products_url})
def handle_parse_item(self, response, item):
    """Fill ``item`` from a product page offering at most one color.

    Sets id, brand, title, description and prices on ``item``, collects the
    thumbnail/zoom image pairs, yields one ``Color`` item and then ``item``
    with one SKU per size.  Yields nothing when the page has no image
    thumbnails or lists more than one color option.
    """
    page = Selector(response)
    body = response.body

    item['show_product_id'] = str(
        re.search(r'productID: "(\d+)"', body).group(1)).strip()
    item['brand'] = str(
        re.search(r'productBrand: "(.+)"', body).group(1)).strip()
    item['title'] = page.xpath(
        './/h1[@data-track="product-title"]/text()').extract()[0].strip()
    paragraphs = page.xpath('//div[@itemprop="description"]/p').extract()
    item['desc'] = ''.join(paragraphs).strip()
    item['current_price'] = page.xpath(
        '//span[@class="price"]/text()').extract()[0].strip()

    rrp = re.search(r'rrp: .+\&\#36\;([\d\.]+).+', body)
    # Without an RRP in the page source the list price is the current price.
    item['list_price'] = rrp.group(1) if rrp else item['current_price']

    thumb_boxes = page.xpath(
        '//div[@class="product-thumb-box '
        'productImageZoom__thumbnailContainer "]')
    if not thumb_boxes:
        return
    images = []
    for box in thumb_boxes:
        picture = ImageItem()
        picture['thumbnail'] = box.xpath('./img/@src').extract()[0]
        # The full-size URL is the href of the thumbnail box's parent.
        picture['image'] = box.xpath('./parent::*/@href').extract()[0]
        images.append(picture)

    color_names = page.xpath(
        '//select[@id="opts-2"]/option[position()>1]/text()').extract()
    if len(color_names) > 1:
        return
    if not color_names:
        color_names = ['One Color']
    item['colors'] = color_names

    color = Color()
    color['type'] = 'color'
    color['from_site'] = item['from_site']
    color['show_product_id'] = item['show_product_id']
    color['images'] = images
    color['name'] = color_names[0]
    color['cover'] = images[0]['thumbnail']
    yield color

    sizes = page.xpath(
        '//select[@id="opts-1"]/option[position()>1]/text()').extract()
    if not sizes:
        sizes = ['One Size']
    item['sizes'] = sizes

    def build_sku(size, color_name):
        entry = SkuItem()
        entry['type'] = "sku"
        entry['from_site'] = item['from_site']
        entry['id'] = (
            item['show_product_id'] + '-' + color_name + '-' + size)
        entry['show_product_id'] = item['show_product_id']
        entry['current_price'] = item['current_price']
        entry['list_price'] = item['list_price']
        entry['size'] = size
        entry['color'] = color_name
        return entry

    skus = []
    for size in sizes:
        for color_name in color_names:
            skus.append(build_sku(size, color_name))
    item['skus'] = skus
    item['dimensions'] = ['size']
    yield item
def handle_parse_item(self, response, item):
    """Fill ``item`` from a single-color product page.

    Yields nothing when the product form holds more than one ``meta``
    element or the page shows sold-out details.  Otherwise yields one
    ``'One Color'`` ``Color`` item built from the page images, then sets
    SKUs (from the size dropdown, or one ``'One Size'`` SKU), gender,
    colors, sizes, description, related item ids and, when a HEAD request
    to the product video URL succeeds, ``media_url`` on ``item`` and
    yields it.
    """
    sel = Selector(response)
    if len(sel.xpath('//form[@id="product-form"]//meta').extract()) > 1:
        return
    if len(sel.xpath('//div[@class="sold-out-details"]')) > 0:
        return

    item['show_product_id'] = sel.xpath(
        '//div[@class="product-code"]/span/text()').extract()[0].strip()

    imgs = sel.xpath(
        '//div[@class="container-imagery"]'
        '//ul[@class="thumbnails no-carousel"]/li/img/@src').extract()
    if len(imgs) == 0:
        imgs = sel.xpath(
            '//div[@class="container-imagery"]'
            '//ul[@class="swiper-wrapper"]/li/img/@src').extract()

    images = []
    for img in imgs:
        # Add a scheme only when the URL has none; the previous substring
        # test also prefixed 'http:' onto URLs already starting 'https:'.
        if not img.startswith(('http:', 'https:')):
            img = 'http:' + img
        if 'xs.jpg' in img:
            img = img.replace('xs.jpg', 'pp.jpg')
        imageItem = ImageItem()
        imageItem['image'] = img
        imageItem['thumbnail'] = img.replace('pp.jpg', 'm.jpg')
        images.append(imageItem)

    colorItem = Color()
    colorItem['images'] = images
    colorItem['type'] = 'color'
    colorItem['from_site'] = item['from_site']
    colorItem['show_product_id'] = item['show_product_id']
    colorItem['name'] = 'One Color'
    colorItem['cover'] = images[0]['image'].replace('pp.jpg', 'xs.jpg')
    yield colorItem

    # NOTE(review): on Python 2 without "from __future__ import division"
    # this is integer division and drops any fractional part -- confirm
    # that this is intended.
    price = int(
        sel.xpath('//form[@id="product-form"]/meta/@data-price-full')
        .extract()[0]) / 100

    def build_sku(sku_id, size, stock_level):
        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['show_product_id'] = item['show_product_id']
        skuItem['list_price'] = price
        skuItem['current_price'] = price
        skuItem['color'] = 'One Color'
        skuItem['size'] = size
        skuItem['id'] = sku_id
        skuItem['from_site'] = item['from_site']
        skuItem['is_outof_stock'] = (
            stock_level not in ('In_Stock', 'Low_Stock'))
        return skuItem

    item['skus'] = []
    dropdown = sel.xpath('//select-dropdown[@class="sku"]/@options')
    if len(dropdown) > 0:
        # The dropdown's options attribute holds a JSON list of SKUs.
        sizes = []
        for sku in json.loads(dropdown.extract()[0]):
            size = sku['data']['size']
            sizes.append(size)
            item['skus'].append(
                build_sku(sku['id'], size, sku['stockLevel']))
    else:
        sizes = ['One Size']
        sku_id = sel.xpath('//input [@class="sku"]/@value').extract()[0]
        stock_level = sel.xpath(
            '//input [@class="sku"]/@data-stock').extract()[0]
        item['skus'].append(build_sku(sku_id, 'One Size', stock_level))

    item['gender'] = self.gender
    item['colors'] = ['One Color']
    item['sizes'] = sizes

    # The description is the first match of each of these, when present.
    item['desc'] = ''
    for desc_xpath in (
            '//widget-show-hide[@id="accordion-1"]//ul/li',
            '//widget-show-hide[@id="accordion-2"]//ul/li',
            '//widget-show-hide[@id="accordion-2"]//p'):
        nodes = sel.xpath(desc_xpath)
        if len(nodes) > 0:
            item['desc'] = item['desc'] + nodes.extract()[0]

    product_items = sel.xpath(
        '//widget-show-hide[@name="Editor\'s Notes"]'
        '/div[@class="show-hide-content"]/div/p/a')
    # The last path segment of each linked href is the related product id.
    related_items_id = [
        link.xpath('./@href').extract()[0].split('/')[-1]
        for link in product_items
    ]
    if related_items_id:
        item['related_items_id'] = related_items_id

    media_url = ('https://video.net-a-porter.com/videos/productPage/' +
                 item['show_product_id'] + '_detail.mp4')
    try:
        # Best-effort probe for a product video.  The timeout keeps this
        # blocking call from stalling the callback indefinitely.
        req = requests.head(media_url, timeout=10)
        if req.ok:
            item['media_url'] = media_url
    except Exception as e:
        logging.error('error media url: ' + media_url + ' error msg: ' +
                      str(e))
    yield item
def parse_color_item(self, response):
    """Emit the color and SKUs of one color page, then chain to the next.

    Reads ``baseItem``, ``skus``, ``color_data`` and ``index`` from
    ``response.meta``.  Yields a ``Color`` item for ``color_data[index]``
    with the images of the thumbnail carousel, appends one ``SkuItem`` per
    size to ``skus`` and then yields either ``baseItem`` (after the last
    color) or a ``Request`` for the next color URL handled by this method.
    """
    page = Selector(response)
    meta = response.meta
    baseItem = meta['baseItem']
    skus = meta['skus']
    color_data = meta['color_data']
    index = meta['index']

    images = []
    for entry in page.xpath('//ul[@id="thumbnail-carousel"]/li'):
        thumbs = entry.xpath(
            './/img[contains(@class, "productthumbnail")]/@src').extract()
        # Carousel entries without a product thumbnail are skipped.
        if not thumbs:
            continue
        picture = ImageItem()
        picture['thumbnail'] = thumbs[0]
        picture['image'] = entry.xpath('./a/@href').extract()[0]
        images.append(picture)

    colorItem = Color()
    colorItem['show_product_id'] = baseItem['show_product_id']
    colorItem['type'] = 'color'
    colorItem['from_site'] = 'katespade'
    colorItem['name'] = color_data[index]['name']
    colorItem['cover'] = color_data[index]['cover']
    colorItem['images'] = images
    yield colorItem

    found_sizes = page.xpath(
        '//ul[contains(@class, "swatches size")]'
        '/li[@class="emptyswatch"]/a/text()').re('(.+)')
    sizes = found_sizes if len(found_sizes) != 0 else ['one-size']

    # Both values are the same for every size of this page; ``sizes`` is
    # never empty, so reading them once matches reading them per size.
    product_id = page.xpath('//input[@id="pid"]/@value').extract()[0]
    sale_price = page.xpath(
        './/span[@class="price-sales"]/text()').extract()[0]
    for size in sizes:
        skuItem = SkuItem()
        skuItem['show_product_id'] = product_id
        skuItem['type'] = 'sku'
        skuItem['from_site'] = 'katespade'
        skuItem['id'] = colorItem['name'] + '-' + size
        skuItem['current_price'] = sale_price
        skuItem['list_price'] = baseItem['list_price']
        skuItem['is_outof_stock'] = False
        skuItem['color'] = colorItem['name']
        skuItem['size'] = size
        skus.append(skuItem)

    following = index + 1
    if following != len(color_data):
        yield Request(
            color_data[following]['url'],
            callback=self.parse_color_item,
            meta={
                'baseItem': baseItem,
                'skus': skus,
                'color_data': color_data,
                'index': following,
            })
        return

    baseItem['skus'] = skus
    yield baseItem
def handle_parse_item(self, response, baseItem):
    """Fill ``baseItem`` from a katespade product page and emit its items.

    Returns without yielding when the page has no ``pid`` input or shows the
    out-of-stock message.  Otherwise yields the ``Color`` item of the colour
    shown on the page and then either the finished ``baseItem`` or a
    ``Request`` to ``parse_color_item`` for the first non-selected colour
    (which continues the chain and eventually yields ``baseItem``).
    """
    sel = Selector(response)

    pid_values = sel.xpath('//input[@id="pid"]/@value').extract()
    if len(pid_values) == 0:
        return
    product_id = pid_values[0]
    if len(sel.xpath('//p[@class="not-available-msg out-of-stock"]')) > 0:
        return

    # --- prices -----------------------------------------------------------
    # A price range such as "$10 - $20" is reduced to the number after the
    # dash.  The original repeated the "price-sales" test inside the branch
    # where it had already failed; that unreachable branch is removed.
    sales_prices = sel.xpath('.//span[@class="price-sales"]/text()').extract()
    if len(sel.xpath('//span[contains(@class, "price-standard")]')) > 0:
        baseItem['list_price'] = sel.xpath(
            './/span[@class="price-standard"]/text()').extract()[0]
        if len(sales_prices) > 0:
            baseItem['current_price'] = sales_prices[0]
        else:
            current_price = sel.xpath(
                './/span[@class="price-sales range-sale-price"]/text()'
            ).extract()[0]
            if '-' in current_price:
                current_price = re.search(r'-\s*\$([\d\.]+)',
                                          current_price).group(1)
            baseItem['current_price'] = current_price
    elif len(sales_prices) > 0:
        baseItem['list_price'] = sales_prices[0]
        baseItem['current_price'] = baseItem['list_price']
    else:
        current_price = sel.xpath(
            './/div[@class="product-price sale"]/div/text()'
        ).extract()[0].strip()
        if '-' in current_price:
            current_price = re.search(r'-\s*\$([\d\.]+)',
                                      current_price).group(1)
        baseItem['current_price'] = current_price
        baseItem['list_price'] = baseItem['current_price']

    baseItem['show_product_id'] = product_id
    baseItem['dimensions'] = ['size', 'color']
    baseItem['brand'] = 'katespade'

    desc_list = sel.xpath('//div[@class="description-details"]').extract()
    if len(desc_list) == 0:
        baseItem['desc'] = sel.xpath(
            '//div[@class="description-details one-column"]').extract()[0]
    else:
        baseItem['desc'] = desc_list[0]

    sizes = sel.xpath(
        '//ul[contains(@class, "swatches size")]/li[@class="emptyswatch"]/a/text()'
    ).re('(.+)')
    if len(sizes) == 0:
        sizes = ['one-size']
    baseItem['sizes'] = sizes

    def build_size_skus(color_name):
        # One SKU per size, all carrying the base item's prices.
        built = []
        for size in sizes:
            skuItem = SkuItem()
            skuItem['show_product_id'] = product_id
            skuItem['type'] = 'sku'
            skuItem['from_site'] = 'katespade'
            skuItem['id'] = color_name + '-' + size
            skuItem['current_price'] = baseItem['current_price']
            skuItem['list_price'] = baseItem['list_price']
            skuItem['is_outof_stock'] = False
            skuItem['color'] = color_name
            skuItem['size'] = size
            built.append(skuItem)
        return built

    # NOTE(review): the source formatting was collapsed; carousel entries
    # without a product thumbnail are assumed to be skipped entirely.
    images = []
    for thumbnail_li in sel.xpath('//ul[@id="thumbnail-carousel"]/li'):
        thumbnail = thumbnail_li.xpath(
            './/img[contains(@class, "productthumbnail")]/@src').extract()
        if len(thumbnail) > 0:
            imageItem = ImageItem()
            imageItem['thumbnail'] = thumbnail[0]
            imageItem['image'] = thumbnail_li.xpath('./a/@href').extract()[0]
            images.append(imageItem)

    color_lis = sel.xpath(
        '//ul[contains(@class, "swatches Color clearfix")]/li')
    if len(color_lis) > 1:
        # The last <li> is dropped when several exist.
        # NOTE(review): presumably it is not a real colour swatch - confirm
        # against the page markup.
        color_lis = color_lis[:-1]

    skus = []
    if len(color_lis) == 0:
        # No swatches at all: emit a single synthetic colour (no cover).
        baseItem['colors'] = ['one-color']
        colorItem = Color()
        colorItem['show_product_id'] = product_id
        colorItem['type'] = 'color'
        colorItem['from_site'] = 'katespade'
        colorItem['name'] = 'one-color'
        colorItem['images'] = images
        yield colorItem
        skus.extend(build_size_skus(colorItem['name']))
        baseItem['skus'] = skus
        yield baseItem
        return

    baseItem['colors'] = color_lis.xpath(
        './/span[@class="title"]/text()').extract()

    # The colour shown on this page is the selected swatch, or the first
    # swatch when none is marked as selected.
    color_selected = sel.xpath(
        '//ul[contains(@class, "swatches Color clearfix")]/li[@class="selected"]'
    )
    if len(color_selected) == 0:
        shown_swatch = color_lis[0]
    else:
        shown_swatch = color_selected[0]

    colorItem = Color()
    colorItem['show_product_id'] = product_id
    colorItem['type'] = 'color'
    colorItem['from_site'] = 'katespade'
    colorItem['name'] = shown_swatch.xpath(
        './span[@class="title"]/text()').extract()[0]
    colorItem['cover'] = shown_swatch.xpath('./a/img/@src').extract()[0]
    colorItem['images'] = images
    yield colorItem
    skus.extend(build_size_skus(colorItem['name']))

    color_lis_not_selected = sel.xpath(
        '//ul[contains(@class, "swatches Color clearfix")]/li[@class="emptyswatch"]'
    )
    no_more_colors = len(color_lis_not_selected) == 0 or (
        len(color_lis_not_selected) == 1 and len(color_selected) == 0)
    if no_more_colors:
        baseItem['skus'] = skus
        yield baseItem
        return

    # Remaining colours live on their own pages; parse_color_item walks
    # them one by one and yields baseItem after the last.
    color_data = []
    for color_li in color_lis_not_selected:
        color_data.append({
            'name': color_li.xpath(
                './span[@class="title"]/text()').extract()[0],
            'cover': color_li.xpath('./a/img/@src').extract()[0],
            'url': color_li.xpath('./a/@href').extract()[0]
        })
    yield Request(color_data[0]['url'],
                  callback=self.parse_color_item,
                  meta={
                      'baseItem': baseItem,
                      'skus': skus,
                      'color_data': color_data,
                      'index': 0
                  })
def handle_parse_item(self, response, item):
    """Fill ``item`` from a rebeccaminkoff product page and emit its items.

    Returns without yielding when the page shows the out-of-stock marker.
    For configurable products (a ``product-options`` block is present) the
    colours, images and SKUs are read from the ``spConfig`` JSON embedded in
    the page; otherwise a single "onecolor"/"onesize" colour and SKU are
    built from the thumbnail list.  Yields every ``Color`` item followed by
    ``item``.
    """
    sel = Selector(response)
    outof_stock_content = sel.xpath(
        '//div[@class="item-availability"]/span[@class="out-of-stock"]'
    ).extract()
    if len(outof_stock_content) > 0:
        return

    title = sel.xpath(
        '//div[contains(@class, "product-name")]//span/text()').extract()[0]
    show_product_id = sel.xpath(
        '//div[contains(@class, "no-display")]//input[1]/@value').extract()[0]
    desc_tmp = sel.xpath('//div[contains(@class, "tab-content")]').extract()

    item['type'] = 'base'
    item['title'] = title
    item['show_product_id'] = show_product_id
    item['brand'] = 'Rebecca Minkoff'
    # First two tab blocks concatenated; an empty result yields '' instead
    # of raising IndexError as the original did.
    item['desc'] = ''.join(desc_tmp[:2])

    if sel.xpath(
            '//div[contains(@class, "price-box")]//span[contains(@class, "regular-price")]'
    ):
        item['list_price'] = sel.xpath(
            '//span[contains(@class, "regular-price")]//span/text()'
        ).extract()[0]
        item['current_price'] = item['list_price']
    else:
        item['list_price'] = sel.xpath(
            '//p[contains(@class, "old-price")]//span[2]/text()').extract()[0]
        item['current_price'] = sel.xpath(
            '//p[contains(@class, "special-price")]//span[2]/text()'
        ).extract()[0]

    if not sel.xpath('//div[contains(@class, "product-options")]'):
        # ---- simple product: one colour, one size ------------------------
        images = []
        for img in sel.xpath(
                '//ul[contains(@class, "product-image-thumbs")]//li'):
            imageItem = ImageItem()
            img_tmp = img.xpath('.//a//img/@src').extract()[0]
            imageItem['image'] = img_tmp
            imageItem['thumbnail'] = img_tmp.replace('/thumbnail/',
                                                     '/thumbnail/60x90/')
            images.append(imageItem)

        color = Color()
        color['type'] = 'color'
        color['show_product_id'] = show_product_id
        color['from_site'] = 'rebeccaminkoff'
        color['cover'] = images[0]['thumbnail']
        color['images'] = images
        color['name'] = 'onecolor'
        yield color

        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['show_product_id'] = show_product_id
        skuItem['from_site'] = 'rebeccaminkoff'
        skuItem['id'] = show_product_id
        skuItem['list_price'] = item['list_price']
        skuItem['current_price'] = item['current_price']
        skuItem['size'] = 'onesize'
        skuItem['color'] = "onecolor"
        skuItem['is_outof_stock'] = False

        item['skus'] = [skuItem]
        item['sizes'] = ['onesize']
        item['colors'] = ['onecolor']
        yield item
        return

    # ---- configurable product: data comes from the spConfig script -------
    jsStr = "".join(
        re.findall(
            r'<script type="text/javascript">[\s]*(var spConfig.*;)[\s]*</script>[\s]*<script type="text/javascript">[\s]*\/\/',
            response.body, re.S))
    strInfo = "".join(re.findall(r'({.*})', jsStr, re.S))
    strJson = json.loads(strInfo)

    # <select> ids look like "attribute<ID>"; the first one is treated as
    # the colour attribute and the second, if any, as the size attribute.
    attributeID = sel.xpath(
        '//dd//div[contains(@class, "input-box")]//select/@id').extract()
    if len(attributeID) == 0:
        return
    colorID = attributeID[0].replace("attribute", "")
    if colorID not in strJson['attributes']:
        return

    col_name = {}  # simple-product id -> colour label
    colors = []
    for col in strJson['attributes'][colorID]['options']:
        name = col['label']
        for productID in col['products']:
            col_name[productID] = name

        images = []
        first_thumb = ''
        for img in strJson['swatchImages'][col['products'][0]][
                'galleryImages']:
            imageItem = ImageItem()
            imageItem['image'] = img['url']
            imageItem['thumbnail'] = img['thumb']
            images.append(imageItem)
            if len(first_thumb) == 0:
                first_thumb = img['thumb']

        color = Color()
        # Cover preference: swatch image, then swatch hex colour, then the
        # first gallery thumbnail.
        if col['swatch']['img']:
            color['cover'] = col['swatch']['img']
        elif col['swatch']['hex']:
            color['cover_style'] = '#' + col['swatch']['hex']
        else:
            color['cover'] = first_thumb
        color['type'] = 'color'
        color['show_product_id'] = show_product_id
        color['from_site'] = 'rebeccaminkoff'
        color['images'] = images
        color['name'] = name
        # De-duplicated on insert so the order follows the page instead of
        # the arbitrary order of list(set(...)).
        if name not in colors:
            colors.append(name)
        yield color

    skus = []
    sizes = []
    if len(attributeID) > 1:
        sizeID = attributeID[1].replace("attribute", "")
        for skuCol in strJson['attributes'][sizeID]['options']:
            for sku_tmp in skuCol['products']:
                # Products that belong to no parsed colour are skipped.
                # (A leftover debug print of col_name used to run here for
                # every product; it has been removed.)
                if sku_tmp not in col_name:
                    continue
                skuItem = SkuItem()
                skuItem['type'] = 'sku'
                skuItem['show_product_id'] = show_product_id
                skuItem['from_site'] = "rebeccaminkoff"
                skuItem['id'] = sku_tmp
                skuItem['list_price'] = strJson['oldPrice']
                skuItem['current_price'] = strJson['basePrice']
                skuItem['size'] = skuCol['label']
                skuItem['color'] = col_name[sku_tmp]
                skuItem['is_outof_stock'] = False
                if skuCol['label'] not in sizes:
                    sizes.append(skuCol['label'])
                skus.append(skuItem)
    else:
        # NOTE(review): this SKU is labelled "onecolor" although real colour
        # names were collected above, so it matches none of the yielded
        # Color items - confirm whether one SKU per colour is wanted.
        sizes = ['onesize']
        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['show_product_id'] = show_product_id
        skuItem['from_site'] = 'rebeccaminkoff'
        skuItem['id'] = show_product_id
        skuItem['list_price'] = item['list_price']
        skuItem['current_price'] = item['current_price']
        skuItem['size'] = 'onesize'
        skuItem['color'] = "onecolor"
        skuItem['is_outof_stock'] = False
        skus.append(skuItem)

    item['skus'] = skus
    item['sizes'] = sizes
    item['colors'] = colors
    yield item
def handle_parse_item(self, response, baseItem):
    """Fill ``baseItem`` from a product page and emit its items.

    When the page lists colour swatches, every swatch is collected (url,
    name, cover) and a ``Request`` for the first colour page is yielded;
    ``parse_color_item`` then walks the remaining colours.  Without swatches
    a single "one-color"/"one-size" ``Color`` item is yielded, followed by
    ``baseItem`` carrying its single SKU.
    """
    sel = Selector(response)

    promo_rows = sel.xpath('//table[@id="TblProdForkPromo"]/tr').extract()
    if promo_rows:
        baseItem['desc'] = '<table>' + promo_rows[0] + '</table>'
    else:
        baseItem['desc'] = ''
    baseItem['dimensions'] = ['size', 'color']
    baseItem['sizes'] = ['one-size']

    swatches = sel.xpath('//dl[@id="color"]//li')
    if len(swatches) > 0:
        # Several colours: gather them all and start the per-colour chain.
        color_urls = []
        for swatch in swatches:
            entry = {}
            entry['url'] = self.base_url + swatch.xpath('./a/@href').extract()[0]
            entry['color_name'] = swatch.xpath(
                './a/div[@class="distinctionName"]/text()').extract()[0]
            entry['color_cover'] = swatch.xpath(
                './a/div/img/@src').extract()[0]
            color_urls.append(entry)
        baseItem['colors'] = [entry['color_name'] for entry in color_urls]
        yield Request(color_urls[0]['url'],
                      callback=self.parse_color_item,
                      meta={'baseItem': baseItem,
                            'color_urls': color_urls,
                            'index': 0,
                            'skus': []})
        return

    # Single-colour product: everything is available on this page.
    baseItem['colors'] = ['one-color']

    sku = SkuItem()
    sku['type'] = 'sku'
    sku['show_product_id'] = baseItem['show_product_id']
    sku['from_site'] = self.name
    sku['current_price'] = sel.xpath(
        '//div[@class="prodprice saleprice"]/p/span[@itemprop="price"]/text()'
    ).extract()[0]
    base_prices = sel.xpath(
        '//div[@class="prodprice saleprice"]/p/span[@class="basePrice"]/text()'
    ).extract()
    # Without a separate base price the list price equals the current one.
    sku['list_price'] = base_prices[0] if base_prices else sku['current_price']
    sku['is_outof_stock'] = False
    sku['color'] = 'one-color'
    sku['size'] = 'one-size'
    sku['id'] = baseItem['show_product_id']

    # The og:image URL is resized through its wid/hei query parameters.
    og_image = sel.xpath('//meta[@property="og:image"]/@content').extract()[0]
    picture = ImageItem()
    picture['image'] = re.sub(r'wid=\d+&hei=\d+', 'wid=1000&hei=1000', og_image)
    picture['thumbnail'] = re.sub(r'wid=\d+&hei=\d+', 'wid=50&hei=50', og_image)

    single_color = Color()
    single_color['type'] = 'color'
    single_color['show_product_id'] = baseItem['show_product_id']
    single_color['from_site'] = self.name
    single_color['images'] = [picture]
    single_color['name'] = 'one-color'
    single_color['cover'] = picture['thumbnail']
    yield single_color

    baseItem['skus'] = [sku]
    yield baseItem
def handle_parse_item(self, response, item):
    """Fill ``item`` from a product page and emit its colour and base items.

    Nothing is yielded when the page has a wait-list input, more than one
    add-to-cart button, a URL containing "preowned", or a colour swatch
    lacking either its cover or its name.  Otherwise the single ``Color``
    item is yielded, followed by ``item`` with one SKU per size box (or one
    "One Size" SKU when there are none).
    """
    sel = Selector(response)
    if len(sel.xpath('//input[@id="waitlistSubmit"]').extract()) > 0:
        return
    if len(sel.xpath("//button[@id='add-to-cart']")) > 1:
        return
    if 'preowned' in response.url:
        return

    size_chart_url = sel.xpath(
        '//div[@class="popover-content"]/img/@src').extract()
    if len(size_chart_url) > 0:
        item['size_info'] = {'size_chart_url': size_chart_url[0]}

    # ---- colour ----------------------------------------------------------
    colorItem = Color()
    colorItem['from_site'] = self.name
    colorItem['show_product_id'] = item['show_product_id']
    colorItem['type'] = 'color'

    color_li = sel.xpath('//ul[@class="product-color-list"]/li[1]')
    if len(color_li) > 0:
        color_name_text = color_li.xpath('./a/@data-color').extract()
        # The cover is either an <img> or a CSS background url(...).
        cover_text = color_li.xpath('./a/img/@src').extract()
        if len(cover_text) == 0:
            cover_text = color_li.xpath(
                './a/div[@class="center-cropped"]/@style').re(
                    r"url\('(.+)'\)")
        if len(cover_text) == 0 or len(color_name_text) == 0:
            return
        color_name = color_name_text[0]
        colorItem['cover'] = 'http:' + cover_text[0]
        colorItem['name'] = color_name
    else:
        # No swatch list: use the option-value label, else a placeholder.
        # (The original re-checked this list for emptiness inside a branch
        # that was only entered when it was non-empty.)
        option_values = sel.xpath(
            '//span[@class="mz-productoptions-optionvalue"]/text()'
        ).extract()
        if len(option_values) > 0:
            color_name = option_values[0]
        else:
            color_name = 'one_color'
        colorItem['name'] = color_name

    colorImages = []
    for color_image in sel.xpath('//div[@id="productimages"]/img'):
        if len(color_image.xpath('./@src')) > 0:
            color_image_thumb = 'http:' + color_image.xpath(
                './@src').extract()[0]
        else:
            color_image_thumb = 'http:' + color_image.xpath(
                './@data-src').extract()[0]
        if len(color_image.xpath('./@data-zoom')) > 0:
            color_image_url = 'http:' + color_image.xpath(
                './@data-zoom').extract()[0]
        else:
            # NOTE(review): '537' is presumably the pixel size embedded in
            # the image URL - confirm.
            color_image_url = color_image_thumb.replace('537', '2160')
        if 'cover' not in colorItem.keys():
            colorItem['cover'] = color_image_thumb.replace('537', '40')
        colorImages.append({
            'thumbnail': color_image_thumb,
            'image': color_image_url
        })
    colorItem['images'] = colorImages
    yield colorItem

    item['colors'] = [color_name]
    item['dimensions'] = ['size']

    # ---- prices (page-level, parsed once instead of once per size) ------
    price_nodes = sel.xpath('//div[@class="mz-price"]/text()')
    if len(price_nodes) == 0:
        price_nodes = sel.xpath('//div[@class="mz-price is-saleprice"]/text()')
    if len(price_nodes) > 0:
        if '-' in price_nodes.extract()[0]:
            # Price range: keep the amount after the dash.  The original
            # applied this only when size boxes existed; the single-SKU
            # path now treats ranges the same way.
            current_price = price_nodes.re(r'-\s*\$(\S+)')[0]
        else:
            current_price = price_nodes.re(r'(\S+)')[0]
    else:
        # NOTE(review): as in the original, a page without any price node
        # leaves an empty selector list here - confirm how the pipeline
        # should treat such products.
        current_price = price_nodes

    list_price_nodes = sel.xpath(
        '//div[@class="mz-price is-crossedout"]/text()')
    if len(list_price_nodes) > 0:
        if re.findall('Retail', list_price_nodes.extract()[0]):
            list_price = list_price_nodes.re(r'[\d\.]+')[0]
        else:
            list_price = list_price_nodes.re(r'(\S+)')[0]
    else:
        list_price = current_price

    def build_sku(sku_id, size):
        # SKUs differ only by id and size; colour and prices are page-wide.
        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['from_site'] = self.name
        skuItem['show_product_id'] = item['show_product_id']
        skuItem['id'] = sku_id
        skuItem["list_price"] = list_price
        skuItem['current_price'] = current_price
        skuItem['color'] = color_name
        skuItem['size'] = size
        skuItem['is_outof_stock'] = False
        return skuItem

    # ---- SKUs ------------------------------------------------------------
    sku_spans = sel.xpath(
        '//span[@class="mz-productoptions-sizebox "] | //span[@class="mz-productoptions-sizebox selected-box"]'
    )
    skus = []
    if len(sku_spans) > 0:
        for sku_span in sku_spans:
            sku_id = item['show_product_id'] + '-' + sku_span.xpath(
                './@data-value').extract()[0]
            size = sku_span.xpath('text()').extract()[0].strip()
            skus.append(build_sku(sku_id, size))
    else:
        skus.append(build_sku(item['show_product_id'], 'One Size'))
    item['sizes'] = [sku['size'] for sku in skus]
    item['skus'] = skus

    # ---- description -----------------------------------------------------
    desc_div = sel.xpath(
        '//div[@class="mz-productdetail-description"]/text()').extract()
    desc_lis = sel.xpath(
        '//ul[@class="mz-productdetail-props"]/li').extract()
    if len(desc_div) > 0:
        item['desc'] = desc_div[0]
    else:
        item['desc'] = ''
    if len(desc_lis) > 0:
        item['desc'] += ''.join(desc_lis)
    yield item
def handle_parse_item(self, response, item):
    """Fill ``item`` from a product page and emit its colour and base items.

    Returns without yielding when the page body contains no ``product_id``
    entry or when the in-stock message is missing.  Otherwise yields one
    ``Color`` item per selected colour swatch (or a single "one_color" item
    when the page has no swatch) and finally ``item`` with one SKU per
    colour/size combination.
    """
    skus = []
    sel = Selector(response)
    item['from_site'] = self.name
    if 'whoops' in response.url:
        logging.warning('anti scraping: ' + response.url)

    match = re.search(r'"product_id":\s*\[\s*"([^"]+)"\s*\]', response.body)
    if match is None:
        return
    temp_show_product_id = match.group(1)

    current_price = sel.xpath(
        '//span[contains(@class, "price-sales")]/text()').extract()
    if len(current_price) > 0:
        current_price = current_price[0]
    list_price = sel.xpath(
        '//span[contains(@class, "price-standard")]/text()').re(r'(\S+)')
    if len(list_price) > 0:
        list_price = list_price[0]
    else:
        list_price = current_price

    item['brand'] = self.name
    desc1 = sel.xpath(
        '//div[@class="categorylisting detail"]/div/div').extract()[0]
    desc2 = sel.xpath(
        '//div[@class="categorylisting fabric"]/div/div').extract()[0]
    item['desc'] = re.sub(r'[\t\n]', '', desc1 + desc2)
    item['desc'] = re.sub('<img.+?>', '', item['desc'])

    if not sel.xpath(
            '//div[contains(@class, "quantity clearfix")]//p[contains(@class, "in-stock-msg")]/text()'
    ):
        return

    item_sizes = sel.xpath(
        '//div[@id="product-content"]//div[contains(@class, "value")]//ul[contains(@class, "swatches size")]/li[@class!="emptyswatch unselectable"]//@title'
    ).extract()
    item['sizes'] = item_sizes
    item['dimensions'] = ['size']
    item['product_type'] = 'mother-baby'

    item_colors_links = sel.xpath(
        '//div[@id="product-content"]//ul[contains(@class, "swatches color")]//li[contains(@class,"selected")]/a'
    )
    # The original replaced an empty link list with ['one_color'] and then
    # tested len(...) > 0, which was therefore always true: .xpath() was
    # called on the placeholder string (AttributeError) and the one-colour
    # fallback could never run.  A None placeholder keeps the two cases
    # apart.
    if len(item_colors_links) == 0:
        color_links = [None]
    else:
        color_links = item_colors_links

    colors = []
    for item_color_link in color_links:
        # Gallery: every non-video thumbnail, or the primary image when the
        # page has none (the original's two fallback branches were
        # identical and are merged).
        images = []
        thumbnails = sel.xpath(
            '//div[@id="thumbnails"]//li[@class!="thumb pdpvideo"]')
        if thumbnails:
            image_urls = [
                li.xpath('./a/img/@src').extract()[0] for li in thumbnails
            ]
        else:
            image_urls = [
                sel.xpath('//img[@class="primary-image"]/@src').extract()[0]
            ]
        for image_url in image_urls:
            imageItem = ImageItem()
            imageItem['image'] = self.handle_image_url(
                image_url.encode('utf-8'), 1000, 1000)
            imageItem['thumbnail'] = self.handle_image_url(
                image_url.encode('utf-8'), 350, 350)
            images.append(imageItem)

        if item_color_link is not None:
            color_name = item_color_link.xpath('./@title').extract()[0]
            color_cover = item_color_link.xpath('./@style').re(
                r'http://[^\)]+')[0]
        else:
            color_name = 'one_color'
            color_cover = images[0]['thumbnail']
        colors.append(color_name)

        # The product id is made colour-specific.
        show_product_id = temp_show_product_id + "*" + color_name
        item['show_product_id'] = show_product_id

        color = Color()
        color['type'] = 'color'
        color['from_site'] = self.name
        color['show_product_id'] = show_product_id
        color['images'] = images
        color['name'] = color_name
        color['cover'] = color_cover
        yield color

        for item_size in item_sizes:
            skuItem = SkuItem()
            skuItem['type'] = 'sku'
            skuItem['show_product_id'] = show_product_id
            skuItem['id'] = show_product_id + "*" + item_size
            skuItem['current_price'] = current_price
            skuItem['list_price'] = list_price
            skuItem['color'] = color_name
            skuItem['size'] = item_size
            skuItem['from_site'] = self.name
            skuItem['is_outof_stock'] = False
            skuItem['quantity'] = sel.xpath(
                '//select[contains(@id, "Quantity")]//@value').extract()[0]
            skus.append(skuItem)

    item['colors'] = colors
    item['skus'] = skus
    yield item
def handle_parse_item(self, response, item):
    """Fill ``item`` from a linkhaitao goods-detail JSON response.

    Returns without yielding when the goods are out of stock or when no
    size information could be derived from the SKU data.  Otherwise yields
    one ``Color`` item per style and finally ``item``.

    Changes against the previous implementation:

    * the product id taken from the URL stops at the next ``&`` instead of
      swallowing every following query parameter;
    * ``is_outof_stock`` is set on every SKU (it used to be set only for
      out-of-stock ones);
    * the collected dimension values are always lists without duplicates
      (a missing size used to store the plain string 'One Size', which broke
      a later ``.append``);
    * a style without images falls back to the product cover.
    """
    goods_detail = json.loads(response.body)['data']
    if goods_detail['inStock'] == 0:
        return

    item['linkhaitao_url'] = response.url
    item['cover'] = goods_detail['coverImgUrl']
    item['desc'] = goods_detail['content']['description']

    # Environment variables override the values already set on the spider.
    for env_name in ('product_type_id', 'category_id', 'editor_flag',
                     'gender'):
        if env_name in os.environ:
            setattr(self, env_name, os.environ[env_name])
    if self.product_type_id:
        item['product_type_id'] = int(self.product_type_id)
        item['product_type'] = 'linkhaitao_' + str(self.product_type_id)
    if self.category_id:
        item['category_id'] = int(self.category_id)
        item['category'] = 'linkhaitao_' + str(self.category_id)
    if self.editor_flag:
        item['editor_flag'] = self.editor_flag
    if self.gender:
        item['gender'] = self.gender

    item['dimensions'] = ['size', 'color']
    item['brand'] = goods_detail['brand']['name_en']
    item['title'] = goods_detail['name']
    item['current_price'] = goods_detail['realPriceOrg']
    item['list_price'] = goods_detail['mallPriceOrg']

    # Seller name: whitespace removed, lower-cased, apostrophes dropped and
    # cut at the first '/'.
    # NOTE(review): the original re-derived this value from the same
    # 'namecn' field when self.is_chinese_word() matched, which changed
    # nothing; presumably another name field was intended - confirm against
    # the API payload.
    from_site = ''.join(goods_detail['sellerName']['namecn'].split()).lower()
    from_site = from_site.replace("'", "").split('/')[0]
    if from_site == '6pm':
        from_site = 'sixpm'
    item['from_site'] = from_site

    # [^&]+ stops at the next parameter; the former (.+)&? was greedy and
    # captured the rest of the URL.
    spu_match = re.search(r'spuid=([^&]+)', response.url)
    if spu_match is None:
        spu_match = re.search(r'&spu=([^&]+)', response.url)
    item['show_product_id'] = spu_match.group(1)
    item['url'] = goods_detail['pageUrl']

    def build_images(urls):
        # The API provides a single URL per picture; it is used for both
        # the full image and the thumbnail.
        built = []
        for url in urls:
            imageItem = ImageItem()
            imageItem['image'] = url
            imageItem['thumbnail'] = url
            built.append(imageItem.copy())
        return built

    skus = []
    if not goods_detail['skuInfo']:
        # ---- no SKU breakdown: one colour, one size ---------------------
        color_name = 'One Color'
        color_names = [color_name]

        colorItem = Color()
        colorItem['images'] = build_images(goods_detail['coverImgList'])
        colorItem['type'] = 'color'
        colorItem['from_site'] = item['from_site']
        colorItem['show_product_id'] = item['show_product_id']
        colorItem['name'] = color_name
        colorItem['cover'] = goods_detail['coverImgUrl']
        yield colorItem

        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['show_product_id'] = item['show_product_id']
        skuItem['list_price'] = item['list_price']
        skuItem['current_price'] = item['current_price']
        skuItem['color'] = color_name
        skuItem['id'] = item['show_product_id'] + 'onecolor'
        skuItem['from_site'] = item['from_site']
        # Out-of-stock goods returned at the top of the method.
        skuItem['is_outof_stock'] = False
        skuItem['size'] = 'One Size'
        skus.append(skuItem)
        item['sizes'] = ['One Size']
    else:
        # ---- one colour per style, one SKU per entry of its data --------
        color_names = []
        sizes = []
        dimensions_values = {}  # dimension name -> list of distinct values

        def remember_value(dimension, value):
            values = dimensions_values.setdefault(dimension, [])
            if value not in values:
                values.append(value)

        for sku_info in goods_detail['skuInfo']['style']['skustylelist']:
            images = build_images(sku_info['coverImgList'])
            if sku_info['style']:
                color_name = sku_info['style']
            else:
                color_name = 'One Color'
            if color_name not in color_names:
                color_names.append(color_name)

            colorItem = Color()
            colorItem['images'] = images
            colorItem['type'] = 'color'
            colorItem['from_site'] = item['from_site']
            colorItem['show_product_id'] = item['show_product_id']
            colorItem['name'] = color_name
            if images:
                colorItem['cover'] = images[0]['image']
            else:
                colorItem['cover'] = goods_detail['coverImgUrl']
            yield colorItem

            for sku in sku_info['data']:
                skuItem = SkuItem()
                skuItem['type'] = 'sku'
                skuItem['show_product_id'] = item['show_product_id']
                skuItem['list_price'] = sku['mallPriceOrg']
                skuItem['current_price'] = sku['realPriceOrg']
                skuItem['color'] = color_name
                skuItem['id'] = sku['skuid']
                skuItem['from_site'] = item['from_site']
                skuItem['is_outof_stock'] = sku['inStock'] == 0

                if len(sku['attr']) == 0:
                    skuItem['size'] = 'One Size'
                    if skuItem['size'] not in sizes:
                        sizes.append(skuItem['size'])
                else:
                    # With attributes the "size" is a dict holding every
                    # attribute (lower-cased name -> value).
                    sku_dimensions = {}
                    for attr in sku['attr']:
                        dimension = attr['attrName'].lower()
                        sku_dimensions[dimension] = attr['attrValue']
                        if dimension not in item['dimensions']:
                            item['dimensions'].append(dimension)
                        remember_value(dimension, attr['attrValue'])
                    if 'size' not in sku_dimensions:
                        sku_dimensions['size'] = 'One Size'
                        remember_value('size', 'One Size')
                    skuItem['size'] = sku_dimensions
                skus.append(skuItem)

        if sizes:
            item['sizes'] = sizes
        elif dimensions_values:
            item['sizes'] = dimensions_values
        else:
            return

    item['skus'] = skus
    item['colors'] = color_names
    yield item
def handle_parse_item(self, response, item):
    """Fill ``item`` from a macys product page and emit its items.

    First case: the page embeds a ``productMainData`` JSON script.  Every
    colourway becomes a ``Color`` item, every UPC entry a ``SkuItem``, and
    ``item`` is yielded directly or handed to ``parse_size_chart`` through a
    ``Request`` when a size-chart canvas id is present.

    Otherwise one ``Request`` to ``parse_item`` is yielded per
    ``memberProducts`` block found on the page.
    """
    sel = Selector(response)
    main_data = sel.xpath('//script[@id="productMainData"]/text()').extract()

    if len(main_data) == 0:
        # No product JSON: follow the member products instead.
        member_nodes = sel.xpath('//div[contains(@class, "memberProducts")]')
        for member_node in member_nodes:
            url = member_node.xpath('./@data-pdp-url').extract()[0]
            url = self.base_url + url
            yield Request(url=url,
                          meta={'item': item},
                          callback=self.parse_item)
        return

    product_json = json.loads(main_data[0])
    show_product_id = product_json['id']
    h = HTMLParser()
    # The title is unescaped twice.
    title = h.unescape(h.unescape(product_json['title']))
    cover = product_json['imageUrl']
    desc = sel.xpath(
        '//section[contains(@class, "product-details-content")]'
    ).extract()[0]
    brand = product_json['brandName']

    # ---- colour handling -------------------------------------------------
    swatch_covers = product_json['colorSwatchMap']
    image_data = product_json['images']
    primary_images = image_data['colorwayPrimaryImages']

    def image_entry(path):
        # Thumbnail and full image share a URL; the latter asks for a
        # 1000 px wide rendition.
        full_url = self.image_prefix + path
        return {'thumbnail': full_url, 'image': full_url + '?wid=1000'}

    colors = []
    color_items = {}
    has_extra_images = {}  # colour name -> got any additional picture?
    for color_name in primary_images:
        if color_name in swatch_covers.keys():
            color_cover = self.image_prefix + swatch_covers[color_name]
        else:
            color_cover = self.image_prefix + primary_images[color_name]

        colorItem = Color()
        colorItem['type'] = 'color'
        colorItem['from_site'] = 'macys'
        colorItem['show_product_id'] = show_product_id
        colorItem['name'] = color_name
        colorItem['cover'] = color_cover

        # Primary picture of the colour (always present: we are iterating
        # the primary-image map itself).
        images = [image_entry(primary_images[color_name])]

        # Additional pictures specific to this colour.
        per_color_extras = image_data['colorwayAdditionalImages']
        color_additional_handled = color_name in per_color_extras.keys()
        if color_additional_handled:
            for path in per_color_extras[color_name].split(','):
                images.append(image_entry(path))

        # Additional pictures shared by all colours.
        shared_extras = image_data['additionalImages']
        additional_handled = len(shared_extras) > 0
        if additional_handled:
            for path in shared_extras:
                images.append(image_entry(path))

        colorItem['images'] = images
        color_items[color_name] = colorItem
        has_extra_images[color_name] = (color_additional_handled
                                        or additional_handled)
        colors.append(color_name)

    # Colours that received no additional picture borrow those of the
    # selected colour.
    for color_name in has_extra_images:
        if has_extra_images[color_name]:
            continue
        selected_color_name = product_json['selectedColor']
        per_color_extras = image_data['colorwayAdditionalImages']
        if selected_color_name in per_color_extras.keys():
            for path in per_color_extras[selected_color_name].split(','):
                color_items[color_name]['images'].append(image_entry(path))

    for color_item_name in color_items:
        yield color_items[color_item_name]

    # ---- base item -------------------------------------------------------
    item['title'] = title
    item['brand'] = brand
    item['cover'] = cover
    item['desc'] = desc
    item['colors'] = colors
    item['show_product_id'] = show_product_id
    item['sizes'] = {'size': product_json['sizesList']}
    if item['sizes']['size'] == []:
        item['sizes'] = ['One Size']
    item['dimensions'] = ['size']

    # ---- per-colour prices -----------------------------------------------
    color_price_map = {}
    pricing_swatches = product_json['colorwayPricingSwatches']
    for swatch_key in pricing_swatches:
        price_map = pricing_swatches[swatch_key]
        for color_name in price_map:
            color_price = price_map[color_name]
            if color_price['onSale'] == False:
                current_price = color_price['tieredPrice'][0]['value'][0]
                list_price = current_price
            else:
                # On sale: first tier is the list price, last tier the
                # current one.
                current_price = color_price['tieredPrice'][-1]['value'][0]
                list_price = color_price['tieredPrice'][0]['value'][0]
            color_price_map[color_name] = {
                'current_price': current_price,
                'list_price': list_price
            }

    # ---- SKUs --------------------------------------------------------------
    sku_items = []
    for sku_stock in product_json['upcMap'][show_product_id]:
        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['from_site'] = 'macys'
        skuItem['show_product_id'] = item['show_product_id']
        skuItem['id'] = sku_stock['upcID']

        color_prices = color_price_map.get(sku_stock['color'])
        if color_prices is None:
            # No colour-specific price: use the product-level prices.
            skuItem["list_price"] = product_json['regularPrice']
            skuItem['current_price'] = product_json['salePrice']
            if len(skuItem['current_price']) == 0:
                skuItem['current_price'] = skuItem["list_price"]
        else:
            skuItem["list_price"] = color_prices['list_price']
            skuItem['current_price'] = color_prices['current_price']

        skuItem['color'] = sku_stock['color']
        if not item['colors']:
            item['colors'] = [skuItem['color']]
        skuItem['size'] = sku_stock['size']
        if not skuItem['size']:
            skuItem['size'] = 'One Size'
        skuItem['is_outof_stock'] = sku_stock['isAvailable'] != "true"
        sku_items.append(skuItem)
    item['skus'] = sku_items

    # ---- size chart --------------------------------------------------------
    size_chart = product_json['sizeChartMap'][show_product_id]
    if size_chart['sizeChartCanvasId']:
        size_chart_url = (
            self.base_url +
            '/shop/catalog/product/canvassizechart/json?canvasId=' +
            size_chart['sizeChartCanvasId'])
        yield Request(url=size_chart_url,
                      meta={'item': item},
                      callback=self.parse_size_chart)
    elif size_chart['sizeChart']:
        item['size_info'] = self.size_chart_pic_url + size_chart['sizeChart']
        yield item
    elif size_chart['intlSizeChart']:
        item['size_info'] = (self.size_chart_pic_url +
                             size_chart['intlSizeChart'])
        yield item
    else:
        item['size_info'] = ''
        yield item
def handle_parse_item(self, response, item):
    """Parse a Ralph Lauren product page.

    Yields one ``Color`` item per colour swatch found on the page and,
    last, the base item (``item`` itself, filled in place) carrying the
    SKU list, sizes and colours.

    :param response: response of the product detail page.
    :param item: partially filled base item; ``cover``, ``product_type``,
        ``category`` and ``gender`` are read from it.

    Nothing is yielded when the page has no product title, shows a
    ``prodError`` element, or exposes no recognisable price.
    """
    sel = Selector(response)

    title_xpath = '//h1[contains(@class, "prod-title")]'
    if not sel.xpath(title_xpath):
        # Single-argument call form prints the same text on Python 2 and 3.
        print(get_base_url(response))
        return
    if sel.xpath('//font[@class="prodError"]'):
        return

    show_product_id = sel.xpath(
        '//span[contains(@class, "style-num")]/text()').extract()[0]

    # ``baseItem`` is the very same object as ``item``; the self-assignments
    # below only make sure the expected fields were set by the caller.
    baseItem = item
    baseItem['type'] = 'base'
    baseItem['url'] = get_base_url(response)
    baseItem['title'] = sel.xpath(title_xpath + '/text()').extract()[0]
    baseItem['cover'] = item['cover']

    # Current price: sale price first, then regular price, then the high
    # end of a price range.
    sale_xpath = ('//span[contains(@itemprop, "offers")]'
                  '//span[contains(@class, "sale-price")]')
    reg_price_xpath = ('//span[contains(@class, "reg-price")]'
                       '//span[contains(@itemprop, "price")]')
    reg_high_xpath = ('//span[contains(@class, "reg-price")]'
                      '//span[contains(@itemprop, "highPrice")]')
    if sel.xpath(sale_xpath):
        if sel.xpath(sale_xpath + '//span[2]'):
            price_xpath = sale_xpath + '//span[2]/text()'
        else:
            price_xpath = sale_xpath + '//span/text()'
    elif sel.xpath(reg_price_xpath):
        price_xpath = reg_price_xpath + '/text()'
    elif sel.xpath(reg_high_xpath):
        price_xpath = reg_high_xpath + '/text()'
    else:
        return
    baseItem['current_price'] = sel.xpath(price_xpath).extract()[0]

    # List price: the struck-through regular price when on sale; for a
    # "low-high" range the part after the dash is used.
    was_xpath = '//span[contains(@class, "reg-price is-sale")]'
    if sel.xpath(was_xpath):
        list_pri = sel.xpath(was_xpath + '/text()').extract()[0]
        if '-' in list_pri:
            baseItem['list_price'] = list_pri.split("-")[1]
        else:
            baseItem['list_price'] = list_pri
    else:
        baseItem['list_price'] = baseItem['current_price']

    baseItem['show_product_id'] = show_product_id
    baseItem['desc'] = sel.xpath(
        '//div[contains(@class, "detail")]//ul').extract()[0]
    baseItem['brand'] = 'Ralph Lauren'
    baseItem['from_site'] = 'ralphlauren'
    baseItem['product_type'] = item['product_type']
    baseItem['category'] = item['category']
    baseItem['gender'] = item['gender']

    sizechart_xpath = '//a[contains(@id, "sizechart")]'
    if sel.xpath(sizechart_xpath):
        baseItem['size_info'] = '%s%s' % (
            self.base_url,
            sel.xpath(sizechart_xpath + '/@href').extract()[0])
    else:
        baseItem['size_info'] = ''

    # ---- images -----------------------------------------------------
    # Take the inline <script> preceding the "prod-utility" div and replace
    # its leading "var isTablet...;" statement with declarations of the two
    # arrays that the rest of the script fills.
    jsStr2 = "".join(re.findall(
        r'<script>[\s]*(var isTablet.*;)[\s]*</script>[\s]*<div class="prod-utility">',
        response.body, re.S))
    imgStr = re.compile('var isTablet.*;').sub(
        'var altImages = new Array();var Scene7Map = new Array();', jsStr2)
    context2 = execjs.compile('''
        %s
        function getImages(){
            return orderedImageMap_0;
        }
        function getImages2(){
            var imageArr = new Array()
            for (i in altImages){
                for (j in altImages[i]) {
                    altImages[i][j]["cId"] = i
                }
                imageArr.push(altImages[i])
            }
            return imageArr;
        }
        function getScene7Map(){
            var Scene7Maps = new Array();
            var cIds = new Array();
            for (i in Scene7Map){
                if (i.toString().indexOf("c") != -1 ){
                    cId = i.toString().substr(1, i.length-1)
                    cIds.push(cId)
                }
            }
            for (ii in Scene7Map){
                for (jj in cIds) {
                    s7Index = "s7" + cIds[jj]
                    if (ii == s7Index) {
                        Scene7Maps.push({ "cId":cIds[jj], "cValue":Scene7Map[ii]})
                    }
                }
            }
            return Scene7Maps
        }
        ''' % imgStr.decode('cp1252').encode('utf-8'))
    main_images = context2.call('getImages')
    alt_image_groups = context2.call('getImages2')
    scene7_entries = context2.call('getScene7Map')

    # Picture list per colour id. Built as a dict so each swatch below is a
    # single lookup; a later entry with the same cId overrides an earlier
    # one, exactly as the former linear scan did.
    pics_by_color = {}
    for main_img in main_images:
        pics = []
        # Replace the main picture with the Scene7 rendition when one
        # exists for this colour.
        for scene7 in scene7_entries:
            if scene7['cId'] == main_img['cId']:
                main_img['v400'] = 'http://s7d2.scene7.com/is/image/PoloGSI/%s?$flyout_main$&cropN=0.12,0,0.7993,1&iv=fLNd_3&wid=1410&hei=1770&fit=fit,1' % (scene7['cValue'])
        pics.append({"image": main_img['v400'],
                     "thumbnail": main_img['x50']})
        # Product video, when the main image entry carries one.
        if 'vid' in main_img:
            item['media_url'] = 'http://s7d2.scene7.com/is/content/PoloGSI/' + main_img['vid']
        for alt_group in alt_image_groups:
            for alt_img in alt_group:
                if main_img['cId'] != alt_img['cId']:
                    continue
                # Derive whichever of the two renditions is missing from
                # the one that is present.
                if alt_img['t940'] == '' and alt_img['x50'] != '':
                    alt_img['t940'] = alt_img['x50'].replace('_t50', '_t940')
                elif alt_img['t940'] != '' and alt_img['x50'] == '':
                    alt_img['x50'] = alt_img['t940'].replace('_t940', '_t50')
                pics.append({"image": alt_img['t940'],
                             "thumbnail": alt_img['x50']})
        pics_by_color[main_img['cId']] = pics

    for swatch in sel.xpath('//ul[contains(@id, "color-swatches")]//li'):
        color_id = swatch.xpath('./@data-value').extract()[0]
        cover = swatch.xpath('.//img/@src').extract()[0]
        name = swatch.xpath('.//img/@title').extract()[0]

        color = Color()
        color['type'] = 'color'
        color['show_product_id'] = show_product_id
        color['from_site'] = 'ralphlauren'
        color['cover'] = cover
        color['images'] = pics_by_color.get(color_id, [])
        color['name'] = name
        yield color

    # ---- SKUs -------------------------------------------------------
    jsStr1 = "".join(re.findall(
        r'<script>[\s]*(var itemMap.*;)[\s]*</script>[\s]*<!--previousURL',
        response.body, re.S))
    context1 = execjs.compile('''
        %s
        function getItemMaps(){
            return itemMap;
        }
        ''' % jsStr1.decode('cp1252').encode('utf-8'))

    skus = []
    sizes = []
    colors = []
    for entry in context1.call('getItemMaps'):
        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['show_product_id'] = show_product_id
        skuItem['from_site'] = 'ralphlauren'
        skuItem['id'] = entry['sku']
        skuItem['list_price'] = baseItem['list_price']
        skuItem['current_price'] = entry['price']
        skuItem['size'] = entry['sDesc']
        skuItem['color'] = entry['cDesc']
        skuItem['is_outof_stock'] = entry['avail'] == "OUT_OF_STOCK"
        skuItem['quantity'] = entry['quantityOnHand']
        sizes.append(entry['sDesc'])
        skus.append(skuItem)
        colors.append(entry['cDesc'])

    baseItem['skus'] = skus
    baseItem['sizes'] = list(set(sizes))
    baseItem['colors'] = list(set(colors))
    yield baseItem
def handle_parse_item(self, response, baseItem):
    """Parse a product page of a shop that sells one colour / one size.

    Yields a single ``Color`` item followed by ``baseItem`` (filled in
    place) with exactly one SKU. Nothing is yielded when the page has no
    ``divAvailablity`` element.

    :param response: response of the product detail page.
    :param baseItem: partially filled base item to complete.
    """
    sel = Selector(response)

    # The numeric product id is embedded in inline JS as
    # "dtmProductId = <digits>...".
    product_id_matches = re.findall(r'dtmProductId = [^;]+', response.body)
    if product_id_matches:
        product_id = re.findall(r'\d+', product_id_matches[0])[0]
        baseItem['show_product_id'] = int(product_id)
    # NOTE(review): when no id is found, the ``baseItem['show_product_id']``
    # reads below rely on the caller having set it already - confirm.

    baseItem['type'] = 'base'

    if not sel.xpath('//div[@id="divAvailablity"]'):
        return

    baseItem['title'] = sel.xpath(
        '//div[@id="divCaption"]/h1/text()').extract()[0]

    # The warnings table, when present, takes precedence over the promo
    # table as the description.
    desc_a = sel.xpath(
        '//table[@id="TblProdForkPromo"]//td[@class="contenttd"]').extract()
    desc_b = sel.xpath(
        '//table[@id="TblProdForkWarnings"]//td[@class="contenttd"]').extract()
    if desc_a:
        baseItem['desc'] = desc_a[0]
    if desc_b:
        baseItem['desc'] = desc_b[0]

    baseItem['colors'] = ['onecolor']
    baseItem['sizes'] = ['onesize']
    baseItem['dimensions'] = ['size', 'color']
    baseItem['from_site'] = self.name

    imageItem = ImageItem()
    imageItem['thumbnail'] = sel.xpath(
        '//div[@id="divPImage"]//img/@src').extract()[0]
    # The full-size picture URL is the thumbnail URL with "300.<ext>"
    # replaced by "500.<ext>".
    imageItem['image'] = re.sub(r'300(\.\w+)', '500\\1',
                                imageItem['thumbnail'])
    images = [imageItem]

    colorItem = Color()
    colorItem['type'] = 'color'
    colorItem['from_site'] = self.name
    colorItem['show_product_id'] = baseItem['show_product_id']
    colorItem['images'] = images
    colorItem['cover'] = imageItem['thumbnail']
    colorItem['name'] = 'onecolor'
    yield colorItem

    skuItem = SkuItem()
    skuItem['type'] = 'sku'
    skuItem['show_product_id'] = baseItem['show_product_id']
    skuItem['from_site'] = self.name

    # The list price is the struck-through MSRP when shown, otherwise it
    # equals the current price.
    msrp = sel.xpath('//span[@id="rowMSRP"]')
    if msrp:
        skuItem['list_price'] = msrp.xpath('./s/text()').extract()[0]
        skuItem['current_price'] = sel.xpath(
            '//div[@id="productprice"]/span/text()').extract()[0]
    else:
        skuItem['current_price'] = sel.xpath(
            '//div[@id="productprice"]/span/text()').extract()[0]
        skuItem['list_price'] = skuItem['current_price']
    baseItem['list_price'] = skuItem['list_price']
    baseItem['current_price'] = skuItem['current_price']

    skuItem['is_outof_stock'] = False
    skuItem['color'] = 'onecolor'
    skuItem['id'] = baseItem['show_product_id']
    skuItem['size'] = 'onesize'

    baseItem['skus'] = [skuItem]
    yield baseItem
def handle_parse_item(self, response, item):
    """Parse a ralphlauren.asia product page.

    For every colour swatch: the currently picked colour is yielded
    directly as a ``Color`` item with the images shown on the page; for
    every other colour a ``Request`` to the full-image view is yielded
    (callback ``self.parse_img``) carrying the partially filled ``Color``
    in ``meta['color']``. Finally the base item (``item`` itself, filled
    in place) is yielded.

    :param response: response of the product detail page.
    :param item: partially filled base item to complete.
    """
    sel = Selector(response)
    catalogId = sel.xpath(
        '//input[contains(@id, "catalogId")]/@value').extract()[0]
    storeId = sel.xpath(
        '//input[contains(@id, "storeId")]/@value').extract()[0]
    show_product_id = sel.xpath(
        '//input[contains(@id, "productId")]/@value').extract()[0]

    baseItem = item
    baseItem['from_site'] = self.name
    baseItem['type'] = 'base'
    baseItem['title'] = sel.xpath(
        '//div[contains(@class, "pdd_title box")]//h3/text()').extract()[0]
    baseItem['show_product_id'] = show_product_id
    baseItem['desc'] = sel.xpath(
        '//div[contains(@class, "pdd_desc pdd_sub_item box")]').extract()[0]
    baseItem['list_price'] = sel.xpath(
        '//p[contains(@class, "pdd_price")]//span/text()').extract()[0]
    promo_xpath = '//span[contains(@class, "promo_price")]'
    if sel.xpath(promo_xpath):
        baseItem['current_price'] = sel.xpath(
            promo_xpath + '/text()').extract()[0]
    else:
        baseItem['current_price'] = baseItem['list_price']

    colorNames = []
    sizes = {'size': ['onesize']}

    color_coloum = sel.xpath(
        '//ul[contains(@class, "pdd_colors_list box")]//li')
    images_coloum = sel.xpath(
        '//div[contains(@id, "rl_pdd_cover_slider")]//ul[contains(@class, "box")]//li'
    )
    for colors_col in color_coloum:
        color = Color()
        color['type'] = 'color'
        color['from_site'] = self.name
        color['show_product_id'] = show_product_id
        color['cover'] = colors_col.xpath('./a/img/@src').extract()[0]
        color['name'] = colors_col.xpath('./a/img/@alt').extract()[0]
        colorNames.append(color['name'])

        if colors_col.xpath(
                './a[contains(@class, "pdd_color pdd_color_picked")]//span/@onclick'):
            # Picked colour: its pictures are the slider images already on
            # this page.
            images = []
            for images_col in images_coloum:
                imageItem = ImageItem()
                imageItem['image'] = images_col.xpath(
                    './img/@src').extract()[0]
                imageItem['thumbnail'] = images_col.xpath(
                    './img/@small').extract()[0]
                images.append(imageItem)
            color['images'] = images
            yield color
        else:
            # Other colours: the 6th comma-separated value of the quoted
            # onclick argument list is used as MFPARTNUMBER to request
            # that colour's image view.
            clickParam = colors_col.xpath(
                './a[contains(@class, "pdd_color")]//span/@onclick'
            ).extract()[0]
            clickParams = re.findall(r"'(.*)'", clickParam)[0].split(',')
            MFPARTNUMBER = clickParams[5].replace("'", "")
            imgUrl = '%s/webapp/wcs/stores/servlet/ProductDetailFullImageView?catalogId=%s&langId=-1&storeId=%s&MFPARTNUMBER=%s' % (
                self.base_url, catalogId, storeId, MFPARTNUMBER)
            yield Request(imgUrl.encode('UTF-8'),
                          meta={'color': color},
                          callback=self.parse_img)
            # A disabled draft that posted FormRequests to
            # ProductDetailQuantityView / ProductDetailSizeSelectView used
            # to sit here as a bare string literal; it was never executed.

    # One SKU per <option> of the size select. Every value comes from
    # page-level elements, so it is looked up once - and only when there
    # is at least one option, so a page without options never touches
    # these elements.
    skus = []
    sku_coloum = sel.xpath(
        '//select[contains(@id, "rl_pdd_size")]//option ')
    if sku_coloum:
        sku_id = sel.xpath(
            '//input[contains(@id, "selectSKUId")]/@value').extract()[0]
        sku_color = sel.xpath(
            '//span[contains(@class, "pdd_current_color")]/text()'
        ).extract()[0]
        sku_quantity = sel.xpath(
            '//select[contains(@id, "rl_pdd_qty")]//option/@value'
        ).extract()[0]
        # NOTE(review): all SKUs share the same id/colour/quantity and no
        # 'size' is set (it was already disabled in the original code).
        for _ in sku_coloum:
            skuItem = SkuItem()
            skuItem['type'] = 'sku'
            skuItem['show_product_id'] = show_product_id
            skuItem['from_site'] = self.name
            skuItem['list_price'] = baseItem['list_price']
            skuItem['current_price'] = baseItem['current_price']
            skuItem['is_outof_stock'] = False
            skuItem['id'] = sku_id
            skuItem['color'] = sku_color
            skuItem['quantity'] = sku_quantity
            skus.append(skuItem)

    baseItem['colors'] = colorNames
    baseItem['sizes'] = sizes
    baseItem['skus'] = skus
    baseItem['dimensions'] = ""
    baseItem['brand'] = 'ralph lauren'
    baseItem['category'] = sel.xpath(
        '//div[contains(@class, "bread bread_bar")]//a[4]/text()'
    ).extract()[0]
    baseItem['product_type'] = sel.xpath(
        '//div[contains(@class, "bread bread_bar")]//a[3]/text()'
    ).extract()[0]
    yield baseItem
def handle_parse_item(self, response, item):
    """Parse a product page whose SKU data is embedded as a JS array.

    The SKU array is read from the ``entitledItem_<product id>`` div and
    evaluated with execjs. Yields the ``Color`` items for which a SKU
    supplied images, then ``item`` (filled in place) with dimensions,
    sizes, colours, SKUs and description.

    Nothing is yielded when the product id is missing/blank, or when any
    SKU lacks a thumbnail for a new colour, an offer price or a list
    price.

    :param response: response of the product detail page.
    :param item: partially filled base item to complete.
    """
    sel = Selector(response)

    product_id_div = sel.xpath(
        '//div[@id="storeCatalogEntryID"]/text()').extract()
    if len(product_id_div) == 0 or len(product_id_div[0].strip()) == 0:
        return
    product_id = product_id_div[0].strip()
    item['show_product_id'] = product_id

    sku_infos_str = sel.xpath(
        '//div[@id="entitledItem_' + product_id + '"]/text()'
    ).re(r'\[[\s\S]+\]')[0].strip()
    context = execjs.compile('''
        var sku_info = %s;
        function getSkuInfo(){
            return sku_info;
        }
        ''' % sku_infos_str)
    sku_infos = context.call('getSkuInfo')

    dimensions = set()
    sizes = {}
    colors = []
    color_names = []
    color_name_images_map = {}

    # Colour names come from one of two page layouts; with neither present
    # a single placeholder colour is used.
    color_list = sel.xpath('//ul[@class="detail_color"]/li')
    if color_list:
        for color_li in color_list:
            color_item = Color()
            color_item['show_product_id'] = product_id
            color_item['from_site'] = self.name
            color_item['type'] = 'color'
            color_item['name'] = color_li.xpath(
                './a//div[@class="colorName"]/span/text()'
            ).extract()[0].strip()
            color_names.append(color_item['name'])
            colors.append(color_item)
    else:
        # The last "color_swatch_list" div is deliberately left out.
        color_list = sel.xpath('//div[@class="color_swatch_list"]')[:-1]
        if color_list:
            for color_li in color_list:
                for color_l in color_li.xpath('./ul/li'):
                    color_item = Color()
                    color_item['show_product_id'] = product_id
                    color_item['from_site'] = self.name
                    color_item['type'] = 'color'
                    color_item['name'] = color_l.xpath(
                        './a/@title').extract()[0]
                    color_names.append(color_item['name'])
                    colors.append(color_item)
        else:
            color_item = Color()
            color_item['name'] = 'one color'
            color_names.append(color_item['name'])
            colors.append(color_item)

    skus = []
    for sku_info in sku_infos:
        sku_item = SkuItem()
        sku_item['id'] = sku_info['catentry_id']
        sku_item['type'] = 'sku'
        sku_item['from_site'] = self.name
        sku_item['show_product_id'] = product_id

        # Attribute keys look like "<dimension>_<value>". Make sure there
        # is always a colour and a size entry to iterate over.
        attributes = sku_info['Attributes']
        if attributes == {}:
            attributes = {'vendorcolor_one color', 'size_one size'}
        elif not any('Size' in key for key in attributes):
            attributes['size_one size'] = '2'

        sku_size = {}
        for attribute in attributes:
            keys = attribute.split('_')
            dimension = keys[0].lower()
            value = keys[1]
            if dimension == 'vendorcolor':
                dimension = 'color'
                # The first SKU seen for a colour supplies that colour's
                # images and swatch cover.
                if value not in color_name_images_map:
                    if 'ItemThumbnailImage' not in sku_info:
                        return
                    thumbnail = sku_info['ItemThumbnailImage']
                    if not thumbnail.startswith('http'):
                        thumbnail = 'http:' + thumbnail
                    image = sku_info['ItemImage']
                    if not image.startswith('http'):
                        image = 'http:' + image
                    image = image + '&wid=970&hei=1245&fit=fit,1'
                    color_cover = sku_info['ItemSwatchImage2']
                    if not color_cover.startswith('http'):
                        color_cover = 'http:' + color_cover
                    color_name_images_map[value] = {
                        'images': [{
                            'thumbnail': thumbnail,
                            'image': image
                        }],
                        'cover': color_cover
                    }
            dimensions.add(dimension)
            sku_size[dimension] = value
            if dimension != 'color':
                sizes.setdefault(dimension, set()).add(value)
        sku_item['size'] = sku_size

        # A SKU without prices invalidates the whole product.
        if sku_info['offerPrice'] == '':
            return
        if sku_info['listPrice'] == '':
            return
        sku_item['current_price'] = handle_price(sku_info['offerPrice'])
        sku_item['list_price'] = handle_price(sku_info['listPrice'])
        if float(sku_item['list_price']) < float(sku_item['current_price']):
            sku_item['list_price'] = sku_item['current_price']

        if sku_info['availableQuantity']:
            sku_item['quantity'] = int(float(sku_info['availableQuantity']))
        else:
            sku_item['quantity'] = 0
        sku_item['is_outof_stock'] = sku_info['outOfStock']

        # NOTE(review): nothing above assigns sku_item['color'] (the colour
        # value only lands inside the 'size' dict), so unless SkuItem
        # pre-populates it this always becomes 'one color' - confirm that
        # is intended.
        if 'color' not in sku_item.keys():
            sku_item['color'] = 'one color'
        elif sku_item['color'] == {}:
            sku_item['color'] = 'one color'
        if 'size' not in sku_item.keys():
            sku_item['size'] = 'one size'
        elif sku_item['size'] == {}:
            sku_item['size'] = 'one size'
        skus.append(sku_item)

    for color in colors:
        if color['name'] in color_name_images_map:
            color_assets = color_name_images_map[color['name']]
            color['images'] = color_assets['images']
            color['cover'] = color_assets['cover']
            yield color

    item['dimensions'] = list(dimensions)
    for size_key in sizes:
        sizes[size_key] = list(sizes[size_key])
    item['sizes'] = sizes
    item['colors'] = color_names
    item['skus'] = skus

    # Prefer the main content block; fall back to the descriptions block.
    desc_blocks = sel.xpath('//div[@id="detial_main_content"]').extract()
    if desc_blocks:
        item['desc'] = desc_blocks[0]
    else:
        item['desc'] = sel.xpath(
            '//div[@class="descriptionsContent"]').extract()[0]
    yield item
def handle_parse_item(self, response, item):
    """Parse a Tiffany product page.

    Product data is read from the ``var dataLayer = ...;`` JSON embedded
    in the page. Yields one ``Color`` item ('One Color') followed by
    ``item`` (filled in place) with one SKU per size option, or a single
    'One Size' SKU when the page has no size select.

    Nothing is yielded when the ``dataLayer`` script is absent, the
    product has no price, or its stock status is 'out of stock'.

    :param response: response of the product detail page.
    :param item: partially filled base item; ``from_site`` is read from it.
    """
    sel = Selector(response)

    data_layer = re.search(r'var dataLayer\s*=\s*(.+?);', response.body)
    if not data_layer:
        # Not a product page (or the layout changed): skip instead of
        # failing with AttributeError on ``None.group``.
        return
    product = json.loads(data_layer.group(1))['products'][0]

    item['brand'] = 'Tiffany'
    item['title'] = product['name']
    if not product['price']:
        return
    item['list_price'] = product['price']
    item['current_price'] = product['price']
    item['desc'] = sel.xpath(
        '//div[@id="drawerDescription"]/div/div').extract()[0]

    # The og:image URL is the base of the cover and of both picture URLs.
    og_image = sel.xpath(
        '//meta[@property="og:image"]/@content').extract()[0]
    item['cover'] = og_image + self.cover_img_surffix

    if product['stockStatus'] == 'out of stock':
        return

    size_options = sel.xpath(
        '//select[@id="ctlSkuGroupType1_selItemList"]/option')
    if not size_options:
        item['show_product_id'] = product['sku']
        sizes = ['One Size']
    else:
        item['show_product_id'] = product['groupSku']
        sizes = []
        for size_option in size_options:
            option_text = size_option.xpath('./text()').extract()
            sizes.append(option_text[0] if option_text else 'One Size')

    skus = []
    for size in sizes:
        skuItem = SkuItem()
        skuItem['type'] = "sku"
        skuItem['from_site'] = item['from_site']
        skuItem['size'] = size
        skuItem['color'] = 'One Color'
        skuItem['id'] = (item['show_product_id'] + '-' + skuItem['color'] +
                         '-' + skuItem['size'])
        skuItem['show_product_id'] = item['show_product_id']
        skuItem['current_price'] = item['current_price']
        skuItem['list_price'] = item['list_price']
        skus.append(skuItem)

    imageItem = ImageItem()
    # NOTE(review): thumbnail and image both use the *large* suffix, as in
    # the original code - confirm no smaller rendition is wanted here.
    imageItem['thumbnail'] = og_image + self.large_img_surffix
    imageItem['image'] = og_image + self.large_img_surffix

    color = Color()
    color['type'] = 'color'
    color['from_site'] = item['from_site']
    color['show_product_id'] = item['show_product_id']
    color['images'] = [imageItem]
    color['name'] = 'One Color'
    color['cover'] = item['cover']
    yield color

    item['colors'] = ['One Color']
    item['sizes'] = sizes
    item['skus'] = skus
    item['dimensions'] = ['size']
    yield item
def handle_parse_item(self, response, item): sel = Selector(response) product_id_var = re.search(r'var pr_page_id[\s]*=[\s]*([^;]+);', response.body) if not product_id_var: return # if len(sel.xpath('//div[@class="outOfStock outOfStockSpecial"]')) > 0: # return product_id = eval(product_id_var.group(1)) item['show_product_id'] = product_id templateid = sel.xpath( '//div[@id="templateOption"]/@templateid').extract() if len(templateid) > 0: templateid = templateid[0] color_rows = sel.xpath('//tr[contains(@class, "diaperItemTR")]') clothing_shoes_products = sel.xpath( '//div[contains(@class, "clothingShoesProducts")]') if templateid == '6': '''''' colorItems = {} colorNames = [] oneColorDiv = sel.xpath('//div[@id="oneSelection"]') if len(oneColorDiv.extract()) > 0: colorItem = Color() color_cover = oneColorDiv.xpath(".//img/@src").extract()[0] if re.match(r'^//', color_cover): color_cover = 'https:' + color_cover color_name = oneColorDiv.xpath("text()").extract()[0] colorItem["cover"] = color_cover colorItem["name"] = color_name colorItem['type'] = 'color' colorItem['from_site'] = self.from_site colorItems[color_name] = {"item": colorItem, "handled": False} colorNames.append(color_name) else: colorDivs = sel.xpath('//ul[@id="falvorDrownList"]/li') #colorDivs = sel.xpath('//div[contains(@class, "clothingShoesProducts")]/div[contains(@class, "clothProductItem")]//div[contains(@class, "colorPaneItems")]'); for colorDiv in colorDivs: colorItem = Color() color_cover = colorDiv.xpath(".//img/@src").extract()[0] if re.match(r'^//', color_cover): color_cover = 'https:' + color_cover color_name = colorDiv.xpath("@id").extract()[0] colorItem["cover"] = color_cover colorItem["name"] = color_name colorItem['type'] = 'color' colorItem['from_site'] = self.from_site colorItems[color_name] = { "item": colorItem, "handled": False } colorNames.append(color_name) skuHiddens = sel.css('.multiItemBox').xpath( './/input[@class="skuHidden"]' ) 
#sel.xpath('//div[@id="clothItem"]//input[@class="skuHidden"]') skus = [] sizes = [] if len(skuHiddens) > 0: for skuHidden in skuHiddens: if skuHidden.xpath('@value').extract()[0] != "": skuItem = SkuItem() skuItem['show_product_id'] = item['show_product_id'] regular_price = skuHidden.xpath( '@regularprice').extract() price = skuHidden.xpath('@price').extract()[0] skuid = skuHidden.xpath('@value').extract()[0] is_outof_stock = skuHidden.xpath( '@isoutofstock').extract()[0] skuItem['id'] = skuid if len(regular_price) > 0 and regular_price[0] != '': skuItem["list_price"] = regular_price[0] else: skuItem["list_price"] = price skuItem['current_price'] = price replace_skuid = skuid.replace('-', '_') size = sel.xpath( '//ul[@id="diaperItemTR' + replace_skuid + '"]//li[@class="itemSize"]/span/text()').extract( )[0].strip() color_name = sel.xpath('//ul[@id="diaperItemTR' + replace_skuid + '"]/@primaryattr').extract()[0] skuItem['size'] = size if not size in sizes: sizes.append(size) skuItem['color'] = color_name skuItem[ 'is_outof_stock'] = self.change_out_of_stock_str( is_outof_stock) skuItem['type'] = 'sku' skuItem['from_site'] = self.from_site #yield skuItem skus.append(skuItem) if colorItems[color_name]["handled"] == False: colorItems[color_name]["handled"] = True url = 'https://www.diapers.com/product/productDetail!GetSkuImage.qs?skuCode=' + skuid + '&random=' + '%f' % random( ) yield Request(url, meta={ 'item': colorItems[color_name]['item'], 'show_product_id': item['show_product_id'] }, callback=self.parse_image) item["skus"] = skus item['sizes'] = sizes item['colors'] = colorNames elif len(color_rows.extract()) > 0: '''只有一个尺寸表格''' skus = [] colorNames = [] for color_row in color_rows: colorItem = Color() colorItem['from_site'] = self.from_site colorItem['type'] = 'color' skuHidden = color_row.xpath('.//input[@class="skuHidden"]') skuItem = SkuItem() skuItem['show_product_id'] = item['show_product_id'] regular_price = skuHidden.xpath('@regularprice').extract() price 
= skuHidden.xpath('@price').extract()[0] skuid = skuHidden.xpath('@value').extract()[0] is_outof_stock = skuHidden.xpath('@isoutofstock').extract()[0] skuItem['id'] = skuid # print response.url # print is_outof_stock # print skuHidden.extract() if len(regular_price) > 0 and regular_price[0] != '': skuItem["list_price"] = regular_price[0] else: skuItem["list_price"] = price skuItem['current_price'] = price color_cover = color_row.xpath( 'td[@class="itemImage"]/span[contains(@class, "itemImageDiv")]/img/@src' ).extract()[0] if re.match(r'^//', color_cover): color_cover = 'https:' + color_cover colorItem['cover'] = color_cover color_name = color_row.xpath( 'td[contains(@class, "Description")]/text()').extract() if len(color_name) > 0: color_name = color_name[0].strip() else: color_name = color_row.xpath( 'td[contains(@class, "elseDescription")]/text()' ).extract() if len(color_name) > 0: color_name = color_name[0].strip() else: color_name = color_row.xpath( 'td[contains(@class, "itemDescription")]/text()' ).extract() if len(color_name) > 0: color_name = color_name[0].strip() if not color_name: color_name = 'onecolor' colorItem['name'] = color_name colorNames.append(color_name) skuItem['color'] = color_name skuItem['size'] = 'onesize' skuItem['is_outof_stock'] = self.change_out_of_stock_str( is_outof_stock) skuItem['type'] = 'sku' skuItem['from_site'] = self.from_site #yield skuItem skus.append(skuItem) url = 'https://www.diapers.com/product/productDetail!GetSkuImage.qs?skuCode=' + skuid + '&random=' + '%f' % random( ) yield Request(url, meta={ 'item': colorItem, 'show_product_id': item['show_product_id'] }, callback=self.parse_image) item["skus"] = skus item['sizes'] = ['onesize'] item['colors'] = colorNames elif len(clothing_shoes_products) > 0: '''最常见的格式''' colorDivs = sel.xpath( '//div[contains(@class, "clothingShoesProducts")]/div[contains(@class, "clothProductItem")]//div[contains(@class, "colorPaneItems")]' ) colorItems = {} colorNames = [] for colorDiv in 
colorDivs: colorItem = Color() if len(colorDiv.xpath('./img').extract()) > 0: color_cover = colorDiv.xpath("./img/@src").extract()[0] if re.match(r'^//', color_cover): color_cover = 'https:' + color_cover color_name = colorDiv.xpath("./img/@color").extract()[0] colorItem["cover"] = color_cover elif len(colorDiv.xpath("./div/@style").extract()) > 0: cover_style = colorDiv.xpath("./div/@style").extract()[0] color_name = colorDiv.xpath("./div/@color").extract()[0] if 'background:' in cover_style: cover_style = re.search('background:([^;]+)', cover_style).group(1) colorItem["cover_style"] = cover_style else: return colorItem["name"] = color_name colorItem['type'] = 'color' colorItem['from_site'] = self.from_site colorItems[color_name] = {"item": colorItem, "handled": False} colorNames.append(color_name) skuHiddens = sel.xpath( '//div[@id="clothItem"]//input[@class="skuHidden"]') skus = [] sizes = [] if len(skuHiddens) > 0: for skuHidden in skuHiddens: if skuHidden.xpath('@value').extract()[0] != "": skuItem = SkuItem() skuItem['show_product_id'] = item['show_product_id'] regular_price = skuHidden.xpath( '@regularprice').extract() price = skuHidden.xpath('@price').extract()[0] skuid = skuHidden.xpath('@value').extract()[0] is_outof_stock = skuHidden.xpath( '@isoutofstock').extract()[0] skuItem['id'] = skuid if len(regular_price) > 0 and regular_price[0] != '': skuItem["list_price"] = regular_price[0] else: skuItem["list_price"] = price skuItem['current_price'] = price size = sel.xpath( '//div[@id="clothItem"]//input[@sku="' + skuid + '"]/@value').extract()[0] color_name = sel.xpath( '//div[@id="clothItem"]//input[@sku="' + skuid + '"]/@primaryattributevalue').extract()[0] skuItem['size'] = size if not size in sizes: sizes.append(size) skuItem['color'] = color_name skuItem[ 'is_outof_stock'] = self.change_out_of_stock_str( is_outof_stock) skuItem['type'] = 'sku' skuItem['from_site'] = self.from_site #yield skuItem skus.append(skuItem) if colorItems[color_name]["handled"] 
== False: colorItems[color_name]["handled"] = True url = 'https://www.diapers.com/product/productDetail!GetSkuImage.qs?skuCode=' + skuid + '&random=' + '%f' % random( ) yield Request(url, meta={ 'item': colorItems[color_name]['item'], 'show_product_id': item['show_product_id'] }, callback=self.parse_image) item["skus"] = skus item['sizes'] = sizes item['colors'] = colorNames elif len( sel.xpath( '//div[@id="templateOption"]/div[contains(@class, "colorSizeFirstStep")]//li[contains(@class, "colorPaneItems")]' ).extract()) > 0: '''判断是否是只有颜色没有尺寸的情况''' colorDivs = sel.xpath( '//div[@id="templateOption"]/div[contains(@class, "colorSizeFirstStep")]//li[contains(@class, "colorPaneItems")]' ) colorItems = {} colorNames = [] skus = [] for colorDiv in colorDivs: colorItem = Color() if len(colorDiv.xpath('./img').extract()) > 0: color_cover = colorDiv.xpath("./img/@src").extract()[0] if re.match(r'^//', color_cover): color_cover = 'https:' + color_cover color_name = colorDiv.xpath("./img/@color").extract()[0] colorItem["cover"] = color_cover else: cover_style = colorDiv.xpath("./div/@style").extract()[0] color_name = colorDiv.xpath("./div/@color").extract()[0] if 'background:' in cover_style: cover_style = re.search('background:([^;]+)', cover_style).group(1) colorItem["cover_style"] = cover_style colorItem["name"] = color_name colorItem['type'] = 'color' colorItem['from_site'] = self.from_site colorItems[color_name] = {"item": colorItem, "handled": False} colorNames.append(color_name) skuid = colorDiv.xpath('./img/@sku').extract()[0] skuHidden = sel.xpath('//input[@id="skuHidden' + skuid.replace('-', '_') + '"]') skuItem = SkuItem() skuItem['show_product_id'] = item['show_product_id'] regular_price = skuHidden.xpath('@regularprice').extract() price = skuHidden.xpath('@price').extract()[0] is_outof_stock = skuHidden.xpath('@isoutofstock').extract()[0] skuItem['id'] = skuid if len(regular_price) > 0 and regular_price[0] != '': skuItem["list_price"] = regular_price[0] else: 
skuItem["list_price"] = price skuItem['current_price'] = price size = 'onesize' skuItem['size'] = size skuItem['color'] = color_name skuItem['is_outof_stock'] = self.change_out_of_stock_str( is_outof_stock) skuItem['type'] = 'sku' skuItem['from_site'] = self.from_site #yield skuItem skus.append(skuItem) url = 'https://www.diapers.com/product/productDetail!GetSkuImage.qs?skuCode=' + skuid + '&random=' + '%f' % random( ) yield Request(url, meta={ 'item': colorItem, 'show_product_id': item['show_product_id'] }, callback=self.parse_image) item['sizes'] = ['onesize'] item["skus"] = skus item['colors'] = colorNames elif len( sel.xpath( '//div[@id="templateOption"]/div[contains(@class, "colorSizeFirstStep")]' ).extract()) > 0: colorItem = Color() # print sel.xpath('//div[@id="QtyInputDiv"]//li[contains(@class,"itemImage")]').extract() # color_cover = sel.xpath('//div[@id="QtyInputDiv"]//li[contains(@class,"itemImage")]//img/@src').extract()[0] color_name = 'onecolor' # colorItem['cover'] = color_cover colorItem['name'] = color_name colorItem['type'] = 'color' colorItem['from_site'] = self.from_site sku_sizes = sel.css('.colorSizeFirstStep .collectionSelections' ).xpath('./li/input') skus = [] sizes = [] colorItemSku = '' for sku_size in sku_sizes: '''''' skuItem = SkuItem() skuItem['show_product_id'] = item['show_product_id'] skuid = sku_size.xpath('@sku').extract()[0] skuHidden = sel.xpath('//input[@id="skuHidden' + skuid.replace('-', '_') + '"]') regular_price = skuHidden.xpath('@regularprice').extract() price = skuHidden.xpath('@price').extract()[0] is_outof_stock = skuHidden.xpath('@isoutofstock').extract()[0] skuItem['id'] = skuid if len(regular_price) > 0 and regular_price[0] != '': skuItem["list_price"] = regular_price[0] else: skuItem["list_price"] = price skuItem['current_price'] = price size = sel.xpath('//img[@id="' + skuid.replace('-', '_') + 'ColorButton"]/@colorvalue').extract()[0] skuItem['size'] = size if not size in sizes: sizes.append(size) 
skuItem['color'] = color_name skuItem['is_outof_stock'] = self.change_out_of_stock_str( is_outof_stock) skuItem['type'] = 'sku' skuItem['from_site'] = self.from_site #yield skuItem if colorItemSku == '': colorItemSku = skuid color_cover = sel.xpath( '//img[@id="' + skuid.replace('-', '_') + 'ColorButton"]/@imgsrc').extract()[0] if re.match(r'^//', color_cover): color_cover = 'https:' + color_cover colorItem['cover'] = color_cover skus.append(skuItem) item["skus"] = skus item['sizes'] = sizes item["colors"] = ['onecolor'] url = 'https://www.diapers.com/product/productDetail!GetSkuImage.qs?skuCode=' + colorItemSku + '&random=' + '%f' % random( ) yield Request(url, meta={ 'item': colorItem, 'show_product_id': item['show_product_id'] }, callback=self.parse_image) elif templateid == '10': # primary_attr = sel.xpath('//table[@id="primaryAttributeList"]/@attributename') # primary_attr = primary_attr.extract()[0].strip().lower() if len(primary_attr)>0 else '' # # second_attr = sel.xpath('//table[@id="secondAttributeList"]/@attributename') # second_attr = second_attr.extract()[0].strip().lower() if len(second_attr)>0 else '' # # third_attr = sel.xpath('//table[@id="thirdAttributeList"]/@attributename') # third_attr = third_attr.extract()[0].strip().lower() if len(third_attr)>0 else '' # # item['dimensions'] = [dimension for dimension in [primary_attr,second_attr,third_attr] if dimension] # # primary_attr_ids = sel.xpath('//table[@id="primaryAttributeList"]/tr/@id').extract() # second_attr_ids = sel.xpath('//table[@id="secondAttributeList"]/tr/@id').extract() # third_attr_ids = sel.xpath('//table[@id="thirdAttributeList"]/tr/@id').extract() # # primary_attr_dict = {} # second_attr_dict = {} # third_attr_dict = {} # item['sizes'] = {} skus = [] # if primary_attr and len(primary_attr_ids) > 0: # item['sizes'][primary_attr] = [] # for primary_attr_id in primary_attr_ids: # primary_attr_value = sel.xpath('//table[@id="primaryAttributeList"]/tr[@id="' + str(primary_attr_id) + 
'"]/td[@class="attributeValue "]/b/text()').extract()[0] # item['sizes'][primary_attr].append(primary_attr_value) # primary_attr_dict[primary_attr_id] = primary_attr_value # # if second_attr and len(second_attr_ids) > 0: # item['sizes'][second_attr] = [] # for second_attr_id in second_attr_ids: # second_attr_value = sel.xpath('//table[@id="secondAttributeList"]/tr[@id="' + str(second_attr_id) + '"]/td[@class="attributeValue "]/b/text()').extract()[0] # item['sizes'][second_attr].append(second_attr_value) # second_attr_dict[second_attr_id] = second_attr_value # # if third_attr and len(third_attr_ids) > 0: # item['sizes'][third_attr] = [] # for third_attr_id in third_attr_ids: # third_attr_value = sel.xpath('//table[@id="thirdAttributeList"]/tr[@id="' + str(third_attr_id) + '"]/td[@class="attributeValue "]/b/text()').extract()[0] # item['sizes'][third_attr].append(third_attr_value) # third_attr_dict[third_attr_id] = third_attr_value # if 'sizes' not in item['sizes'].keys(): # item['sizes']['size'] = ['One Size'] # if 'color' not in item['sizes'].keys(): # item['colors'] = ['One Color'] product_json_str = re.search('var pdpOptionsJson\s*=\s*([^\n]+);', response.body) if product_json_str: product_json_str = product_json_str.group(1) product_json = json.loads(product_json_str) colors = [] for sku_json in product_json: skuItem = SkuItem() skuItem['id'] = sku_json['Sku'] skuItem['show_product_id'] = item['show_product_id'] skuItem['size'] = {} skuItem['color'] = sku_json['Description'] skuItem['size'] = 'One Size' colors.append(skuItem['color']) # if primary_attr and len(primary_attr_dict) > 0 and len(sku_json['PrimaryAttributeValue'])>0: # skuItem['size'][primary_attr] = primary_attr_dict[sku_json['PrimaryAttributeValue']] # # if second_attr and len(second_attr_dict) > 0 and len(sku_json['SecondAttributeValue'])>0: # skuItem['size'][second_attr] = second_attr_dict[sku_json['SecondAttributeValue']] # # if third_attr and len(third_attr_dict) > 0 and 
len(sku_json['ThirdAttributeValue'])>0: # skuItem['size'][third_attr] = third_attr_dict[sku_json['ThirdAttributeValue']] # if len(skuItem['size']) == 0: # skuItem['size'] = 'One Size' # skuItem['color'] = 'One Color' # else: # if 'size' not in skuItem['size'].keys(): # skuItem['size'] = 'One Size' # if 'color' not in skuItem['size'].keys(): # skuItem['color'] = 'One Color' skuItem['current_price'] = sku_json['RetailPrice'] if not sku_json['RegularPrice']: skuItem["list_price"] = sku_json['RetailPrice'] else: skuItem['list_price'] = sku_json['RegularPrice'] skuItem['is_outof_stock'] = self.change_out_of_stock_str( sku_json['IsOutOfStock']) skuItem['type'] = 'sku' skuItem['from_site'] = self.from_site skus.append(skuItem) colorItem = Color() colorItem['name'] = skuItem['color'] colorItem['type'] = 'color' colorItem['from_site'] = self.from_site color_url = 'https://www.diapers.com/product/productDetail!GetSkuImage.qs?skuCode=' + skuItem[ "id"] + '&random=' + '%f' % random() yield Request(color_url, meta={ 'item': colorItem, 'show_product_id': item['show_product_id'] }, callback=self.parse_image) item['skus'] = skus item['colors'] = colors item['sizes'] = ['One Size'] else: '''告警''' # print response.meta['url'] return desc = sel.css('.descriptTabContent').xpath( "//div[@class='pIdDesContent']").extract() if len(desc) > 0: item['desc'] = desc[0] else: item['desc'] = '' '''handle size info''' size_chart_type = re.search(r'var sizeChartType[\s]*=[\s]*"([^"]+)";', response.body) size_info_brand_name = re.search(r'var brandName[\s]*=[\s]*"([^"]+)";', response.body) if size_chart_type and size_info_brand_name: size_info_brand_name = size_info_brand_name.group(1) size_chart_type = size_chart_type.group(1) item['size_info'] = { 'brand_name': size_info_brand_name, 'size_chart_type': size_chart_type } '''size info chart url''' size_info_chart_url = 'https://www.diapers.com/Product/BrandSizeChartHopup.qs?brandName=' + quote( size_info_brand_name) + '&sizeChartType=' + quote( 
size_chart_type) #print size_info_chart_url yield Request(size_info_chart_url, meta={ 'brand_name': size_info_brand_name, 'size_chart_type': size_chart_type }, callback=self.parse_size_info) yield item
def handle_parse_item(self, response, item):
    """Populate ``item`` from a product page and emit it with an image request.

    Nothing is yielded when the page has no ``addToCart`` form or when that
    form contains the text ``SOLD OUT``.  Otherwise the method fills in the
    prices (only when ``item`` has no ``list_price`` yet), brand, description,
    product id, category (only when not already set) and one SKU per size
    option.  It then yields ``item`` followed by a ``Request`` for the
    ``larger_view.asp`` page, which is handled by ``self.parse_color``.

    :param response: the downloaded product page.
    :param item: the base item to complete; it is mutated in place.
    """
    sel = Selector(response)

    add_to_cart_forms = sel.xpath('//form[@name="addToCart"]').extract()
    if not add_to_cart_forms or re.search(r'SOLD OUT', add_to_cart_forms[0]):
        return

    # Every node read below hangs off the same deeply nested layout cells;
    # the prefixes are spelled once so the individual paths stay readable.
    content_cell = ('//div[1]/div/div/table/tr[2]/td/table[3]/tr/td[2]'
                    '/table[3]/tr/td[2]')
    info_cell = content_cell + '/table[1]/tr/td[1]/table/tr/td'

    if 'list_price' not in item:
        price_p = sel.xpath(info_cell + '/p[last()]')
        price_font = price_p.xpath('font[1]/text()')

        # Prefer "Now: $x", then "Sale: $x"; both capture the bare number.
        labelled_prices = (price_font.re(r'Now:\s*\$([\d.]+)') or
                           price_font.re(r'Sale:\s*\$([\d\.]+)'))
        if labelled_prices:
            current_price = labelled_prices[0]
        else:
            # Last resort: raw text of the first <font>, minus a known label.
            current_price = sel.xpath(
                info_cell + '/font[1]/text()').extract()[0]
            if 'Sale:' in current_price:
                current_price = current_price.replace('Sale:', '')
            elif 'Today:' in current_price:
                current_price = current_price.replace('Today:', '')

        struck_prices = price_p.xpath(
            'font[@class="strike"]/font/text()').extract()
        if struck_prices:
            list_price = struck_prices[0].strip()
        else:
            orig_match = re.search(r'Orig:\s*\$([\d\.]+)',
                                   price_p.extract()[0])
            if orig_match is None:
                list_price = sel.xpath(info_cell + '/font[2]/text()').re(
                    r'Orig:\s*\$([\d\.]+)')[0]
            else:
                list_price = orig_match.group(1).strip()

        item['list_price'] = list_price
        item['current_price'] = current_price

    # ``.re`` on an empty selection returns an empty list, so a missing link
    # and a link whose text does not match both fall back to the default.
    brand_names = sel.xpath(
        '//div/div/div/table/tr[2]/td/table[3]/tr/td[2]/table[3]/tr/td[2]/table[1]/tr/td[1]/table/tr/td/div/a/text()'
    ).re(r'About the\s*(.+)\s+Brand$')
    brand = brand_names[0] if brand_names else 'New Balance'

    description = ''
    for text_xpath in (content_cell + '/div[1]/table/tr[1]/td/text()',
                       content_cell + '/div[1]/table/tr[1]/td/p/text()'):
        texts = sel.xpath(text_xpath).extract()
        if texts:
            description = '<div>' + texts[0] + '</div>'
            break
    # A bullet list in the second block replaces the plain text when present.
    bullet_lists = sel.xpath(
        content_cell + '/div[2]/table/tr[1]/td/ul').extract()
    if bullet_lists:
        description = '<div>' + bullet_lists[0] + '</div>'
    if not description:
        description = '暂无'

    style_category_font = sel.xpath(info_cell + '/p[1]/font[last()]/text()')
    show_product_id = style_category_font.re(r'Style:\s*(.+)')[0]
    category = style_category_font.extract()[1]

    sizes = []
    skus = []
    size_options = sel.xpath(
        content_cell +
        '/table[1]/tr/td[3]/form/table[1]/tr[2]/td/select/option')
    for size_option in size_options:
        option_value = size_option.xpath('@value').extract_first()
        if option_value == 'select':
            # Placeholder entry of the drop-down, not a real size.
            continue
        label = size_option.xpath('text()').extract_first()
        if label is None:
            # An option without a text node carries no size to record.
            continue
        size = label.replace(u'\xa0', '').encode('utf-8')

        sku_item = SkuItem()
        sku_item['type'] = 'sku'
        sku_item['show_product_id'] = show_product_id
        sku_item['from_site'] = item['from_site']
        # Fall back to the size label when the option has no value attribute.
        sku_item['id'] = option_value if option_value is not None else size
        sku_item['list_price'] = item['list_price']
        sku_item['current_price'] = item['current_price']
        sku_item['size'] = size
        sku_item['color'] = 'onecolor'
        sku_item['is_outof_stock'] = False
        sizes.append(size)
        skus.append(sku_item)

    item['skus'] = skus
    item['sizes'] = {'size': sizes}
    item['dimensions'] = ['size']
    item['colors'] = ['onecolor']
    item['brand'] = brand
    item['desc'] = description
    item['show_product_id'] = show_product_id
    if 'category' not in item:
        item['category'] = category
    yield item

    color_url = self.base_url + 'larger_view.asp?style=' + show_product_id
    yield Request(color_url, callback=self.parse_color, meta={'item': item})
def handle_parse_item(self, response, item):
    """Populate ``item`` from a product page and emit its colour and SKU data.

    Nothing is yielded for pages showing the ``atg_store_noMatchingItem``
    block.  Otherwise the method yields one ``Color`` item holding every
    alternate image, and then either

    * a ``Request`` for the first per-size URL (handled by
      ``self.parse_sku_item``, which receives ``item`` through ``meta``), or
    * ``item`` itself with a single "one size" SKU when the page lists no
      sizes.

    :param response: the downloaded product page.
    :param item: the base item to complete; it is mutated in place.
    :raises ValueError: when the ``window.universal_variable`` script that
        carries the price data is not found in the page.
    """
    sel = Selector(response)
    if len(sel.xpath(".//div[@class='atg_store_noMatchingItem']")) > 0:
        return

    info = sel.xpath(".//div[@class='firstContainer row']")
    item['brand'] = info.xpath("./h1/a[1]/text()").extract()[0]
    item['show_product_id'] = info.xpath(
        "./h1/h2/text()").extract()[0].strip()
    item['title'] = info.xpath("./h1/a[2]/text()").extract()[0]
    item['colors'] = []

    if len(sel.xpath(".//div[@id='tab1_info']")) > 0:
        desc = sel.xpath(".//div[@id='tab1_info']/div[1]/table").extract()[0]
        if len(sel.xpath(".//div[@id='tab1_info']/div[2]")) > 0:
            desc += sel.xpath(
                ".//div[@id='tab1_info']/div[2]/table").extract()[0]
        item['desc'] = desc

    skus_js = "".join(
        re.findall(r'window.universal_variable =.+\}\}<\/script>',
                   response.body, re.S))
    if not skus_js:
        # Without this blob there is no price data; fail with a clear
        # message instead of a NameError on the first use further down.
        raise ValueError('window.universal_variable not found: ' +
                         response.url)
    # Keep only the object literal between the assignment and </script>.
    literal_start = len('window.universal_variable =')
    literal_end = -len('</script>')
    context = execjs.compile('''
        var skus = %s;
        function getSkus(){
            return skus;
        }
    ''' % skus_js[literal_start:literal_end])
    product = context.call('getSkus')['product']
    item['list_price'] = product['unit_price']
    item['current_price'] = product['unit_sale_price']

    images = []
    for image_node in sel.xpath(".//ul[@class='alt_imgs col-md-12']/li"):
        imageItem = ImageItem()
        imageItem['image'] = self.base_url + image_node.xpath(
            "./a/@href").extract()[0]
        imageItem['thumbnail'] = re.sub(r'XA\.', 'LA.', imageItem['image'])
        images.append(imageItem)

    colorItem = Color()
    colorItem['images'] = images
    colorItem['type'] = 'color'
    colorItem['from_site'] = item['from_site']
    colorItem['show_product_id'] = item['show_product_id']
    colorItem['name'] = u'one color'
    colorItem['cover'] = self.base_url + sel.xpath(
        ".//ul[@class='alt_imgs col-md-12']/li[1]/a/img/@src").extract()[0]
    yield colorItem

    item['colors'].append(colorItem['name'])
    item['dimensions'] = ['size']
    item['skus'] = []

    size_nodes = sel.xpath(".//div[@id='sizeValues']/div")
    if len(size_nodes) > 0:
        refresh_url = self.base_url + sel.xpath(
            ".//form[@id='colorsizerefreshform']/@action").extract()[0]
        sku_item_url_list = []
        sku_size_list = []
        for size_node in size_nodes:
            # The onclick handler is read as quote-delimited pieces:
            # piece 1 is the product id, piece 3 the size label.
            onclick_parts = size_node.xpath(
                "./@onclick").extract()[0].split("'")
            sku_size = onclick_parts[3].replace(' ', '%20')
            ajax_id = onclick_parts[1]
            sku_item_url = (refresh_url + '&productId=' + ajax_id +
                            '&selectedSize=' + sku_size)
            sku_item_url_list.append(sku_item_url)
            sku_size_list.append(sku_size)
        # only for avoiding indexError in parse_sku_item when loop reach
        # the last size
        sku_item_url_list.append(sku_item_url)
        yield Request(sku_item_url_list[0],
                      callback=self.parse_sku_item,
                      meta={
                          "sku_size_list": sku_size_list,
                          "sku_item_url_list": sku_item_url_list,
                          "item": item,
                          "index": 0
                      })
    else:
        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['show_product_id'] = product['id']
        skuItem['list_price'] = item['list_price']
        skuItem['current_price'] = item['current_price']
        skuItem['color'] = u'one color'
        skuItem['size'] = u'one size'
        skuItem['id'] = product['sku_code']
        skuItem['from_site'] = item['from_site']
        # Always set the flag so in-stock SKUs carry an explicit False.
        skuItem['is_outof_stock'] = product['stock'] == 0
        item['skus'].append(skuItem)
        item['sizes'] = [u'one size']
        yield item
def handle_parse_item(self, response, baseItem):
    """Populate ``baseItem`` from a product page and emit colours and the item.

    The page embeds a ``var productDetail`` JavaScript object describing the
    sizes and colours; it is evaluated with ``execjs``.  One ``Color`` item is
    yielded per colour, then ``baseItem`` with its SKUs, sizes, colours and,
    when present, the ids of the products in the ``similarities`` list.

    NOTE(review): the nesting of this method was reconstructed; the item is
    only yielded when ``productDetail`` is found, mirroring the sibling
    spider that reads the same structure -- confirm against the caller.

    :param response: the downloaded product page.
    :param baseItem: the base item to complete; it is mutated in place.
    """
    sel = Selector(response)
    product_id = sel.xpath('//div[@id="productId"]/text()').extract()[0]
    baseItem['gender'] = 'men'
    baseItem['type'] = 'base'
    baseItem['from_site'] = self.name
    baseItem['show_product_id'] = product_id
    baseItem['title'] = sel.xpath(
        '//span[@class="row product-title"]/text()').extract()[0].strip()

    description_html = sel.xpath(
        '//div[@itemprop="description"]').extract()[0]
    size_fit_container = sel.xpath('//div[@id="sizeFitContainer"]')
    if len(size_fit_container) > 0:
        baseItem['desc'] = ('<div>' + description_html +
                            size_fit_container.extract()[0] + "</div>")
    else:
        baseItem['desc'] = description_html
    baseItem['dimensions'] = ['size', 'color']

    product_detail_str = "".join(
        re.findall(r"var\s+productDetail[^;]+", response.body))
    if not product_detail_str:
        return
    # The captured statement has no trailing ';', so the line break after
    # %s is what separates it from the accessor function.
    context = execjs.compile('''
        %s
        function get_product_detail(){
            return productDetail;
        }
    ''' % (product_detail_str))
    product_detail = context.call('get_product_detail')

    # Colours reference sizes by their sizeCode; map each code back to the
    # key it is stored under in ``productDetail.sizes``.
    size_infos = {}
    size_values = []
    for size_id, size_js_info in product_detail['sizes'].items():
        size_infos[size_js_info['sizeCode']] = size_id
        size_values.append(size_id)

    list_price = sel.xpath(
        '//div[@id="productPrices"]//meta[@itemprop="price"]/@content'
    ).extract()[0]
    selling_price_match = re.search(
        r'productPage\.sellingPrice\=\'([\d\.]+)\';', response.body)
    if selling_price_match is None:
        current_price = list_price
    else:
        current_price = selling_price_match.group(1)

    skus = []
    color_names = []
    for key, color_info in product_detail['colors'].items():
        # The key is appended so that equal colour names stay distinct.
        color_name = color_info['colorName'].strip() + '-' + str(key)
        color_names.append(color_name)

        indexed_images = []
        for image in color_info['images'].values():
            imageItem = ImageItem()
            imageItem['thumbnail'] = image['thumbnail']
            imageItem['image'] = image['zoom']
            indexed_images.append((image['index'], imageItem))
        indexed_images.sort(key=lambda pair: pair[0])

        colorItem = Color()
        colorItem['type'] = 'color'
        colorItem['show_product_id'] = baseItem['show_product_id']
        colorItem['from_site'] = self.name
        colorItem['cover'] = color_info['swatch']
        colorItem['name'] = color_name
        colorItem['images'] = [pair[1] for pair in indexed_images]
        yield colorItem

        for size in color_info['sizes']:
            skuItem = SkuItem()
            skuItem['type'] = 'sku'
            skuItem['from_site'] = self.name
            skuItem['color'] = color_name
            skuItem['show_product_id'] = baseItem['show_product_id']
            skuItem['id'] = key + "-" + size
            skuItem['size'] = size_infos[size]
            skuItem['list_price'] = list_price
            skuItem['current_price'] = current_price
            skuItem['is_outof_stock'] = False
            skus.append(skuItem)

    baseItem['sizes'] = size_values
    baseItem['colors'] = color_names
    baseItem['skus'] = skus

    related_items_id = []
    for product_node in sel.xpath(
            '//ul[@id="similarities"]/li[@class="product"]'):
        related_id = product_node.xpath(
            './div/div[@class="info"]/img/@data-product-id').extract_first()
        # A tile without the attribute is skipped rather than aborting the
        # item after its colours have already been emitted.
        if related_id is not None:
            related_items_id.append(related_id)
    if related_items_id:
        baseItem['related_items_id'] = related_items_id
    yield baseItem
def handle_parse_item(self, response, baseItem):
    """Populate ``baseItem`` from a product page and emit colours and the item.

    The page embeds a ``var productDetail`` JavaScript object describing the
    sizes and colours; it is evaluated with ``execjs``.  When the object is
    not present nothing is yielded and ``baseItem`` is left untouched.
    Otherwise one ``Color`` item is yielded per colour, followed by
    ``baseItem`` with its SKUs, sizes, colours and description.

    :param response: the downloaded product page.
    :param baseItem: the base item to complete; it is mutated in place.
    """
    product_detail_str = "".join(
        re.findall(r"var\s+productDetail[^;]+", response.body))
    if not product_detail_str:
        return
    # The captured statement has no trailing ';', so the line break after
    # %s is what separates it from the accessor function.
    context = execjs.compile('''
        %s
        function get_product_detail(){
            return productDetail;
        }
    ''' % (product_detail_str))
    product_detail = context.call('get_product_detail')

    sel = Selector(response)
    product_id = sel.xpath('//div[@id="productId"]/text()').extract()[0]
    baseItem['from_site'] = self.name
    baseItem['show_product_id'] = product_id

    # Colours reference sizes by their sizeCode; map each code back to the
    # key it is stored under in ``productDetail.sizes``.
    size_infos = {}
    size_values = []
    for size_id, size_js_info in product_detail['sizes'].items():
        size_infos[size_js_info['sizeCode']] = size_id
        size_values.append(size_id)

    list_price = sel.xpath(
        '//div[@id="productPrices"]//meta[@itemprop="price"]/@content'
    ).extract()[0]

    # Price shown per colour label: the regularPrice span when the block
    # has one, otherwise its salePrice span.
    color_price_mapping = {}
    for price_block in sel.xpath(
            '//div[@id="productPrices"]//div[@class="priceBlock"]'):
        price_colors = price_block.xpath(
            './span[@class="priceColors"]/text()').extract()
        if not price_colors:
            continue
        regular_prices = price_block.xpath(
            './span[@class="regularPrice"]/text()').extract()
        if regular_prices:
            color_price_mapping[price_colors[0]] = regular_prices[0]
        else:
            color_price_mapping[price_colors[0]] = price_block.xpath(
                './span[@class="salePrice"]/text()').extract()[0]

    skus = []
    color_names = []
    for key, color_info in product_detail['colors'].items():
        color_name = color_info['colorName'].strip()
        color_names.append(color_name)

        indexed_images = []
        for image in color_info['images'].values():
            imageItem = ImageItem()
            imageItem['thumbnail'] = image['thumbnail']
            imageItem['image'] = image['zoom']
            indexed_images.append((image['index'], imageItem))
        indexed_images.sort(key=lambda pair: pair[0])

        colorItem = Color()
        colorItem['type'] = 'color'
        colorItem['show_product_id'] = product_id
        colorItem['from_site'] = self.name
        colorItem['cover'] = color_info['swatch']
        colorItem['name'] = color_name
        colorItem['images'] = [pair[1] for pair in indexed_images]
        yield colorItem

        # Colours without their own price block sell at the list price.
        current_price = color_price_mapping.get(color_name, list_price)
        for size in color_info['sizes']:
            skuItem = SkuItem()
            skuItem['type'] = 'sku'
            skuItem['from_site'] = self.name
            skuItem['color'] = color_name
            skuItem['show_product_id'] = product_id
            skuItem['id'] = key + "-" + size
            skuItem['size'] = size_infos[size]
            skuItem['list_price'] = list_price
            skuItem['current_price'] = current_price
            skuItem['is_outof_stock'] = False
            skus.append(skuItem)

    baseItem['sizes'] = size_values
    baseItem['colors'] = color_names
    baseItem['skus'] = skus

    description_html = sel.xpath(
        '//div[@itemprop="description"]').extract()[0]
    size_fit_container = sel.xpath('//div[@id="sizeFitContainer"]')
    if len(size_fit_container) > 0:
        baseItem['desc'] = ('<div>' + description_html +
                            size_fit_container.extract()[0] + "</div>")
    else:
        baseItem['desc'] = description_html
    baseItem['dimensions'] = ['size', 'color']
    yield baseItem
def handle_parse_item(self, response, item):
    """Populate ``item`` from a product page and emit its colours and the item.

    The page carries its data as JavaScript variable assignments
    (``stockJSON``, ``colorPrices``, ``styleIds``, ``pImgs`` ...).  Nothing
    is yielded when ``outOfStock`` is ``true``, when ``stockJSON`` is absent
    or empty, or when no description list is found.  Otherwise one ``Color``
    item is yielded per entry of ``colorNames``, followed by ``item`` after
    it has been passed through ``self.handle_dimension_to_name``.

    NOTE(review): the nesting of the image loop was reconstructed; a
    thumbnail is only emitted when a full-size image exists for the same
    view -- confirm this matches the intended output.

    :param response: the downloaded product page; ``response.meta`` may
        carry a ``gender`` hint.
    :param item: the base item to complete; it is mutated in place.
    """
    body = response.body

    # Cheap text checks come first: pages that yield nothing need neither
    # the JavaScript runtime nor a Selector.
    outofstock_result = re.search(r'outOfStock[\s]*=[\s]*([^;]+);', body)
    if outofstock_result and outofstock_result.group(1) == 'true':
        return
    stock_json_result = re.search(r'var stockJSON[\s]*=[\s]*([^;]+);', body)
    if not stock_json_result:
        return
    # SECURITY(review): eval() executes text taken from the downloaded page.
    # It is kept because the literals are not guaranteed to be strict JSON,
    # but it should be replaced by a non-executing parser.
    stock_dic = eval(stock_json_result.group(1))
    if not stock_dic:
        return

    def page_literal(pattern):
        """Evaluate the literal captured by ``pattern`` in the page body."""
        # SECURITY(review): same eval() caveat as for stockJSON above.
        return eval(re.search(pattern, body).group(1))

    pImgStr = "".join(re.findall(r'(pImgs[^;]+;)+', body))
    context = execjs.compile('''
        %s
        function getPImgs(){
            return pImgs;
        }
    ''' % pImgStr)
    pImgs = context.call('getPImgs')
    sel = Selector(response)

    color_price_dic = page_literal(r'colorPrices[\s]*=[\s]*([^;]+);')
    style_id_dic = page_literal(r'styleIds[\s]*=[\s]*([^;]+);')
    product_gender = page_literal(r'productGender[\s]*=[\s]*([^;]+);')
    zeta_categories = page_literal(r'zetaCategories[\s]*=[\s]*([^;]+);')
    category = page_literal(r';[\s]*category[\s]*=[\s]*([^;]+);')
    sub_category = page_literal(r'subCategory[\s]*=[\s]*("[^"]+"[\s]*);')
    dimension_dic = page_literal(r'dimensions[\s]*=[\s]*([^;]+);')
    dimensionIdToNameJson = page_literal(
        r'dimensionIdToNameJson[\s]*=[\s]*([^;]+);')
    valueIdToNameJSON = page_literal(r'valueIdToNameJSON[\s]*=[\s]*([^;]+);')
    colorNames = page_literal(r'colorNames[\s]*=[\s]*({[^}]+}[\s]*);')

    if len(zeta_categories) > 0:
        item['product_type'] = list(zeta_categories[0].values())[0]
        if category == item['product_type']:
            item['category'] = sub_category
        else:
            item['category'] = category
            item['sub_category'] = sub_category
    else:
        item['product_type'] = category
        item['category'] = sub_category

    if 'gender' in response.meta:
        meta_gender = response.meta['gender']
        # ``lower`` must be called; comparing the bound method itself to a
        # string is never equal and made the unisex branch unreachable.
        if product_gender.lower() == 'unisex':
            if meta_gender in ('boys', 'girls'):
                item['gender'] = 'kid-unisex'
            else:
                item['gender'] = 'unisex'
        else:
            item['gender'] = meta_gender

    # Only the bullet list of the description is kept; the remaining
    # description markup is skipped as too complex.  Two page layouts are
    # tried in turn.
    for list_xpath in (
            '//div[@id="productDescription"]//div[@itemprop="description"]/ul',
            '//div[@id="prdInfoText"]//span[@class="description summary"]/ul'):
        desc = sel.xpath(list_xpath).extract()
        if desc:
            break
    else:
        return
    item['desc'] = desc[0]

    size_info_images = []
    for size_info in sel.xpath(list_xpath + '/li/a[@class="popup-570-550"]'):
        size_info_image_url = size_info.xpath('@href').extract()[0]
        # Only site-relative links get the base URL; absolute http and
        # https links are kept as they are.
        if not size_info_image_url.startswith(('http://', 'https://')):
            size_info_image_url = self.base_url + size_info_image_url
        size_info_images.append(size_info_image_url)
    if size_info_images:
        item['size_info'] = {'images': size_info_images}

    # Handle colours.
    colors = []
    for (color, color_name) in colorNames.items():
        colorItem = Color()
        colorItem['type'] = 'color'
        colorItem['from_site'] = self.name
        colorItem['show_product_id'] = item['show_product_id']
        colorItem['name'] = color_name
        colors.append(color_name)

        color_images = pImgs[str(style_id_dic[color])]
        detailed = color_images['DETAILED']
        thumbnails = color_images['MULTIVIEW_THUMBNAILS']
        # Cover preference order; the first view that exists wins.
        for view_source, view in ((detailed, 'p'), (detailed, 'd'),
                                  (thumbnails, '1'), (thumbnails, '4'),
                                  (thumbnails, '5')):
            if view in view_source:
                colorItem['cover'] = view_source[view]
                break

        images = color_images['2x']
        if len(images) == 0:
            images = color_images['MULTIVIEW']

        images_array = []
        for view, thumbnail in sorted(thumbnails.items(),
                                      key=lambda pair: pair[0]):
            if view not in images:
                continue
            imageItem = ImageItem()
            imageItem['image'] = images[view]
            imageItem['thumbnail'] = thumbnail
            # The 'p' and 'd' views go to the front of the gallery.
            if view == 'p' or view == 'd':
                images_array.insert(0, imageItem)
            else:
                images_array.append(imageItem)
        colorItem['images'] = images_array
        yield colorItem
    item['colors'] = colors

    dimensions = [dimensionIdToNameJson[dimension]
                  for dimension in dimension_dic]
    sizes = dict((dimension_name, []) for dimension_name in dimensions)
    if not dimensions:
        dimensions = ['size']
    if not sizes:
        sizes = {'size': ['onesize']}
    item['dimensions'] = dimensions

    # Handle SKU stock.
    skuCollectionsList = []
    for sku_stock in stock_dic:
        color = sku_stock['color']
        if color not in color_price_dic:
            continue
        skuItem = SkuItem()
        skuItem['type'] = 'sku'
        skuItem['from_site'] = self.name
        skuItem['show_product_id'] = item['show_product_id']
        skuItem['id'] = sku_stock['id']
        skuItem["list_price"] = color_price_dic[color]['wasInt']
        skuItem['current_price'] = color_price_dic[color]['nowInt']
        skuItem['color'] = colorNames[color]

        size_by_dimension = {}
        for dimension in dimension_dic:
            if (dimension in sku_stock and
                    sku_stock[dimension] in valueIdToNameJSON):
                dimension_name = dimensionIdToNameJson[dimension]
                size_value = valueIdToNameJSON[sku_stock[dimension]]['value']
                size_by_dimension[dimension_name] = size_value
                if size_value not in sizes[dimension_name]:
                    sizes[dimension_name].append(size_value)
        if not size_by_dimension:
            size_by_dimension = {'size': 'onesize'}
        skuItem['size'] = size_by_dimension
        skuItem['quantity'] = sku_stock['onHand']
        skuItem['is_outof_stock'] = False
        skuCollectionsList.append(skuItem)

    item['skus'] = skuCollectionsList
    item['sizes'] = sizes
    item = self.handle_dimension_to_name(response, item,
                                         dimensionIdToNameJson)
    yield item
def parse_skus(self, response):
    """Build the SKUs of ``response.meta['item']`` from a JSON response.

    The response body is JSON whose
    ``ProductSizeAndColor.productSizeAndColorJSON`` value is itself a JSON
    string: a list of entries, each with a ``skus`` list.  One ``SkuItem`` is
    appended to ``item['skus']`` per SKU.  Unless ``meta['yielded_coloritems']``
    is truthy, one ``Color`` item per distinct colour name is yielded, all
    sharing ``meta['images']``.  Finally ``item`` is yielded with its
    ``sizes`` and ``colors`` filled in.

    Nothing is yielded when the URL contains ``errorpage``; the method also
    stops without yielding ``item`` when colour items are due but
    ``meta['images']`` is empty.

    :param response: the JSON response, with ``item``, ``images`` and
        ``yielded_coloritems`` in ``response.meta``.
    :raises NameError: when a colour value contains ``?`` in a form other
        than the ``?...?false`` suffix that is stripped.
    """
    import logging

    item = response.meta['item']
    images = response.meta['images']
    yielded_coloritems = response.meta['yielded_coloritems']
    if 'errorpage' in response.url:
        return

    body_json = json.loads(response.body)
    detail_json = json.loads(
        body_json['ProductSizeAndColor']['productSizeAndColorJSON'])

    color_names = []
    sizes = []
    item['skus'] = []
    for skus_detail in detail_json:
        for sku_detail in skus_detail['skus']:
            if 'color' in sku_detail:
                raw_color = sku_detail['color']
                if re.findall(r'\?.+\?false', raw_color):
                    # Strip as well, so "Red ?1?false" and "Red" end up
                    # as the same colour name.
                    color_name = re.sub(r'\?.+\?false', '',
                                        raw_color).strip()
                elif '?' in raw_color:
                    raise NameError('colorname error ' + raw_color + ' ' +
                                    item['url'])
                else:
                    color_name = raw_color.strip()
            else:
                color_name = 'One Color'
            if color_name not in color_names:
                color_names.append(color_name)

            if 'size' in sku_detail and sku_detail['size']:
                size = sku_detail['size']
            else:
                size = 'One Size'
            if size not in sizes:
                sizes.append(size)

            skuItem = SkuItem()
            skuItem['type'] = 'sku'
            skuItem['show_product_id'] = item['show_product_id']
            skuItem['list_price'] = item['list_price']
            skuItem['current_price'] = item['current_price']
            skuItem['color'] = color_name
            skuItem['size'] = size
            skuItem['id'] = sku_detail['sku']
            skuItem['from_site'] = item['from_site']
            # Unknown statuses are only reported; the SKU is still treated
            # as available.
            skuItem['is_outof_stock'] = False
            if sku_detail['status'] not in ('In Stock', 'InStock'):
                logging.getLogger(__name__).debug('stock status: %s',
                                                  sku_detail['status'])
            item['skus'].append(skuItem)

    if not yielded_coloritems:
        for color_name in color_names:
            if not images:
                # A colour without images cannot get a cover; the product
                # is dropped as a whole.
                return
            colorItem = Color()
            colorItem['images'] = images
            colorItem['type'] = 'color'
            colorItem['from_site'] = item['from_site']
            colorItem['show_product_id'] = item['show_product_id']
            colorItem['name'] = color_name
            colorItem['cover'] = images[0]['thumbnail']
            yield colorItem

    item['sizes'] = sizes
    item['colors'] = color_names
    yield item