def parse_detail_data(self, response: HtmlResponse):
    """Yield one ``HMProductItem`` per style/colour of a product detail page.

    Page-wide fields are read from ``self.get_detail_data(response)`` and
    per-colour fields (prices, images, sizes) from
    ``self.get_color_data(code, response)``.

    Args:
        response: The product detail page.

    Yields:
        One item per key of the mapping returned by
        ``self.get_other_style(response)``.  Nothing is yielded when that
        mapping is empty, or when the URL carries neither a ``ProductID``
        nor a ``productid`` query parameter.
    """
    params = self.get_url_params(response.url)
    detail_data = self.get_detail_data(response)

    # Each style/colour becomes its own product.
    styles = self.get_other_style(response)
    if not styles:
        return

    # The product id parameter appears under either spelling.
    group_codes = params.get('ProductID') or params.get('productid')
    if not group_codes:
        # Without this guard the lookup below was ``None[0]`` (TypeError).
        # NOTE(review): assumes ``self.logger`` exists (Scrapy spider).
        self.logger.warning('No product id found in url: %s', response.url)
        return
    group_code = group_codes[0]

    # Loop-invariant: encode the page once instead of once per colour.
    html = response.text.encode('utf8', 'ignore')

    for code in styles:
        color_data = self.get_color_data(code, response)
        p = HMProductItem(html=html, source_url=response.url)
        p['code'] = code
        p['name'] = detail_data['name']
        p['group_code'] = group_code
        p['category'] = detail_data['category']
        p['price'] = float(color_data['price'])
        p['white_price'] = float(color_data['white_price'])
        p['img_urls'] = color_data['imgs']
        p['size_select'] = color_data['sizes']
        p['size_valid'] = color_data['ava_sizes']
        p['desc'] = detail_data['desc']
        p['detail'] = detail_data['detail']
        p['delivery'] = detail_data['delivery']
        p['tags'] = detail_data['tags']
        # Reuse the mapping computed above rather than re-parsing the page.
        # NOTE(review): all items now share one mapping object — confirm no
        # pipeline mutates it per item.
        p['other_style'] = styles
        p['raw_products'] = {}
        p['gender'] = self.get_gender(p['name'], p['tags'])
        yield p
def parse_detail_data(self, response: HtmlResponse):
    """Build a single ``HMProductItem`` from a product detail page.

    The page is read through the ``detail-list`` blocks inside the
    ``detail-row-r`` container, addressed by position:

    * ``[0]`` group code (text after the last ``:``)
    * ``[1]`` description
    * ``[2]`` composition
    * ``[3]`` prices (``h1`` sale price, ``h3`` / ``h3/del`` original price)
    * ``[4]`` other styles
    * ``[5]`` sizes

    Args:
        response: The product detail page.

    Returns:
        The populated ``HMProductItem``.
    """
    items = response.xpath(
        '//div[@class="detail-row-r"]/div[@class="detail-list"]')
    p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                      source_url=response.url)

    cate = response.xpath('/html/head/title').extract_first()
    p['category'] = {'name': self.get_category(cate), 'href': ''}
    p['name'] = items.xpath('h5/text()').extract_first().strip()
    p['code'] = re.findall(r'product/detail/(\S+).html', response.url)[0]
    p['group_code'] = items[0].xpath('p/text()').extract_first().split(':')[-1]
    p['desc'] = items[1].xpath('p/text()').extract_first()
    p['detail'] = {
        'composition': items[2].xpath('p/text()').extract_first(),
        'detailed': '',
    }
    p['raw_products'] = {}

    price_block = items[3]
    price = price_block.xpath('h1/text()').extract_first()
    if price:
        # On sale: the struck-through <del> holds the original price.  Fall
        # back to the sale price when <del> is missing, instead of handing
        # ``None`` to ``re.search``.
        white_price = (
            price_block.xpath('h3/del/text()').extract_first() or price)
    else:
        # Not on sale: <h3> holds the only price.
        white_price = price_block.xpath('h3/text()').extract_first()
        price = white_price

    # Keep the fractional part: a bare ``\d+`` turned "199.50" into 199.
    number = re.compile(r'\d+(?:\.\d+)?')
    p['price'] = float(number.search(price).group())
    p['white_price'] = float(number.search(white_price).group())

    p['img_urls'] = self.get_img_urls(response)
    p['other_style'] = self.get_other_style(items[4])
    # The size lookup needs the colour code of the style being viewed.
    p['size_select'], p['size_valid'] = self.get_size_data(
        items[5], p['other_style'][p['code']]['color_code'])
    p['gender'] = self.get_gender(p['name'])
    return p
def parse_detail_data(self, response: HtmlResponse):
    """Yield one ``HMProductItem`` per on-shelf colour of a JSON payload.

    The response body is parsed as JSON and its ``data`` member is used.
    Colours whose ``status`` is ``'OutShelf'`` are skipped.  After the last
    colour has been yielded, ``self.log_record_after`` is called with the
    response URL.

    Args:
        response: Response whose text is the product JSON document.

    Yields:
        One item per remaining colour; nothing when ``data`` is missing or
        empty.
    """
    payload = json.loads(str(response.text)).get('data', None)
    if not payload:
        return

    on_shelf = (
        entry for entry in payload['color'] if entry['status'] != 'OutShelf'
    )
    for variant in on_shelf:
        detail_url = (
            "https://www.only.cn/goodsDetails.html?design="
            + payload['projectCode']
        )
        item = HMProductItem(source_url=detail_url)
        item['code'] = self.code_prefix + variant['colorCode']
        item['name'] = payload['goodsName']
        item['raw_products'] = payload
        item['group_code'] = payload['projectCode']
        # The ONLY site sells women's apparel only.
        item['category'] = {'name': self.WOMAN, 'href': ''}
        item['price'] = float(variant['price'])
        item['white_price'] = float(variant['originalPrice'])
        item['img_urls'] = self.get_img_urls(variant)
        selectable, available = self.get_sizes(
            variant, payload['projectCode'])
        item['size_select'] = selectable
        item['size_valid'] = available
        item['desc'] = payload['describe']
        item['detail'] = {'composition': '', 'detailed': payload['goodsInfo']}
        item['delivery'] = ''
        item['tags'] = self.get_tags(variant)
        item['other_style'] = self.get_other_style(payload['color'])
        # The ONLY site sells women's apparel only.
        item['gender'] = 'female'
        yield item

    self.log_record_after(response.url)
def parse_detail_data(self, response: HtmlResponse, code: str):
    """Build one ``HMProductItem`` per colour of a product.

    SPU, stock, price and image data are fetched through the
    ``self.get_*_json`` helpers.  SPU rows are grouped by colour number and
    each group becomes one item whose code is ``"<productCode>-<colorNo>"``.

    Args:
        response: The product detail page (supplies ``html``, ``source_url``
            and the category lookup).
        code: Code used to request the SPU JSON.  It is then replaced by the
            ``productCode`` reported in the SPU summary for all later
            lookups.

    Returns:
        A list of populated items, one per colour.
    """
    data = self.get_spu_json(code)
    summary = data.get('summary', {})
    code = summary.get('productCode')

    # Group SKU rows by colour; the first row of each colour also supplies
    # that colour's swatch entry.
    rows = {}
    other_style = {}
    for row in data.get('rows', []):
        let_code = f"{code}-{row.get('colorNo')}"
        if let_code not in rows:
            rows[let_code] = []
            img = (f'https://www.uniqlo.cn/hmall/test/{code}/sku/40/'
                   f'{row["colorNo"]}.jpg')
            other_style[let_code] = {
                'color': row.get('style'),
                'img': img,
                'color_code': row.get('colorNo'),
            }
        rows[let_code].append(row)

    stock = self.get_stock_json(response.url, code)
    price_data = self.get_price_json(code)
    img_data = self.get_img_json(code)
    price_dict = {row['productId']: row for row in price_data['rows']}

    # Everything below is identical for every colour, so compute it once.
    html = response.text.encode('utf8', 'ignore')
    # ``or 0`` also covers an explicit null in the payload.
    white_price = float(summary.get('originPrice') or 0)
    category = {'name': self.get_category(response.url), 'href': ''}
    instruction = data.get('desc', {}).get('instruction', '') or ''
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns).  'lxml' is the one bs4 prefers when available.
    desc = BeautifulSoup(instruction, 'lxml').get_text()

    ps = []  # result set
    for let_code, row in rows.items():
        product_id = row[0]['productId']
        p = HMProductItem(
            html=html,
            source_url=response.url,
            name=summary.get('name'),
            code=let_code,
            group_code=code,
            raw_products=data,
            white_price=white_price,
            other_style=other_style,
            # Fall back to the original price when no price row exists.
            price=float(
                price_dict.get(product_id, {'price': white_price})['price']))
        tags = [summary.get('gDeptValue'), row[0].get('style')]
        p['tags'] = [{'name': tag, 'href': ''} for tag in tags if tag]
        p['category'] = dict(category)
        p['gender'] = self.get_gender(summary.get('name'), p['tags'])
        p['img_urls'] = self.get_images(img_data, row)
        p['size_select'], p['size_valid'] = self.get_size_data(row, stock)
        p['desc'] = desc
        p['detail'] = {'composition': '', 'detailed': instruction}
        ps.append(p)

    self.log_record_after(response.url)
    return ps
def parse_detail_data(self, response: HtmlResponse):
    """Build a single ``HMProductItem`` from a product detail page.

    Most fields are read from ``<input>`` elements (``itemTitle``,
    ``itemCode``, ``itemStyle``, ``listPrice``, ``salePrice``) inside the
    ``row float-clearfix`` container.  Tags are the whitespace-separated
    words of the ``goods-tit`` title block, and an empty list when that
    block is absent.

    Args:
        response: The product detail page.

    Returns:
        The populated ``HMProductItem``.
    """
    inputs = response.xpath('//div[@class="row float-clearfix"]')
    p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                      source_url=response.url)

    cate = response.xpath('/html/head/title').extract_first()
    p['category'] = {'name': self.get_category_by_url(cate), 'href': ''}

    # Always set ``tags``: it used to be assigned only when the title block
    # existed, which made the ``get_gender`` call below fail on other pages.
    title_select = response.xpath('//div[@class="pdp-title none-sm"]')
    title_text = title_select.xpath(
        'div[@class="goods-tit"]/text()').extract_first(default='')
    p['tags'] = [{'name': tag, 'href': ''} for tag in title_text.split()]

    p['name'] = inputs.xpath('input[@id="itemTitle"]/@value').extract_first()
    p['gender'] = self.get_gender(p['name'], p['tags'])
    p['code'] = inputs.xpath('input[@id="itemCode"]/@value').extract_first()
    p['group_code'] = inputs.xpath(
        'input[@id="itemStyle"]/@value').extract_first()
    p['other_style'] = {
        li.attrib['code']: {
            'color': '',
            'color_code': li.attrib['itemstyle'],
            'img': li.xpath('a/img/@src').extract_first(),
        }
        for li in response.xpath('//ul[@id="itemColor"]/li')
    }
    p['raw_products'] = {}
    p['img_urls'] = self.get_images(response)
    p['size_select'], p['size_valid'] = self.get_size_data(response)
    p['white_price'] = float(
        inputs.xpath('input[@id="listPrice"]/@value').extract_first())
    p['price'] = float(
        inputs.xpath('input[@id="salePrice"]/@value').extract_first())
    p['desc'] = response.xpath(
        '//div[@class="large-box1"]/div/div[@class="float-left"]/p/text()'
    ).extract_first()
    p['detail'] = {
        'composition': '',
        'detailed': response.xpath('//div[@class="large-box1"]').extract_first(),
    }
    return p
def parse_detail_data(self, response: HtmlResponse):
    """Build a single ``HMProductItem`` from a product detail page.

    Tags are the breadcrumb link texts followed by the words of the page
    ``<title>`` (``-``, ``|`` and ``CN`` are skipped).  Product data comes
    from ``self.parse_product_data(response)``: a mapping holding an
    ``articleCode`` entry plus one entry per article code.

    Args:
        response: The product detail page.

    Returns:
        The populated ``HMProductItem``, or ``None`` when no product data
        could be parsed.
    """
    tags = [
        {'name': crumb, 'href': ''}
        for crumb in response.xpath(
            "//a[@itemprop='item']/span/text()").extract()
    ]
    title = response.xpath("//title/text()").extract_first(default='')
    tags.extend(
        {'name': word, 'href': ''}
        for word in title.split()
        if word not in ('-', '|', 'CN')
    )

    # Store only <main> when present.  ``.get()`` returns None rather than
    # raising when nothing matches, so the fallback to the whole page has to
    # be explicit.
    try:
        html = response.xpath('//main').get()
    except Exception as e:
        self.logger.exception(e)
        html = None
    if not html:
        html = response.text

    p = HMProductItem(html=html, source_url=response.url)
    p['tags'] = tags
    p['category'] = {'name': self.get_category(response.url), 'href': ''}
    p['name'] = response.xpath(
        "//h1[@class='primary product-item-headline']/text()"
    ).extract_first().strip()
    p['gender'] = self.get_gender(p['name'], p['tags'])

    data = self.parse_product_data(response)
    if not data:
        return
    p['raw_products'] = data
    p['other_style'] = self.get_other_style(data)

    code = data['articleCode']
    article = data[code]
    p['code'] = code
    # The group code is the article code without its last three characters.
    p['group_code'] = code[:-3]
    p['img_urls'] = article['images']
    p['size_select'] = article['sizes']
    p['size_valid'] = self.hm_request_size_valid(response.url, code)

    white_price = article.get('whitePriceValue', '0')
    p['white_price'] = float(white_price)
    # No red (sale) price means the item sells at the white price.
    p['price'] = float(article.get('redPriceValue', white_price))
    p['desc'] = article.get('description', '')
    p['detail'] = {
        'composition': article.get('composition'),
        'detailed': article.get('detailedDescriptions'),
    }
    return p
def parse_detail_data(self, response: HtmlResponse):
    """Build one ``HMProductItem`` per product in the embedded Redux state.

    The page embeds ``window.INITIAL_REDUX_STATE=<json>;</script>``; the
    products are read from ``Threads.products`` of that JSON.

    Args:
        response: The product detail page.

    Returns:
        A list of populated items (possibly empty), or ``None`` when the
        state blob is absent or is not valid JSON.
    """
    data = re.findall(r'window.INITIAL_REDUX_STATE=(.*?);</script>',
                      response.text, re.S)
    if not data or not data[0]:
        # Nothing to parse; returning here keeps ``product_data`` from
        # being referenced while unbound.
        return

    try:
        product_data = json.loads(data[0])
    except json.decoder.JSONDecodeError:
        self.log_record_after(response.url, info=data[0], error='json')
        return
    self.log_record_after(response.url)

    products = product_data.get('Threads', {}).get('products', {})
    # Swatch info for every colour, shared by all items of this page.
    other_style = {
        k: {
            'color': v.get('colorDescription'),
            'img': v.get('firstImageUrl'),
            'color_code': k.split('-')[-1],
        }
        for k, v in products.items()
    }

    # Loop-invariant values.
    html = response.text.encode('utf8', 'ignore')
    category_name = self.get_category(response.url)

    items = []
    for k, v in products.items():
        title = v.get('fullTitle', '')
        p = HMProductItem(html=html, source_url=response.url)
        p['name'] = title
        p['code'] = k
        p['group_code'] = v.get('styleCode', '')
        p['raw_products'] = v
        p['other_style'] = other_style
        p['category'] = {'name': category_name, 'href': ''}
        p['gender'] = self.get_gender(title, [], response.url)
        p['img_urls'] = self.get_images(v['nodes'][0]['nodes'])
        p['size_select'] = self.get_size_data(v['skus'])
        p['size_valid'] = self.get_valid_size_data(v['availableSkus'])
        white_price = v.get('fullPrice', 0)
        p['white_price'] = float(white_price)
        # No current price means the item sells at full price.
        p['price'] = float(v.get('currentPrice', white_price))
        p['desc'] = v.get('descriptionPreview', '')
        p['detail'] = {'composition': '', 'detailed': v['description']}
        items.append(p)
    return items
def parse_item(self, response):
    """Build a single ``HMProductItem`` from an item page.

    ``code`` and ``group_code`` both take the first value returned by
    ``self.get_code`` for the URL, ``price`` and ``white_price`` both take
    the first value ``self.get_price`` extracts from the price text, and
    ``size_select`` / ``size_valid`` both come from ``self.get_size``.

    Args:
        response: The item page.

    Returns:
        The populated ``HMProductItem``.
    """
    right_column = "//div[contains(@class,'product-right-con')]"

    self.logger.info('Hi, this is an item page! %s', response.url)
    item = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                         source_url=response.url)

    item['name'] = response.xpath('/html/head/title/text()').extract_first()
    item['gender'] = self.get_gender(item['name'])
    item['code'] = self.get_code(response.url)[0]
    item['group_code'] = self.get_code(response.url)[0]

    price_text = response.xpath(
        right_column + "/div[@class='product-price-pdp']/span/text()"
    ).extract_first()
    item['price'] = float(self.get_price(price_text)[0])
    item['white_price'] = float(self.get_price(price_text)[0])

    item['raw_products'] = {}
    item['size_select'] = self.get_size(response)
    item['size_valid'] = self.get_size(response)

    # ``extract()`` keeps every matching paragraph, so ``desc`` is a list.
    item['desc'] = response.xpath(
        "/html/body/div[1]/div[2]/div[1]/div[2]/div/div[8]/div[2]/div/div/ul/li/p/text()"
    ).extract()
    selection_box = response.xpath(
        "//div[@class='product-selection-box']").extract_first()
    item['detail'] = {'composition': '', 'detailed': selection_box}

    # The third breadcrumb link text is passed to the category lookup.
    crumb = response.xpath(
        '//div[@class ="bread-crumbs"]/a[3]/text()').extract_first()
    item['category'] = {
        'name': self.get_category_by_url(crumb),
        'href': '',
    }
    item['img_urls'] = self.get_images(response)

    # One entry per colour swatch image; each is keyed by this item's code.
    swatches = response.xpath(
        right_column + "/div[@class='product-color']/ul/li/a/span/img")
    item['other_style'] = [
        {
            item['code']: {
                'color': swatch.attrib['title'],
                'color_code': swatch.attrib['title'],
                'img': swatch.attrib['src'],
            }
        }
        for swatch in swatches
    ]
    return item
def parse_item(self, response):
    """Build a single ``HMProductItem`` from an item page.

    ``code`` and ``group_code`` both come from the ``skuCode`` element's
    ``value``; ``price`` and ``white_price`` both come from the same price
    text; ``size_select`` and ``size_valid`` are separate dicts built from
    the first ``<option>`` of the size ``<select>``.

    Args:
        response: The item page.

    Returns:
        The populated ``HMProductItem``.
    """
    heading_xpath = "//div[@class = 'content']/div/div[1]/h1/span/text()"
    sku_xpath = '//*[@id="skuCode"]/@value'
    option_xpath = (
        "//div[@class = 'content']/div/div/div[2]/div/div/select/option")

    def first(xpath):
        # First match of ``xpath`` on the page, or None.
        return response.xpath(xpath).extract_first()

    def size_entry():
        # A fresh dict per call, so the two size fields never share state.
        return {
            "name": first(option_xpath + "/@status"),
            "sizeCode": first(option_xpath + "/@skusize"),
            "dispalysize": "",
        }

    self.logger.info('Hi, your data is my data! %s', response.url)
    item = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                         source_url=response.url)

    item['name'] = first(heading_xpath)
    item['gender'] = self.get_gender(item['name'])
    item['code'] = first(sku_xpath)
    item['group_code'] = first(sku_xpath)

    price_text = first(
        "//div[@class = 'content']/div/div[1]/div/span[2]/text()")
    item['price'] = float(self.get_price(price_text)[0])
    item['white_price'] = float(self.get_price(price_text)[0])
    item['raw_products'] = {}

    # One entry per style image; each is keyed by this item's code.
    style_images = response.xpath(
        "//div[@class ='content']/div[3]/div/a/img")
    item['other_style'] = [
        {
            item['code']: {
                'color': image.attrib['title'],
                'color_code': image.attrib['title'],
                # ``src`` gets the scheme prefixed.
                'img': 'https:' + image.attrib['src'],
            }
        }
        for image in style_images
    ]

    item['size_select'] = size_entry()
    item['size_valid'] = size_entry()

    item['desc'] = first("//div[@class='content']/div/div/li/text()")
    item['detail'] = {
        'composition': '',
        'detailed': first("//div[@class='content']/div"),
    }

    # The category lookup is fed the same heading text used for the name.
    heading = first(heading_xpath)
    item['category'] = {
        'name': self.get_category_by_url(heading),
        'href': '',
    }
    item['img_urls'] = self.get_images(response)
    return item
def parse_detail_data(self, response: HtmlResponse):
    """Build a single ``HMProductItem`` from a product detail page.

    Gender is fixed to ``'female'`` and the category to ``self.WOMAN``.
    ``price`` is the offer price and ``white_price`` the list price.

    Args:
        response: The product detail page.

    Returns:
        The populated ``HMProductItem``.
    """
    self.logger.info('Hi, your data is my data! %s', response.url)
    p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                      source_url=response.url)

    # Drop every space, newline, tab and carriage return from the heading.
    raw_name = response.xpath(
        "//div[@class='container']/div/h2/text()").extract_first()
    p['name'] = raw_name.translate(
        str.maketrans('', '', ' \n\t\r')).strip()
    p['gender'] = 'female'
    p['code'] = self.get_code(response.url)[0]
    p['group_code'] = response.xpath(
        "//div[@class = 'col-md-3 col-sm-3']/ul/li/div[2]/div/text()"
    ).extract_first()

    # ``price`` is the selling price and ``white_price`` the undiscounted
    # one; the two attributes were previously assigned the other way round.
    p['price'] = float(response.xpath(
        "//div[@class='col-md-3 col-sm-3']/ul/li/h2/@data-offer-price"
    ).extract_first())
    p['white_price'] = float(response.xpath(
        "//div[@class='col-md-3 col-sm-3']/ul/li/h2/@data-list-price"
    ).extract_first())
    p['raw_products'] = {}

    # Swatch images appear both wrapped in a link and bare.
    linked_imgs = response.xpath(
        "//div[@class ='col-md-3 col-sm-3']/ul/li/div/div/a/img")
    plain_imgs = response.xpath(
        "//div[@class ='col-md-3 col-sm-3']/ul/li/div/div/img")
    # Iterate the images themselves.  ``for sel in styles, styless`` looped
    # over the two result lists, so at most the first image of each was
    # recorded and an empty list raised KeyError.
    p['other_style'] = [
        {
            p['code']: {
                'color': img.attrib.get('title', ''),
                'color_code': img.attrib.get('title', ''),
                'img': img.attrib.get('src', ''),
            }
        }
        for img in [*linked_imgs, *plain_imgs]
    ]

    size_name_xpath = (
        "//div[@class='col-md-3 col-sm-3']/ul/li[3]/div[2]/div[1]/a/text()")
    p['size_select'] = {
        "name": response.xpath(size_name_xpath).extract_first(),
        "sizeCode": "",
        "dispalysize": "",
    }
    p['size_valid'] = {
        "name": response.xpath(size_name_xpath).extract_first(),
        "sizeCode": "",
        "dispalysize": "",
    }

    p['desc'] = response.xpath(
        "//div[@class = 'row productdesc']/div/dl[1]/dd/text()"
    ).extract_first()
    p['detail'] = {
        'composition': '',
        'detailed': response.xpath(
            "//div[@class='col-md-6 col-sm-6 col-xs-12']").extract_first(),
    }
    p['category'] = {'name': self.WOMAN, 'href': ''}
    p['img_urls'] = self.get_images(response)
    # NOTE(review): an unused ``a = self.get_size(response)`` call was
    # removed here; restore it if ``get_size`` has needed side effects.
    return p
def parse_detail_data(self, response: HtmlResponse):
    """Yield one ``HMProductItem`` per colour from the page's ``spConfig``.

    The page is searched for ``var spConfig = new Product.Config(<json>);``
    and the captured JSON is parsed.  Colours are derived from it through
    ``self.get_colors``.  After the last colour has been yielded,
    ``self.log_record_after`` is called with the response URL.

    Args:
        response: The product detail page.

    Yields:
        One item per colour.  Nothing is yielded when the config is absent
        or cannot be parsed; a parse failure is logged and recorded.
    """
    matches = re.findall(
        r'var spConfig = new Product.Config\((.*?)\);\n', response.text)
    if not matches:
        return

    try:
        config = json.loads(matches[0])
    except Exception as err:
        self.logger.exception(err)
        self.log_record_after(response.url, error=err)
        return

    variants = self.get_colors(config)
    category_name = self.get_category(response.url)

    # Each distinct colour becomes its own product.
    for variant in variants:
        item = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                             source_url=response.url)
        item['code'] = (
            self.code_prefix + config['productId'] + '-' + variant['id'])
        item['name'] = config['productName']
        item['raw_products'] = config
        item['group_code'] = config['productId']
        item['category'] = {'name': category_name, 'href': ''}
        item['price'] = float(variant['price'])
        item['white_price'] = float(variant['white_price'])
        item['img_urls'] = variant['img_urls']
        item['size_select'] = variant['sizes']
        item['size_valid'] = variant['ava_sizes']
        # NOTE(review): 'desc' is not populated here (its assignment was
        # commented out in the original).
        item['detail'] = {
            'composition': '',
            'detailed': config['shortDescription'],
        }
        item['delivery'] = ''
        item['tags'] = self.get_tags(response)
        item['other_style'] = self.get_other_style(
            variants, config['productId'])
        item['gender'] = self.get_gender(item['name'], item['tags'])
        yield item

    self.log_record_after(response.url)
def parse_detail_data(self, response: HtmlResponse):
    """Yield one ``HMProductItem`` per colour of a JSON product response.

    The product id is the first ``id`` query parameter of the URL.  The body
    is parsed as JSON and the product is read from its ``data`` member.
    Prices are product-level, so every colour gets the same ``salePrice``
    and ``price``.

    Args:
        response: Response whose text is the product JSON document.

    Yields:
        One item per entry of ``data['colors']``.  Nothing is yielded when
        the body is not valid JSON (the failure is logged and recorded) or
        when ``data`` is missing or empty.
    """
    params = self.get_url_params(response.url)
    pid = params['id'][0]
    c_name = self.get_category(response.url)

    try:
        rs = json.loads(response.text)
    except ValueError as e:
        # ``json.loads`` raises ``JSONDecodeError`` (a ``ValueError``).  The
        # previous ``except RuntimeError`` never matched, and its handler
        # fell through to an unbound ``rs``.
        self.log_record_after(response.url, error=e)
        self.logger.error(e)
        return

    data = rs.get('data') if isinstance(rs, dict) else None
    if not data:
        return

    # Each colour of the product becomes its own product.
    for color in data['colors']:
        source_url = (
            f"https://www.gap.cn/category/{data['rootCategoryId']}"
            f"/product/{pid}.html?tag_category={c_name}"
        )
        p = HMProductItem(source_url=source_url)
        p['code'] = f"{self.code_prefix}{pid}-{color['colorsId']}"
        p['name'] = data['productName']
        p['raw_products'] = data
        p['group_code'] = pid
        p['category'] = {'name': c_name, 'href': ''}
        p['price'] = float(data['salePrice'])
        p['white_price'] = float(data['price'])
        p['img_urls'] = self.get_img_urls(data['imageList'])
        p['size_select'], p['size_valid'] = self.get_sizes(color['size'])
        p['desc'] = data['productDetail']['productDesc']
        p['detail'] = {
            'composition': '',
            'detailed': data['productDetail']['productFiber'],
        }
        p['delivery'] = data['productDetail']['passMessage']
        p['tags'] = self.get_tags(data)
        p['other_style'] = self.get_other_style(data['colors'], str(pid))
        p['gender'] = self.get_gender(p['name'], p['tags'])
        yield p
def parse_detail_data(self, response: HtmlResponse):
    """Build a single ``HMProductItem`` from a product detail page.

    The page script contains ``var goods = ...``.  That text is captured up
    to a literal marker comment, has its single quotes normalised to double
    quotes, and is then passed to ``self.get_detail_data`` (whose
    ``.content`` is decoded as UTF-8 JSON) and to ``self.get_v`` for
    individual values.

    Args:
        response: The product detail page.

    Returns:
        The populated ``HMProductItem``, or ``None`` when the goods script
        is missing or the detail data cannot be loaded.  Both failures are
        logged and recorded.
    """
    data = re.findall(r'var goods = (.*?)//评价修改', response.text, re.S)
    if not data:
        # Indexing ``data[0]`` on a page without the goods script raised
        # IndexError; report it and stop instead.
        self.logger.error('goods data not found: %s', response.url)
        self.log_record_after(response.url, error='goods data not found')
        return
    s = data[0].strip("'").strip().replace("'", '"')

    try:
        detail_data = json.loads(
            str(self.get_detail_data(s).content, 'utf-8'))
    except Exception as e:
        self.logger.exception(e)
        self.log_record_after(response.url, error=e)
        # Stop here: continuing referenced ``detail_data`` while unbound.
        return

    p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                      source_url=response.url)
    # Codes are prefixed with "ln-" to avoid duplicates.
    p['code'] = 'ln-' + self.get_v('postID', s)
    p['name'] = self.get_v('goodsName', s)
    p['raw_products'] = detail_data
    p['group_code'] = self.get_v("product_mainID", s)
    p['category'] = {'name': self.get_category(response.url), 'href': ""}
    p['price'] = float(self.get_v("price", s))
    p['img_urls'] = self.get_pics(response)
    p['size_select'], p['size_valid'] = self.get_sizes(detail_data)
    p['white_price'] = float(self.get_v("marketPrice", s))
    p['desc'] = response.xpath(
        "//div[@id='PD_desc_basic']/pre[@class='PD_desc']/span[1]/text()"
    ).get()
    p['detail'] = {
        'composition': '',
        'detailed': response.xpath(
            "//div[@id='PD_desc_basic']/pre[@class='PD_desc']").get(),
    }
    p['tags'] = self.get_tags(detail_data)
    # ``get_other_style`` returns a pair; only the first element is kept.
    p['other_style'], _ = self.get_other_style(response)
    p['gender'] = self.get_gender(p['name'], p['tags'])
    self.log_record_after(response.url)
    return p