def parse_designer(self, response):
    """Create the designer item for a designer page and request its products.

    Reads the designer id from the ``designerid`` URL parameter and the
    designer name from the page, registers a ``DesignerInfo`` for it in
    ``self.designer_info_dict``, collects the product links found on the
    page, and hands over to ``self.start_request_product_detail_page``,
    whose result is returned.
    """
    page_url = response.url
    self.logger.info('Hi, this is designer page! %s', page_url)

    designer = CeremonyDesignerItem()
    uid = skutils.retrieve_url_param(page_url, 'designerid')
    raw_name = response.xpath('//div[@class="productName"]/a/text()').extract_first()
    if raw_name:
        clean_name = raw_name.strip()
    else:
        clean_name = ""

    # The description is simply a copy of the (stripped) designer name.
    initial_fields = (
        ('uid', uid),
        ('name', clean_name),
        ('url', page_url),
        ('desc', clean_name),
        ('product_detail_urls', []),
        ('products', []),
        ('file_urls', []),
    )
    for field_name, field_value in initial_fields:
        designer[field_name] = field_value

    designer_info = DesignerInfo(uid, designer)
    self.designer_info_dict[uid] = designer_info

    # Parse the product list.
    hrefs = response.xpath('//div[@class="productThumb"]/a/@href').extract()
    product_uris = []
    products_by_uri = {}
    for href in hrefs:
        # Drop the first character of each href (presumably a leading
        # slash -- confirm against the site markup).
        uri = href[1:]
        product_uris.append(uri)
        products_by_uri[uri] = {
            'uri': uri,
            'uid': skutils.retrieve_url_param(uri, 'productid'),
        }

    designer['product_detail_urls'] = product_uris
    designer_info.products.update(products_by_uri)
    return self.start_request_product_detail_page(response, designer_info)
def parse_product_detail(self, response):
    """Parse a product detail page and attach the product to its designer.

    ``response.meta`` must carry ``uid`` (the key of the designer in
    ``self.designer_info_dict``) and ``detail_url`` (stored as the
    product's ``uri``).  The parsed product is appended to the designer's
    ``products`` list and its image URLs are added to the designer's
    ``file_urls``.

    Page fragments that are absent (price text, original price, the size
    item's class attribute, the small-font description line) are tolerated
    instead of raising: the price falls back to ``""``, the other values
    to ``None`` or are simply omitted.

    Returns the result of
    ``self.try_return_designer_if_last_product_detail_page`` for the
    designer uid.

    Raises:
        KeyError: if ``uid``/``detail_url`` is missing from
            ``response.meta`` or the designer was never registered.
    """
    designer_uid = response.meta['uid']
    designer = self.designer_info_dict[designer_uid].designer
    detail_url = response.meta['detail_url']
    self.logger.info(
        'parse product detail[%s] response, response status: %d',
        detail_url, response.status,
    )

    product = CeremonyProductItem()
    uid = skutils.retrieve_url_param(response.url, 'productid')

    product_nodes = response.xpath('//div[@class="product_right_info"]')
    name = product_nodes.xpath('span[@class="pname"]/text()').extract_first()

    # extract_first() yields None when the node is missing; fall back to an
    # empty string so .strip() cannot raise AttributeError.
    price = (product_nodes.xpath('div[@class="productprice"]/text()').extract_first() or '').strip()
    original_price = None
    if price == '$':
        # The bare text is only the currency sign: the actual price is in
        # the second <span> and the original price in the first one
        # (presumably a discounted-price layout -- confirm against the site).
        price = product_nodes.xpath('div[@class="productprice"]/span[2]/text()').extract_first()
        listed_price = product_nodes.xpath('div[@class="productprice"]/span[1]/text()').extract_first()
        if listed_price is not None:
            original_price = '$ ' + listed_price

    # NOTE(review): the '//'-prefixed XPaths below are evaluated against the
    # whole document, not relative to the selector they are called on.  They
    # are kept unchanged so the matched nodes stay the same; switch to './/'
    # only after checking the page structure.
    size_lis = product_nodes.xpath('//ul[@class="ul_SizesColors"]/li')

    # Raw string: '\w' in a normal string literal is an invalid escape.
    size_class_re = re.compile(r'^li_(\w+) li')

    # Group the (attr_name, attr_value) pairs of all <li> items by the
    # product id they carry.
    sizes_by_product_id = {}
    for li in size_lis:
        class_match = size_class_re.search(li.xpath('@class').extract_first() or '')
        attr_name = class_match.group(1) if class_match else None
        attr_value = li.xpath('@title').extract_first()
        product_id = li.xpath('span[@class="productid"]/text()').extract_first()
        entry = sizes_by_product_id.setdefault(
            product_id, {'product_id': product_id, 'attrs': []})
        entry['attrs'].append((attr_name, attr_value))
    # Materialise as a list: a dict view is not a plain sequence on Python 3.
    size_info = list(sizes_by_product_id.values())

    desc_node = product_nodes.xpath('//div[@class="plproducttab plproductdetails"]')
    desc = '\n'.join(desc_node.xpath('text()').extract()).strip()
    small_print = desc_node.xpath('//span[@class="smallfont"]/text()').extract_first()
    if small_print is not None:
        desc += '\n' + small_print.strip()

    design_size = '\n'.join(
        product_nodes.xpath('//div[@class="plproducttab plproductdescription"]/p/text()').extract())
    img_url = response.xpath('//div[@class="pili"]/img/@src').extract()

    product['uri'] = detail_url
    product['name'] = name.strip() if name else ""
    product['price'] = price
    product['original_price'] = original_price
    product['size_info'] = size_info
    product['desc'] = desc
    product['design_size'] = design_size
    # Removing 'menu_' from each image URL (presumably turns a thumbnail
    # URL into the full-size one -- confirm).
    product['img_url'] = [x.replace('menu_', '') for x in img_url]
    product['uid'] = uid

    designer['file_urls'].extend(product['img_url'])  # for download
    designer['products'].append(product)
    return self.try_return_designer_if_last_product_detail_page(designer_uid)