예제 #1
0
 def add_product(self, item: HMProductItem, spider: scrapy.Spider):
     """Store a scraped product with its brand, category and tags.

     An item whose ``code`` is already present in ``Product`` is only
     logged and handed back unchanged. Otherwise the brand is looked up
     from the spider name, the ``category`` payload (when the item has
     one) is replaced by the object obtained through
     ``Category.objects.get_or_create``, the item is saved, and each entry
     of ``tags`` is attached to the saved product.

     :param item: scraped product; ``item['code']`` must be set.
     :param spider: spider that produced the item; its ``name`` and
         ``logger`` are used.
     :return: the ``item`` that was passed in.
     """
     already_stored = Product.objects.filter(code=item['code']).exists()
     if already_stored:
         spider.logger.info(f'exist product: {item["code"]}')
         return item

     item['brand'] = self.get_brand(spider.name)
     if 'category' in item:
         category_obj, _created = Category.objects.get_or_create(
             **item['category'])
         item['category'] = category_obj

     saved_product = item.save()
     for tag_kwargs in item.get('tags', []):
         tag_obj, _created = Tag.objects.get_or_create(**tag_kwargs)
         saved_product.tags.add(tag_obj)
     return item
예제 #2
0
 def parse_detail_data(self, response: HtmlResponse):
     """Yield one ``HMProductItem`` per style/colour of a detail page.

     Nothing is yielded when ``get_other_style`` finds no styles.

     :param response: product detail page.
     :raises TypeError: when the URL has neither a ``ProductID`` nor a
         ``productid`` parameter (there is then no group code to index).
     """
     params = self.get_url_params(response.url)
     detail_data = self.get_detail_data(response)
     # One product is generated per style/colour.
     styles = self.get_other_style(response)
     if not styles:
         return

     # These values are the same for every colour of the page, so they
     # are resolved once here rather than on each loop iteration.
     group_ids = params.get('ProductID') or params.get('productid')
     group_code = group_ids[0]
     html = response.text.encode('utf8', 'ignore')

     for code in styles:
         p = HMProductItem(html=html, source_url=response.url)
         color_data = self.get_color_data(code, response)
         p['code'] = code
         p['name'] = detail_data['name']
         p['group_code'] = group_code
         p['category'] = detail_data['category']
         p["price"] = float(color_data['price'])
         p['white_price'] = float(color_data['white_price'])
         p['img_urls'] = color_data['imgs']
         p['size_select'] = color_data['sizes']
         p['size_valid'] = color_data['ava_sizes']
         p['desc'] = detail_data['desc']
         p['detail'] = detail_data['detail']
         p['delivery'] = detail_data['delivery']
         p['tags'] = detail_data['tags']
         # Reuse the styles already extracted above instead of parsing
         # the page again for every colour.
         p['other_style'] = styles
         p['raw_products'] = {}
         p['gender'] = self.get_gender(p['name'], p['tags'])
         yield p
예제 #3
0
 def parse_detail_data(self, response: HtmlResponse):
     """Yield an ``HMProductItem`` for each colour that is not off the shelf.

     The response body is decoded as JSON. When its ``data`` entry is
     missing or empty the generator ends immediately; otherwise, once
     every colour has been handled, the URL is handed to
     ``log_record_after``.

     :param response: response whose text is the product JSON document.
     """
     payload = json.loads(str(response.text))
     data = payload.get('data')
     if not data:
         return

     listed_colors = (c for c in data['color'] if c['status'] != 'OutShelf')
     for color in listed_colors:
         project_code = data['projectCode']
         source_url = ("https://www.only.cn/goodsDetails.html?design="
                       + project_code)
         p = HMProductItem(source_url=source_url)
         p['code'] = self.code_prefix + color['colorCode']
         p['name'] = data['goodsName']
         p['raw_products'] = data
         p['group_code'] = project_code
         # The ONLY site sells women's clothing only.
         p['category'] = {'name': self.WOMAN, 'href': ''}
         p["price"] = float(color['price'])
         p['white_price'] = float(color['originalPrice'])
         p['img_urls'] = self.get_img_urls(color)
         size_select, size_valid = self.get_sizes(color, project_code)
         p['size_select'] = size_select
         p['size_valid'] = size_valid
         p['desc'] = data['describe']
         p['detail'] = {'composition': '', 'detailed': data['goodsInfo']}
         p['delivery'] = ''
         p['tags'] = self.get_tags(color)
         p['other_style'] = self.get_other_style(data['color'])
         # The ONLY site sells women's clothing only.
         p['gender'] = 'female'
         yield p

     self.log_record_after(response.url)
예제 #4
0
    def parse_detail_data(self, response: HtmlResponse):
        """Build a single ``HMProductItem`` from a product detail page.

        The ``detail-list`` blocks under ``detail-row-r`` are read by
        position: 0 group code, 1 description, 2 composition, 3 prices,
        4 other styles, 5 sizes. When the price block has no ``h1`` text,
        its ``h3`` text is used for both ``price`` and ``white_price``.

        :param response: product detail page.
        :return: the populated item.
        """
        def leading_number(text):
            # First run of digits found in ``text``, as a float.
            return float(re.search(r'\d+', text).group())

        items = response.xpath(
            '//div[@class="detail-row-r"]/div[@class="detail-list"]')
        p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                          source_url=response.url)

        page_title = response.xpath('/html/head/title').extract_first()
        p['category'] = {'name': self.get_category(page_title), 'href': ''}

        name_text = items.xpath('h5/text()').extract_first()
        p['name'] = name_text.strip()
        p['code'] = re.findall(r'product/detail/(\S+).html', response.url)[0]

        group_text = items[0].xpath('p/text()').extract_first()
        p['group_code'] = group_text.split(':')[-1]
        p['desc'] = items[1].xpath('p/text()').extract_first()
        composition = items[2].xpath('p/text()').extract_first()
        p['detail'] = {'composition': composition, 'detailed': ''}
        p['raw_products'] = {}

        # With a sale price in <h1>, the original price sits in <h3><del>;
        # otherwise the plain <h3> text serves as both prices.
        price_block = items[3]
        sale_text = price_block.xpath('h1/text()').extract_first()
        list_xpath = 'h3/del/text()' if sale_text else 'h3/text()'
        list_text = price_block.xpath(list_xpath).extract_first()
        if not sale_text:
            sale_text = list_text
        p['price'] = leading_number(sale_text)
        p['white_price'] = leading_number(list_text)

        p['img_urls'] = self.get_img_urls(response)
        other_style = self.get_other_style(items[4])
        p['other_style'] = other_style
        size_block = items[5]
        color_code = other_style[p['code']]['color_code']
        size_select, size_valid = self.get_size_data(size_block, color_code)
        p['size_select'] = size_select
        p['size_valid'] = size_valid

        p['gender'] = self.get_gender(p['name'])
        return p
예제 #5
0
    def parse_detail_data(self, response: HtmlResponse, code: str):
        """Build one ``HMProductItem`` per colour variant of a product.

        SPU rows are grouped by ``<productCode>-<colorNo>`` and every group
        becomes one item. Stock, price and image data are fetched through
        the ``get_*_json`` helpers, and the URL is finally passed to
        ``log_record_after``.

        :param response: product page response.
        :param code: code handed to ``get_spu_json``; from then on the
            ``productCode`` of the returned summary is used instead.
        :return: list of items, one per colour group.
        """
        data = self.get_spu_json(code)
        summary = data.get('summary', {})
        code = summary.get('productCode')

        # Group the SKU rows by colour and describe each colour once.
        rows = {}
        other_style = {}
        for row in data.get('rows', []):
            let_code = f"{code}-{row.get('colorNo')}"
            bucket = rows.get(let_code)
            if bucket is None:
                bucket = rows[let_code] = []
                img = f'https://www.uniqlo.cn/hmall/test/{code}/sku/40/{row["colorNo"]}.jpg'
                other_style[let_code] = {
                    'color': row.get('style'),
                    'img': img,
                    'color_code': row.get('colorNo')
                }
            bucket.append(row)

        stock = self.get_stock_json(response.url, code)
        price_data = self.get_price_json(code)
        img_data = self.get_img_json(code)

        price_by_product = {}
        for price_row in price_data['rows']:
            price_by_product[price_row['productId']] = price_row

        ps = []  # result set
        for let_code, color_rows in rows.items():
            first_row = color_rows[0]
            product_id = first_row['productId']
            white_price = float(summary.get('originPrice', 0))
            # Fall back to the original price when no price row exists.
            price_row = price_by_product.get(product_id,
                                             {'price': white_price})
            p = HMProductItem(
                html=response.text.encode('utf8', 'ignore'),
                source_url=response.url,
                name=summary.get('name'),
                code=let_code,
                group_code=code,
                raw_products=data,
                white_price=white_price,
                other_style=other_style,
                price=float(price_row['price']))

            tag_names = (summary.get('gDeptValue'), first_row.get('style'))
            p['tags'] = [{'name': tag, 'href': ''}
                         for tag in tag_names if tag]
            p['category'] = {
                'name': self.get_category(response.url),
                'href': ''
            }
            p['gender'] = self.get_gender(summary.get('name'), p['tags'])
            p['img_urls'] = self.get_images(img_data, color_rows)
            size_select, size_valid = self.get_size_data(color_rows, stock)
            p['size_select'] = size_select
            p['size_valid'] = size_valid

            instruction = data.get('desc', {}).get('instruction', '')
            p['desc'] = BeautifulSoup(instruction).get_text()
            p['detail'] = {'composition': '', 'detailed': instruction}
            ps.append(p)

        self.log_record_after(response.url)
        return ps
예제 #6
0
    def parse_detail_data(self, response: HtmlResponse):
        """Build an ``HMProductItem`` from a product detail page.

        Most fields come from the ``value`` attributes of the ``<input>``
        elements inside the ``row float-clearfix`` container.

        :param response: product detail page.
        :return: the populated item, or ``None`` when the page has no
            ``pdp-title none-sm`` block.
        """
        inputs = response.xpath('//div[@class="row float-clearfix"]')

        def input_value(input_id):
            # ``value`` attribute of the input with this id, or None.
            return inputs.xpath(
                f'input[@id="{input_id}"]/@value').extract_first()

        p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                          source_url=response.url)
        title_select = response.selector.xpath(
            '//div[@class="pdp-title none-sm"]')
        page_title = response.xpath('/html/head/title').extract_first()
        p['category'] = {
            'name': self.get_category_by_url(page_title),
            'href': ''
        }
        if not title_select:
            # Without the title block nothing else is parsed and no item
            # is returned (same outcome as before, now explicit).
            return None

        title_text = title_select.xpath(
            'div[@class="goods-tit"]/text()').extract_first()
        # ``split()`` never yields empty strings, so no extra filtering.
        p['tags'] = [{'name': tag, 'href': ''}
                     for tag in title_text.strip().split()]
        p['name'] = input_value('itemTitle')
        p['gender'] = self.get_gender(p['name'], p['tags'])
        p['code'] = input_value('itemCode')
        p['group_code'] = input_value('itemStyle')
        p['other_style'] = {
            li.attrib['code']: {
                'color': '',
                'color_code': li.attrib['itemstyle'],
                'img': li.xpath('a/img/@src').extract_first()
            }
            for li in response.xpath('//ul[@id="itemColor"]/li')
        }
        p['raw_products'] = {}

        p['img_urls'] = self.get_images(response)
        p['size_select'], p['size_valid'] = self.get_size_data(response)

        p['white_price'] = float(input_value('listPrice'))
        p['price'] = float(input_value('salePrice'))

        p['desc'] = response.xpath(
            '//div[@class="large-box1"]/div/div[@class="float-left"]/p/text()'
        ).extract_first()
        p['detail'] = {
            'composition': '',
            'detailed':
            response.xpath('//div[@class="large-box1"]').extract_first()
        }
        return p
예제 #7
0
    def parse_detail_data(self, response: HtmlResponse):
        """Build an ``HMProductItem`` from a product detail page.

        Tags are collected from the links marked ``itemprop='item'`` and
        from the words of the page ``<title>``, skipping the separators
        ``-``, ``|`` and ``CN``.

        :param response: product detail page.
        :return: the populated item, or ``None`` when
            ``parse_product_data`` yields nothing.
        :raises AttributeError: when the product headline ``<h1>`` is
            absent (its text is ``None`` and cannot be stripped).
        """
        skipped_words = ('-', '|', 'CN')
        tags = [
            {'name': text, 'href': ''}
            for text in response.xpath(
                "//a[@itemprop='item']/span/text()").extract()
        ]
        # A page without <title> contributes no title tags instead of
        # failing on ``None.split()``.
        title = response.xpath("//title/text()").extract_first() or ''
        tags.extend({'name': word, 'href': ''}
                    for word in title.split()
                    if word not in skipped_words)

        # ``get()`` returns None (it does not raise) when <main> is
        # missing, so the fallback to the full page text must be explicit.
        html = response.xpath('//main').get() or response.text
        p = HMProductItem(html=html, source_url=response.url)
        p['tags'] = tags

        p['category'] = {'name': self.get_category(response.url), 'href': ''}
        p['name'] = response.xpath(
            "//h1[@class='primary product-item-headline']/text()"
        ).extract_first().strip()

        p['gender'] = self.get_gender(p['name'], p['tags'])
        data = self.parse_product_data(response)
        if not data:
            return None
        p['raw_products'] = data
        p['other_style'] = self.get_other_style(data)
        code = data['articleCode']
        article = data[code]
        p['code'] = code
        # The group code is the article code without its last 3 characters.
        p['group_code'] = code[:-3]
        p['img_urls'] = article['images']
        p['size_select'] = article['sizes']
        p['size_valid'] = self.hm_request_size_valid(response.url, code)
        white_price = article.get('whitePriceValue', '0')
        p['white_price'] = float(white_price)
        # No red (sale) price means the item sells at the white price.
        p['price'] = float(article.get('redPriceValue', white_price))
        p['desc'] = article.get('description', '')
        p['detail'] = {
            'composition': article.get('composition'),
            'detailed': article.get('detailedDescriptions')
        }
        return p
예제 #8
0
파일: nike.py 프로젝트: luoscorn/hm_spider
    def parse_detail_data(self, response: HtmlResponse):
        """Build one ``HMProductItem`` per product in the page's Redux state.

        The JSON assigned to ``window.INITIAL_REDUX_STATE`` is extracted
        from the page text; its ``Threads.products`` mapping supplies one
        item per key.

        :param response: product page response.
        :return: list of items, or ``None`` when the state is absent,
            empty, or not valid JSON (the latter is reported through
            ``log_record_after``).
        """
        matches = re.findall(r'window.INITIAL_REDUX_STATE=(.*?);</script>',
                             response.text, re.S)
        if not (matches and matches[0]):
            return None

        raw_state = matches[0]
        try:
            product_data = json.loads(raw_state)
            self.log_record_after(response.url)
        except json.decoder.JSONDecodeError:
            self.log_record_after(response.url, info=raw_state, error='json')
            return None

        products = product_data.get('Threads', {}).get('products', {})

        # Every item carries the same overview of all colour variants.
        other_style = {}
        for style_code, style in products.items():
            other_style[style_code] = {
                'color': style.get('colorDescription'),
                'img': style.get('firstImageUrl'),
                'color_code': style_code.rsplit('-', 1)[-1]
            }

        items = []
        for product_code, product in products.items():
            p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                              source_url=response.url)
            p['name'] = product.get('fullTitle', '')
            p['code'] = product_code
            p['group_code'] = product.get('styleCode', '')
            p['raw_products'] = product
            p['other_style'] = other_style
            p['category'] = {
                'name': self.get_category(response.url),
                'href': ''
            }
            p['gender'] = self.get_gender(product.get('fullTitle', ''), [],
                                          response.url)

            p['img_urls'] = self.get_images(product['nodes'][0]['nodes'])
            p['size_select'] = self.get_size_data(product['skus'])
            p['size_valid'] = self.get_valid_size_data(
                product['availableSkus'])

            white_price = product.get('fullPrice', 0)
            p['white_price'] = float(white_price)
            p['price'] = float(product.get('currentPrice', white_price))

            p['desc'] = product.get('descriptionPreview', '')
            p['detail'] = {
                'composition': '',
                'detailed': product['description']
            }
            items.append(p)
        return items
예제 #9
0
파일: ck.py 프로젝트: luoscorn/hm_spider
    def parse_item(self, response):
        """Build an ``HMProductItem`` from an item page.

        ``code``/``group_code``, ``price``/``white_price`` and
        ``size_select``/``size_valid`` are each filled from the same helper
        call, evaluated once per field.

        :param response: item page response.
        :return: the populated item.
        """
        right_col = "//div[contains(@class,'product-right-con')]"

        self.logger.info('Hi, this is an item page! %s', response.url)
        p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                          source_url=response.url)
        p['name'] = response.xpath('/html/head/title/text()').extract_first()
        p['gender'] = self.get_gender(p['name'])
        for field in ('code', 'group_code'):
            p[field] = self.get_code(response.url)[0]

        price = response.xpath(
            right_col + "/div[@class='product-price-pdp']/span/text()"
        ).extract_first()
        for field in ('price', 'white_price'):
            p[field] = float(self.get_price(price)[0])

        p['raw_products'] = {}
        for field in ('size_select', 'size_valid'):
            p[field] = self.get_size(response)

        p['desc'] = response.xpath(
            "/html/body/div[1]/div[2]/div[1]/div[2]/div/div[8]/div[2]/div/div/ul/li/p/text()"
        ).extract()
        detailed_html = response.xpath(
            "//div[@class='product-selection-box']").extract_first()
        p['detail'] = {'composition': '', 'detailed': detailed_html}

        cate = response.xpath(
            '//div[@class ="bread-crumbs"]/a[3]/text()').extract_first()
        p['category'] = {'name': self.get_category_by_url(cate), 'href': ''}
        p['img_urls'] = self.get_images(response)

        # One single-entry dict per image in the ``product-color`` list,
        # each keyed by this item's own code.
        color_imgs = response.xpath(
            right_col + "/div[@class='product-color']/ul/li/a/span/img"
        )
        p['other_style'] = [
            {
                p['code']: {
                    'color': img.attrib['title'],
                    'color_code': img.attrib['title'],
                    'img': img.attrib['src']
                }
            }
            for img in color_imgs
        ]

        return p
예제 #10
0
    def parse_item(self, response):
        """Build an ``HMProductItem`` from an item page.

        ``code``/``group_code``, ``price``/``white_price`` and
        ``size_select``/``size_valid`` are each filled from the same
        expression, evaluated once per field.

        :param response: item page response.
        :return: the populated item.
        """
        heading_xpath = "//div[@class = 'content']/div/div[1]/h1/span/text()"
        option_xpath = ("//div[@class = 'content']"
                        "/div/div/div[2]/div/div/select/option")

        self.logger.info('Hi, your data is my data! %s', response.url)
        p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                          source_url=response.url)
        p['name'] = response.xpath(heading_xpath).extract_first()
        p['gender'] = self.get_gender(p['name'])
        for field in ('code', 'group_code'):
            p[field] = response.xpath(
                '//*[@id="skuCode"]/@value').extract_first()

        price = response.xpath(
            "//div[@class = 'content']/div/div[1]/div/span[2]/text()"
        ).extract_first()
        for field in ('price', 'white_price'):
            p[field] = float(self.get_price(price)[0])
        p['raw_products'] = {}

        # One single-entry dict per style image, each keyed by this
        # item's own code.
        style_imgs = response.xpath(
            "//div[@class ='content']/div[3]/div/a/img")
        p['other_style'] = [
            {
                p['code']: {
                    'color': img.attrib['title'],
                    'color_code': img.attrib['title'],
                    'img': 'https:' + img.attrib['src']
                }
            }
            for img in style_imgs
        ]

        # Both size fields hold the first <option>'s status and size code.
        for field in ('size_select', 'size_valid'):
            p[field] = {
                "name": response.xpath(
                    option_xpath + "/@status").extract_first(),
                "sizeCode": response.xpath(
                    option_xpath + "/@skusize").extract_first(),
                "dispalysize": ""}

        p['desc'] = response.xpath(
            "//div[@class='content']/div/div/li/text()").extract_first()
        detailed_html = response.xpath(
            "//div[@class='content']/div").extract_first()
        p['detail'] = {'composition': '', 'detailed': detailed_html}

        cate = response.xpath(heading_xpath).extract_first()
        p['category'] = {'name': self.get_category_by_url(cate), 'href': ''}
        p['img_urls'] = self.get_images(response)

        return p
예제 #11
0
    def parse_detail_data(self, response: HtmlResponse):
        """Build an ``HMProductItem`` from a product detail page.

        :param response: product detail page.
        :return: the populated item.
        :raises AttributeError: when the product heading is absent (its
            text is ``None`` and cannot be stripped).
        """
        self.logger.info('Hi, your data is my data! %s', response.url)
        p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                          source_url=response.url)

        # Drop every space, newline, tab and carriage return from the name.
        name = response.xpath(
            "//div[@class='container']/div/h2/text()").extract_first().strip()
        for ch in (' ', '\n', '\t', '\r'):
            name = name.replace(ch, '')
        p['name'] = name.strip()

        p['gender'] = 'female'
        p['code'] = self.get_code(response.url)[0]
        p['group_code'] = response.xpath(
            "//div[@class = 'col-md-3 col-sm-3']/ul/li/div[2]/div/text()"
        ).extract_first()

        # The offer price is the selling price and goes into ``price``;
        # the list price is the undiscounted one and goes into
        # ``white_price`` (the two were previously swapped).
        p['price'] = float(response.xpath(
            "//div[@class='col-md-3 col-sm-3']/ul/li/h2/@data-offer-price"
        ).extract_first())
        p['white_price'] = float(response.xpath(
            "//div[@class='col-md-3 col-sm-3']/ul/li/h2/@data-list-price"
        ).extract_first())
        p['raw_products'] = {}

        linked_imgs = response.xpath(
            "//div[@class ='col-md-3 col-sm-3']/ul/li/div/div/a/img")
        plain_imgs = response.xpath(
            "//div[@class ='col-md-3 col-sm-3']/ul/li/div/div/img")
        # Walk the images of both selections; iterating the pair itself
        # would visit the two selector lists, not the images in them.
        p['other_style'] = [
            {
                p['code']: {
                    'color': img.attrib['title'],
                    'color_code': img.attrib['title'],
                    'img': img.attrib['src']
                }
            }
            for group in (linked_imgs, plain_imgs)
            for img in group
        ]

        size_name = response.xpath(
            "//div[@class='col-md-3 col-sm-3']/ul/li[3]/div[2]/div[1]/a/text()"
        ).extract_first()
        p['size_select'] = {
            "name": size_name, "sizeCode": "", "dispalysize": ""}
        p['size_valid'] = {
            "name": size_name, "sizeCode": "", "dispalysize": ""}

        p['desc'] = response.xpath(
            "//div[@class = 'row productdesc']/div/dl[1]/dd/text()"
        ).extract_first()
        p['detail'] = {'composition': '', 'detailed': response.xpath(
            "//div[@class='col-md-6 col-sm-6 col-xs-12']").extract_first()}
        p['category'] = {'name': self.WOMAN, 'href': ''}
        p['img_urls'] = self.get_images(response)

        return p
예제 #12
0
파일: canda.py 프로젝트: luoscorn/hm_spider
 def parse_detail_data(self, response: HtmlResponse):
     """Yield one ``HMProductItem`` per colour of a product page.

     The JSON passed to ``new Product.Config(...)`` in the page source is
     the data source. Nothing is yielded when that call is not found or
     its argument cannot be parsed; a parse failure is logged and reported
     through ``log_record_after``. After the last colour the URL is passed
     to ``log_record_after``.

     :param response: product page response.
     """
     config_matches = re.findall(
         r'var spConfig = new Product.Config\((.*?)\);\n', response.text)
     if not config_matches:
         return

     try:
         data = json.loads(config_matches[0])
     except Exception as e:
         self.logger.exception(e)
         self.log_record_after(response.url, error=e)
         return

     colors = self.get_colors(data)
     c_name = self.get_category(response.url)
     # Each colour becomes a product of its own.
     for color in colors:
         p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                           source_url=response.url)
         product_id = data['productId']
         p['code'] = self.code_prefix + product_id + '-' + color['id']
         p['name'] = data['productName']
         p['raw_products'] = data
         p['group_code'] = product_id
         p['category'] = {'name': c_name, 'href': ''}
         p["price"] = float(color['price'])
         p['white_price'] = float(color['white_price'])
         p['img_urls'] = color['img_urls']
         p['size_select'] = color['sizes']
         p['size_valid'] = color['ava_sizes']
         # NOTE: 'desc' is not set here; the assignment below is disabled.
         # p['desc'] = data['productDetail']['productDesc']
         p['detail'] = {
             'composition': '',
             'detailed': data['shortDescription']
         }
         p['delivery'] = ''
         p['tags'] = self.get_tags(response)
         p['other_style'] = self.get_other_style(colors, data['productId'])
         p['gender'] = self.get_gender(p['name'], p['tags'])
         yield p

     self.log_record_after(response.url)
예제 #13
0
 def parse_detail_data(self, response: HtmlResponse):
     """Yield one ``HMProductItem`` per colour of a product.

     The response body is parsed as JSON. Nothing is yielded when the body
     is not valid JSON (the failure is logged and reported through
     ``log_record_after``) or when it has no non-empty ``data`` entry.

     :param response: response whose text is the product JSON; its URL
         must carry an ``id`` parameter.
     """
     params = self.get_url_params(response.url)
     pid = params['id'][0]
     c_name = self.get_category(response.url)
     try:
         rs = json.loads(response.text)
     except ValueError as e:
         # json.JSONDecodeError is a ValueError; json.loads does not raise
         # RuntimeError. Stop here: there is nothing to build items from.
         self.log_record_after(response.url, error=e)
         self.logger.error(e)
         return

     data = rs.get('data')
     if not data:
         return

     # One product is generated for each colour.
     for color in data['colors']:
         source_url = ("https://www.gap.cn/category/"
                       + data["rootCategoryId"] + "/product/"
                       + str(pid) + ".html?tag_category=" + c_name)
         p = HMProductItem(source_url=source_url)
         p['code'] = self.code_prefix + str(pid) + '-' + str(
             color['colorsId'])
         p['name'] = data['productName']
         p['raw_products'] = data
         p['group_code'] = pid
         p['category'] = {'name': c_name, 'href': ''}
         p["price"] = float(data['salePrice'])
         p['white_price'] = float(data['price'])
         p['img_urls'] = self.get_img_urls(data['imageList'])
         p['size_select'], p['size_valid'] = self.get_sizes(color['size'])
         p['desc'] = data['productDetail']['productDesc']
         p['detail'] = {
             'composition': '',
             'detailed': data['productDetail']['productFiber']
         }
         p['delivery'] = data['productDetail']['passMessage']
         p['tags'] = self.get_tags(data)
         p['other_style'] = self.get_other_style(data['colors'], str(pid))
         p['gender'] = self.get_gender(p['name'], p['tags'])
         yield p
예제 #14
0
 def parse_detail_data(self, response: HtmlResponse):
     """Build an ``HMProductItem`` from a product detail page.

     The inline ``var goods = ...`` script supplies the basic fields (read
     through ``get_v``); the JSON fetched by ``get_detail_data`` supplies
     sizes and tags.

     :param response: product detail page.
     :return: the populated item, or ``None`` when the goods script is
         missing or the detail JSON cannot be loaded (both cases are
         reported through ``log_record_after``).
     """
     data = re.findall(r'var goods = (.*?)//评价修改', response.text, re.S)
     if not data:
         # No goods script on the page: record the failure instead of
         # failing on ``data[0]``.
         self.log_record_after(response.url, error='goods data not found')
         return None
     s = data[0].strip("'").strip().replace("'", '"')
     try:
         detail_data = json.loads(
             str(self.get_detail_data(s).content, 'utf-8'))
     except Exception as e:
         self.logger.exception(e)
         self.log_record_after(response.url, error=e)
         # Without the detail JSON the item cannot be completed; going on
         # would fail on the unbound ``detail_data``.
         return None

     p = HMProductItem(html=response.text.encode('utf8', 'ignore'),
                       source_url=response.url)
     # The code is prefixed with 'ln-' to avoid duplicates.
     p['code'] = 'ln-' + self.get_v('postID', s)
     p['name'] = self.get_v('goodsName', s)
     p['raw_products'] = detail_data
     p['group_code'] = self.get_v("product_mainID", s)
     p['category'] = {'name': self.get_category(response.url), 'href': ""}
     p["price"] = float(self.get_v("price", s))
     p['img_urls'] = self.get_pics(response)
     p['size_select'], p['size_valid'] = self.get_sizes(detail_data)
     p['white_price'] = float(self.get_v("marketPrice", s))
     p['desc'] = response.xpath(
         "//div[@id='PD_desc_basic']/pre[@class='PD_desc']/span[1]/text()"
     ).get()
     p['detail'] = {
         'composition': '',
         'detailed':
         response.xpath(
             "//div[@id='PD_desc_basic']/pre[@class='PD_desc']").get()
     }
     p['tags'] = self.get_tags(detail_data)
     p['other_style'], _ = self.get_other_style(response)
     p['gender'] = self.get_gender(p['name'], p['tags'])
     self.log_record_after(response.url)
     return p