def crawl_collection(self):
    """Fetch the item's favorite (collection) count, and for taobao items
    also the browse count.

    Tmall items query the public tbcdn counter service directly; taobao
    items use the counterApi URL embedded in the crawled page data.
    On any parse failure the counts default to 0 (taobao branch) or are
    left untouched with a warning (tmall branch).
    """
    if self.is_tmall:
        c_url = "http://count.tbcdn.cn/counter3?callback=jsonp126&keys=ICCP_1_" + self.num_id
        collectionData = self.crawl_page(c_url)
        if collectionData:
            self.collection = int(
                get_num_val(collectionData, "ICCP_1_" + self.num_id))
        else:
            logger.warn("Can not parse tmall item collection %s", self.item_id)
    else:
        counterApi = get_val(self.data, "counterApi")
        if counterApi:
            # page data escapes "/" as "\/"; reuse the value already fetched
            counterApi = counterApi.replace(r'''\/''', "/")
            counterData = self.crawl_page(
                counterApi + "&callback=DT.mods.SKU.CountCenter.saveCounts")
            try:
                self.collection = int(
                    get_num_val(counterData, 'ICCP_1_' + self.num_id))
                self.browse = int(
                    get_num_val(counterData, 'ICVT_7_' + self.num_id))
            except Exception:
                # counter payload missing or malformed -> best-effort zero
                self.collection = 0
                self.browse = 0
def crawl_volume(self):
    """Fetch the item's sales volume (and, for taobao items, the
    confirmed-goods count).

    Tmall: downloads the initApi JSON and reads
    defaultModel.sellCountDO.sellCount, falling back to a raw text scan
    for "sellCount" when the JSON path fails.
    Taobao: downloads apiItemInfo and reads the (misspelled) 'quanity'
    and 'confirmGoods' keys. Volume defaults to 0 on failure.
    """
    if self.is_tmall:
        # page data escapes "/" as "\/"
        apiItemInfoUrl = get_val(self.data, "initApi").replace(r'''\/''', "/")
        self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
        try:
            self.tmallInitApijson = loads(
                self.tmallInitApi.decode('gb18030').encode('utf8'))
        except Exception:
            logger.info("parse tmall api json failed %s : %s",
                        self.item_id, traceback.format_exc())
        if self.tmallInitApijson:
            try:
                self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
            except Exception:
                logger.warn("try to get volume from api failed %s", self.item_id)
        if self.volume < 0:
            # JSON path failed -> fall back to scanning the raw response
            try:
                self.volume = int(get_val(self.tmallInitApi, "sellCount"))
            except Exception:
                self.volume = 0
                logger.warn("Can not parse tmall item volume %s", self.item_id)
    else:
        apiItemInfoVal = get_val(self.data, "apiItemInfo")
        if apiItemInfoVal:
            # reuse the value already fetched instead of a second get_val call
            apiItemInfoUrl = apiItemInfoVal.replace(r'''\/''', "/")
            itemInfoData = self.crawl_page(apiItemInfoUrl)
            try:
                # 'quanity' is the actual (misspelled) key in the api payload
                self.volume = int(get_num_val(itemInfoData, 'quanity'))
                self.confirmVolume = int(get_num_val(itemInfoData, 'confirmGoods'))
            except Exception:
                self.volume = 0
                logger.warn("Can not parse taobao item volume %s", self.item_id)
        else:
            self.volume = 0
def crawl_collection(self):
    """Fetch the item's favorite (collection) count, and for taobao items
    also the browse count.

    Tmall items query the public tbcdn counter service directly; taobao
    items use the counterApi URL embedded in the crawled page data.
    On any parse failure the counts default to 0 (taobao branch) or are
    left untouched with a warning (tmall branch).
    """
    if self.is_tmall:
        c_url = "http://count.tbcdn.cn/counter3?callback=jsonp126&keys=ICCP_1_" + self.num_id
        collectionData = self.crawl_page(c_url)
        if collectionData:
            self.collection = int(
                get_num_val(collectionData, "ICCP_1_" + self.num_id))
        else:
            logger.warn("Can not parse tmall item collection %s", self.item_id)
    else:
        counterApi = get_val(self.data, "counterApi")
        if counterApi:
            # page data escapes "/" as "\/"; reuse the value already fetched
            counterApi = counterApi.replace(r'''\/''', "/")
            counterData = self.crawl_page(
                counterApi + "&callback=DT.mods.SKU.CountCenter.saveCounts")
            try:
                self.collection = int(
                    get_num_val(counterData, 'ICCP_1_' + self.num_id))
                self.browse = int(
                    get_num_val(counterData, 'ICVT_7_' + self.num_id))
            except Exception:
                # counter payload missing or malformed -> best-effort zero
                self.collection = 0
                self.browse = 0
def crawl_volume(self):
    """Fetch the item's sales volume (and, for taobao items, the
    confirmed-goods count).

    Tmall: downloads the initApi JSON and reads
    defaultModel.sellCountDO.sellCount, falling back to a raw text scan
    for "sellCount" when the JSON path fails.
    Taobao: downloads apiItemInfo and reads the (misspelled) 'quanity'
    and 'confirmGoods' keys. Volume defaults to 0 on failure.
    """
    if self.is_tmall:
        # page data escapes "/" as "\/"
        apiItemInfoUrl = get_val(self.data, "initApi").replace(r'''\/''', "/")
        self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
        try:
            self.tmallInitApijson = loads(
                self.tmallInitApi.decode('gb18030').encode('utf8'))
        except Exception:
            logger.info("parse tmall api json failed %s : %s",
                        self.item_id, traceback.format_exc())
        if self.tmallInitApijson:
            try:
                self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
            except Exception:
                logger.warn("try to get volume from api failed %s", self.item_id)
        if self.volume < 0:
            # JSON path failed -> fall back to scanning the raw response
            try:
                self.volume = int(get_val(self.tmallInitApi, "sellCount"))
            except Exception:
                self.volume = 0
                logger.warn("Can not parse tmall item volume %s", self.item_id)
    else:
        apiItemInfoVal = get_val(self.data, "apiItemInfo")
        if apiItemInfoVal:
            # reuse the value already fetched instead of a second get_val call
            apiItemInfoUrl = apiItemInfoVal.replace(r'''\/''', "/")
            itemInfoData = self.crawl_page(apiItemInfoUrl)
            try:
                # 'quanity' is the actual (misspelled) key in the api payload
                self.volume = int(get_num_val(itemInfoData, 'quanity'))
                self.confirmVolume = int(get_num_val(itemInfoData, 'confirmGoods'))
            except Exception:
                self.volume = 0
                logger.warn("Can not parse taobao item volume %s", self.item_id)
        else:
            self.volume = 0
def crawl_stock(self):
    """Fetch the item's remaining stock.

    Tmall reads "icTotalQuantity" from the previously downloaded initApi
    response; taobao reads "stock" from the dynamic stock data. Failures
    are logged and self.stock is left unchanged.
    """
    if self.is_tmall:
        try:
            self.stock = int(get_num_val(self.tmallInitApi, "icTotalQuantity"))
        except Exception:
            logger.warn("Can not parse tmall item stock %s", self.item_id)
    else:
        try:
            if self.dynamicStockData:
                self.stock = int(get_val(self.dynamicStockData, "stock").strip())
            else:
                logger.warn("Can not parse taobao item stock %s", self.item_id)
        except Exception:
            # fixed message: this branch handles taobao items, not tmall
            logger.error("Can not parse taobao item stock %s", self.item_id)
def crawl_stock(self):
    """Fetch the item's remaining stock.

    Tmall reads "icTotalQuantity" from the previously downloaded initApi
    response; taobao reads "stock" from the dynamic stock data. Failures
    are logged and self.stock is left unchanged.
    """
    if self.is_tmall:
        try:
            self.stock = int(get_num_val(self.tmallInitApi, "icTotalQuantity"))
        except Exception:
            logger.warn("Can not parse tmall item stock %s", self.item_id)
    else:
        try:
            if self.dynamicStockData:
                self.stock = int(get_val(self.dynamicStockData, "stock").strip())
            else:
                logger.warn("Can not parse taobao item stock %s", self.item_id)
        except Exception:
            # fixed message: this branch handles taobao items, not tmall
            logger.error("Can not parse taobao item stock %s", self.item_id)
def crawl_price(self):
    """Determine the item's effective price, trying sources in order:

    1. tmall initApi JSON (real promotion price preferred);
    2. promotion data url (apiPromoData) or the marketing promotion list;
    3. the tb-price element on the page;
    4. a price regex over the dynamic stock data;
    5. the umpStock JSON endpoint;
    6. finally the listed origin/bid price.

    Sets self.price (float). Raises if no source yields a price
    (float('') -> ValueError), which callers treat as a crawl failure.
    """
    self.bidPrice = self.html_obj.xpath(
        "//input[@name='current_price']/@value")
    self.originPrice = self.html_obj.xpath(
        "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
    if not self.originPrice:
        self.originPrice = self.html_obj.xpath(
            "//strong[@class='J_originalPrice']/text()")
    self.promoteUrl2 = get_val(self.data, "apiPromoData")
    if self.promoteUrl2:
        self.promoteUrl2 = self.promoteUrl2.replace(r'''\/''', "/")
    price = ""
    if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
        try:
            priceInfo = self.tmallInitApijson['defaultModel']['itemPriceResultDO']['priceInfo']
            if priceInfo:
                if 'def' in priceInfo:
                    defaultPriceInfo = priceInfo['def']
                else:
                    defaultPriceInfo = priceInfo[list(priceInfo)[0]]
                # 2013-11-22: use the real promotion price, not the
                # commission-deducted price
                if 'promotionList' in defaultPriceInfo and defaultPriceInfo['promotionList']:
                    price = defaultPriceInfo['promotionList'][0]['price']
                if not price:
                    if 'price' in defaultPriceInfo:
                        price = defaultPriceInfo['price']
                if not price:
                    if 'promPrice' in defaultPriceInfo:
                        price = defaultPriceInfo['promPrice']['price']
                    elif 'promotionList' in defaultPriceInfo and defaultPriceInfo['promotionList']:
                        # cheapest promotion wins; missing prices sort last
                        price = str(min([
                            float(x.get('price', '100000000.0'))
                            for x in defaultPriceInfo['promotionList']]))
        except Exception:
            logger.warn("Parse tmall json price failed, %s", self.item_id)
    if not price:
        if self.promoteUrl2:
            # response html-escapes double quotes
            self.promoteContent = self.crawl_page(
                self.promoteUrl2).replace('&quot;', '"')
            tag = "low:"
            if self.promoteContent.find(tag) > 0:
                pos = self.promoteContent.find(tag) + len(tag)
                pos2 = self.promoteContent.find(',', pos)
                price = self.promoteContent[pos:pos2]
            if not price:
                price = get_num_val(self.promoteContent, 'price')
        else:
            self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
            self.promoteContent = self.crawl_page(self.promoteUrl)
            if self.promoteContent:
                self.promoteContent = self.promoteContent.replace('&quot;', '"')
                tag = '"promPrice":"'
                if self.promoteContent.find(tag) > 0:
                    pos = self.promoteContent.find(tag) + len(tag)
                    pos2 = self.promoteContent.find('"', pos)
                    price = self.promoteContent[pos:pos2]
    if not price:
        tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
        tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
        # take the element only when it actually has non-blank text
        if tbPrice and tbPrice[0].strip():
            price = tbPrice[0].strip()
        elif tbPrice1 and tbPrice1[0].strip():
            price = tbPrice1[0].strip()
        if price.find("-") > 0:
            # price ranges like "10.00-20.00" -> take the lower bound
            price = price.split('-')[0].strip()
    if not price:
        rg_m = re.compile(r'price:\"[0-9]+[.][0-9]+\"',
                          re.IGNORECASE | re.DOTALL).search(self.dynamicStockData)
        if rg_m:
            price_str = rg_m.group(0).split(":")[1].replace("\"", "")
            price = Decimal(price_str)
    # 2013-09-03 get price url
    if not price:
        # a bit fiddly: the endpoint returns plain text, so extract by regex
        price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
        response = download(price_url, self.headers)
        rg = re.compile(r'price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE | re.DOTALL)
        m = rg.search(response.decode('gb18030').encode('utf8'))
        if m:
            price_str = m.group(0).split(":")[1].replace("\"", "")
            price = Decimal(price_str)
    # no promotion price found -> fall back to the listed price
    if not price:
        if self.originPrice:
            price = self.originPrice[0].strip()
        elif self.bidPrice:
            price = self.bidPrice[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    self.price = float(price)
    logger.debug("%s price is %s", self.item_id, self.price)
def crawl(self):
    """Download and parse the item page: follow the taobao->tmall redirect
    stub, detect offline/resale items, extract prices, thumbnails,
    category id, and sales volume.

    Any unexpected exception is logged with a crawlItemException tag and
    re-raised so the caller can mark the crawl as failed.
    """
    try:
        self.data = self.crawl_page(self.url)
        if FLAGS.debug_parser:
            import pdb
            pdb.set_trace()
        # check tmall: taobao serves a tiny JS redirect stub for tmall items
        if not self.is_tmall and len(self.data) < 256 and self.url.find(
                'item.taobao.com') > 0 and self.data.find(
                "window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0:
            self.data = self.crawl_page(
                self.url.replace('item.taobao.com', 'detail.tmall.com'))
        if self.check_offline():
            self.is_offline = True
        self.html_obj = parse_html(self.data, encoding="gb18030")
        title = self.html_obj.xpath("//html/head/title/text()")
        # a "resale" (转卖) title means the original listing is gone
        if title and title[0].find(u"转卖") > 0:
            self.is_offline = True
        self.detailDiv = self.html_obj.xpath("//div[@id='detail']")
        self.buyButton = self.html_obj.xpath("//a[@id='J_LinkBuy']")
        self.originPrice = self.html_obj.xpath(
            "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
        if not self.originPrice:
            self.originPrice = self.html_obj.xpath(
                "//strong[@class='J_originalPrice']/text()")
        #self.bidPrice = self.html_obj.xpath("//li[contains(concat(' ',normalize-space(@class),' '),' detail-price ')]/strong/text()")
        self.bidPrice = self.html_obj.xpath(
            "//input[@name='current_price']/@value")
        self.thumbImages = self.html_obj.xpath(
            "//ul[@id='J_UlThumb']//img/@src")
        if not len(self.thumbImages):
            try:
                # try load thumb images for tmall page (urls live in @style)
                self.thumbImages = [
                    IMAGESTYLE_RE.subn(r'\g<1>', x)[0]
                    for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                # taobao @src to @data-src
                if not len(self.thumbImages):
                    self.thumbImages = self.html_obj.xpath(
                        "//ul[@id='J_UlThumb']//img/@data-src")
            except Exception:
                logger.warn("No thumbs found %s", self.item_id)
        tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
        tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
        if not self.is_tmall and tmalllogo:
            self.is_tmall = True
        if self.is_tmall:
            self.cid = get_val(self.data, "categoryId").split('&')[0]
            apiItemInfoUrl = get_val(self.data, "initApi").replace(r'''\/''', "/")
            self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
            try:
                self.tmallInitApijson = loads(
                    self.tmallInitApi.decode('gb18030').encode('utf8'))
            except Exception:
                logger.info("parse tmall api json failed %s : %s",
                            self.item_id, traceback.format_exc())
            if self.tmallInitApijson:
                try:
                    self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
                except Exception:
                    logger.warn("try to get volume from api failed %s", self.item_id)
            if self.volume < 0:
                try:
                    self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                except Exception:
                    logger.warn("Can not parse item volume %s", self.item_id)
            # stock would come from icTotalQuantity (handled elsewhere)
            # dead code kept for reference:
            #reviewInfoUrl = get_val(self.data, "apiMallReviews").replace(r'''\/''', "/")
            #reviewInfoData = self.crawl_page(reviewInfoUrl)
            #m = RATECOUNT_RE.match(reviewInfoData)
            #if m:
            #    self.reviewCount = m.group(1)
            #else:
            #    self.reviewCount = None
        else:
            self.cid = get_val(self.data, "cid")
            apiItemInfoVal = get_val(self.data, "apiItemInfo")
            if apiItemInfoVal:
                # reuse the fetched value instead of a second get_val call
                apiItemInfoUrl = apiItemInfoVal.replace(r'''\/''', "/")
                itemInfoData = self.crawl_page(apiItemInfoUrl)
                try:
                    # 'quanity' is the actual (misspelled) api key
                    self.volume = int(get_num_val(itemInfoData, 'quanity'))
                except Exception:
                    self.volume = -1
            else:
                self.volume = -1
            #interval = get_val(data2, 'interval')
            # stock data (NOTE(review): skudata looks unused here; kept
            # because get_val may raise on malformed pages — confirm)
            skudata = get_val(self.data, 'valItemInfo').replace(r'''\/''', "/")
            # dead code kept for reference:
            #reviewInfoUrl = get_val(self.data, "data-commonApi").replace(r'''\/''', "/")
            #reviewInfoData = self.crawl_page(reviewInfoUrl)
            #self.reviewCount = get_val(reviewInfoData, 'total')
    except Exception:
        # boundary: tag, log, and re-raise for the caller to handle
        logger.error("crawling %s unknown exception %s",
                     self.item_id, traceback.format_exc(),
                     extra={'tags': ['crawlItemException', ]})
        raise
def crawl(self):
    """Download and parse the item page: follow the taobao->tmall redirect
    stub, detect offline/resale items, extract prices, thumbnails,
    category id, and sales volume.

    Any unexpected exception is logged with a crawlItemException tag and
    re-raised so the caller can mark the crawl as failed.
    """
    try:
        self.data = self.crawl_page(self.url)
        if FLAGS.debug_parser:
            import pdb
            pdb.set_trace()
        # check tmall: taobao serves a tiny JS redirect stub for tmall items
        if not self.is_tmall and len(self.data) < 256 and self.url.find(
                'item.taobao.com') > 0 and self.data.find(
                "window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0:
            self.data = self.crawl_page(
                self.url.replace('item.taobao.com', 'detail.tmall.com'))
        if self.check_offline():
            self.is_offline = True
        self.html_obj = parse_html(self.data, encoding="gb18030")
        title = self.html_obj.xpath("//html/head/title/text()")
        # a "resale" (转卖) title means the original listing is gone
        if title and title[0].find(u"转卖") > 0:
            self.is_offline = True
        self.detailDiv = self.html_obj.xpath("//div[@id='detail']")
        self.buyButton = self.html_obj.xpath("//a[@id='J_LinkBuy']")
        self.originPrice = self.html_obj.xpath(
            "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
        if not self.originPrice:
            self.originPrice = self.html_obj.xpath(
                "//strong[@class='J_originalPrice']/text()")
        #self.bidPrice = self.html_obj.xpath("//li[contains(concat(' ',normalize-space(@class),' '),' detail-price ')]/strong/text()")
        self.bidPrice = self.html_obj.xpath(
            "//input[@name='current_price']/@value")
        self.thumbImages = self.html_obj.xpath(
            "//ul[@id='J_UlThumb']//img/@src")
        if not len(self.thumbImages):
            try:
                # try load thumb images for tmall page (urls live in @style)
                self.thumbImages = [
                    IMAGESTYLE_RE.subn(r'\g<1>', x)[0]
                    for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                # taobao @src to @data-src
                if not len(self.thumbImages):
                    self.thumbImages = self.html_obj.xpath(
                        "//ul[@id='J_UlThumb']//img/@data-src")
            except Exception:
                logger.warn("No thumbs found %s", self.item_id)
        tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
        tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
        if not self.is_tmall and tmalllogo:
            self.is_tmall = True
        if self.is_tmall:
            self.cid = get_val(self.data, "categoryId").split('&')[0]
            apiItemInfoUrl = get_val(self.data, "initApi").replace(r'''\/''', "/")
            self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
            try:
                self.tmallInitApijson = loads(
                    self.tmallInitApi.decode('gb18030').encode('utf8'))
            except Exception:
                logger.info("parse tmall api json failed %s : %s",
                            self.item_id, traceback.format_exc())
            if self.tmallInitApijson:
                try:
                    self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
                except Exception:
                    logger.warn("try to get volume from api failed %s", self.item_id)
            if self.volume < 0:
                try:
                    self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                except Exception:
                    logger.warn("Can not parse item volume %s", self.item_id)
            # stock would come from icTotalQuantity (handled elsewhere)
            # dead code kept for reference:
            #reviewInfoUrl = get_val(self.data, "apiMallReviews").replace(r'''\/''', "/")
            #reviewInfoData = self.crawl_page(reviewInfoUrl)
            #m = RATECOUNT_RE.match(reviewInfoData)
            #if m:
            #    self.reviewCount = m.group(1)
            #else:
            #    self.reviewCount = None
        else:
            self.cid = get_val(self.data, "cid")
            apiItemInfoVal = get_val(self.data, "apiItemInfo")
            if apiItemInfoVal:
                # reuse the fetched value instead of a second get_val call
                apiItemInfoUrl = apiItemInfoVal.replace(r'''\/''', "/")
                itemInfoData = self.crawl_page(apiItemInfoUrl)
                try:
                    # 'quanity' is the actual (misspelled) api key
                    self.volume = int(get_num_val(itemInfoData, 'quanity'))
                except Exception:
                    self.volume = -1
            else:
                self.volume = -1
            #interval = get_val(data2, 'interval')
            # stock data (NOTE(review): skudata looks unused here; kept
            # because get_val may raise on malformed pages — confirm)
            skudata = get_val(self.data, 'valItemInfo').replace(r'''\/''', "/")
            # dead code kept for reference:
            #reviewInfoUrl = get_val(self.data, "data-commonApi").replace(r'''\/''', "/")
            #reviewInfoData = self.crawl_page(reviewInfoUrl)
            #self.reviewCount = get_val(reviewInfoData, 'total')
    except Exception:
        # boundary: tag, log, and re-raise for the caller to handle
        logger.error("crawling %s unknown exception %s",
                     self.item_id, traceback.format_exc(),
                     extra={'tags': ['crawlItemException', ]})
        raise
def crawl_price(self):
    """Determine the item's effective price, trying sources in order:

    1. tmall initApi JSON (promPrice, promotion list minimum, or price);
    2. promotion data url (apiPromoData) or the marketing promotion list;
    3. the tb-price element on the page;
    4. the umpStock JSON endpoint;
    5. finally the listed origin/bid price.

    Sets self.price (float). Raises if no source yields a price
    (float('') -> ValueError), which callers treat as a crawl failure.
    """
    self.promoteUrl2 = get_val(self.data, "apiPromoData")
    if self.promoteUrl2:
        self.promoteUrl2 = self.promoteUrl2.replace(r'''\/''', "/")
    price = ""
    if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
        try:
            priceInfo = self.tmallInitApijson['defaultModel']['itemPriceResultDO']['priceInfo']
            if priceInfo:
                if 'def' in priceInfo:
                    defaultPriceInfo = priceInfo['def']
                else:
                    defaultPriceInfo = priceInfo[list(priceInfo)[0]]
                if 'promPrice' in defaultPriceInfo:
                    price = defaultPriceInfo['promPrice']['price']
                elif 'promotionList' in defaultPriceInfo and defaultPriceInfo['promotionList']:
                    # cheapest promotion wins; missing prices sort last
                    price = str(min([
                        float(x.get('price', '100000000.0'))
                        for x in defaultPriceInfo['promotionList']]))
                else:
                    price = defaultPriceInfo['price']
        except Exception:
            logger.warn("Parse tmall json price failed, %s", self.item_id)
    if not price:
        if self.promoteUrl2:
            # response html-escapes double quotes
            self.promoteContent = self.crawl_page(
                self.promoteUrl2).replace('&quot;', '"')
            tag = "low:"
            if self.promoteContent.find(tag) > 0:
                pos = self.promoteContent.find(tag) + len(tag)
                pos2 = self.promoteContent.find(',', pos)
                price = self.promoteContent[pos:pos2]
            if not price:
                price = get_num_val(self.promoteContent, 'price')
        else:
            self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
            self.promoteContent = self.crawl_page(
                self.promoteUrl).replace('&quot;', '"')
            tag = '"promPrice":"'
            if self.promoteContent.find(tag) > 0:
                pos = self.promoteContent.find(tag) + len(tag)
                pos2 = self.promoteContent.find('"', pos)
                price = self.promoteContent[pos:pos2]
    if not price:
        tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
        tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
        # take the element only when it actually has non-blank text
        if tbPrice and tbPrice[0].strip():
            price = tbPrice[0].strip()
        elif tbPrice1 and tbPrice1[0].strip():
            price = tbPrice1[0].strip()
        if price.find("-") > 0:
            # price ranges like "10.00-20.00" -> take the lower bound
            price = price.split('-')[0].strip()
    # 2013-09-03 get price url
    if not price:
        # a bit fiddly: the endpoint returns plain text, so extract by regex
        price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
        response = download(price_url, self.headers)
        rg = re.compile(r'price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE | re.DOTALL)
        m = rg.search(response.decode('gb18030').encode('utf8'))
        if m:
            price_str = m.group(0).split(":")[1].replace("\"", "")
            price = Decimal(price_str)
    # no promotion price found -> fall back to the listed price
    if not price:
        if self.originPrice:
            price = self.originPrice[0].strip()
        elif self.bidPrice:
            price = self.bidPrice[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    self.price = float(price)
    logger.debug("%s price is %s", self.item_id, self.price)