def crawl_volume(self):
    if self.is_tmall:
        apiItemInfoUrl = get_val(self.data, "initApi").replace(r'\/', "/")
        self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
        try:
            self.tmallInitApijson = loads(self.tmallInitApi.decode('gb18030').encode('utf8'))
        except:
            logger.info("parse tmall api json failed %s : %s", self.item_id, traceback.format_exc())
        if self.tmallInitApijson:
            try:
                self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
            except:
                logger.warn("try to get volume from api failed %s", self.item_id)
        if self.volume < 0:
            # fall back to scraping sellCount out of the raw api response
            try:
                self.volume = int(get_val(self.tmallInitApi, "sellCount"))
            except:
                self.volume = 0
                logger.warn("Can not parse tmall item volume %s", self.item_id)
    else:
        apiItemInfoVal = get_val(self.data, "apiItemInfo")
        if apiItemInfoVal:
            apiItemInfoUrl = apiItemInfoVal.replace(r'\/', "/")
            itemInfoData = self.crawl_page(apiItemInfoUrl)
            try:
                # 'quanity' (sic) is the key name as served by the taobao item-info api
                self.volume = int(get_num_val(itemInfoData, 'quanity'))
                self.confirmVolume = int(get_num_val(itemInfoData, 'confirmGoods'))
            except:
                self.volume = 0
                logger.warn("Can not parse taobao item volume %s", self.item_id)
        else:
            self.volume = 0
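# crawl_volume() leans on get_val/get_num_val to pull fields out of inline
# page JS. The real helpers live elsewhere in this repo; the following is a
# minimal sketch of the assumed behavior (patterns here are illustrative,
# not the actual implementation):

import re

def get_val(data, key):
    # grab a quoted value for `key`, e.g. initApi:"http:\/\/..." or "cid":"123"
    m = re.search(r"""['"]?%s['"]?\s*[:=]\s*['"]([^'"]*)['"]""" % re.escape(key), data)
    return m.group(1) if m else None

def get_num_val(data, key):
    # grab a bare numeric value for `key`, e.g. quanity:1234
    m = re.search(r"""['"]?%s['"]?\s*[:=]\s*(\d+)""" % re.escape(key), data)
    return m.group(1) if m else None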
def crawler(sql):
    db = get_db_engine()
    items = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]
        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        # normalize the date-range separator, then split into start/end
                        st = dates.encode('utf-8').replace("--", "—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) "
                                   "values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s no discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s no discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        # timestamps come back in milliseconds
                        st = int(shop_prom['startTime']) / 1000
                        et = int(shop_prom['endTime']) / 1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) "
                                   "values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("tmall shop %s:%s no discount.", shop_id, url)
        except:
            logger.error("shop %s:%s crawl discount failed:%s", shop_id, url, traceback.format_exc())
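# The REPLACE INTO above implies a shop_discount table keyed on shop_id
# (one current discount per shop). A guessed-at DDL, for reference only --
# column names come from the statement, types and sizes are assumptions:
SHOP_DISCOUNT_DDL = """
create table if not exists shop_discount (
    shop_id          bigint not null primary key,
    content          varchar(1024),
    start_time       datetime,
    end_time         datetime,
    discount_url     varchar(1024),
    create_time      datetime,
    last_update_time datetime
)
"""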
def crawl_tmall_rate_page(self, url, page):
    if url:
        # swap the requested page number into the url
        url = CURPAGE_RE.subn(r"\g<1>%s\g<3>" % page, url)[0]
        rate1 = self.crawl_page(url)
        if rate1:
            # the endpoint returns a bare object body; wrap it so it parses
            rate1 = "{" + rate1 + "}"
            jsonobj = loads(rate1.decode('gb18030').encode('utf8'))
            return jsonobj
    return None
def crawl_taobao_rate_page(self, rateListUrlBase, page):
    if rateListUrlBase:
        rateListUrl = rateListUrlBase + '&currentPageNum=%s&rateType=&orderType=feedbackdate&showContent=1&attribute=&callback=jsonp_reviews_list' % page
        rate1 = self.crawl_page(rateListUrl)
        m = JSON_RE.match(rate1)
        if m:
            jsonobj = loads(m.group(1).decode('gb18030').encode('utf8'))
            return jsonobj
    return None
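# Both rate-page methods assume module-level regexes. Hypothetical
# definitions consistent with how they are used above (the real patterns
# may differ):

import re

# CURPAGE_RE must expose three groups, because the caller substitutes
# \g<1><page>\g<3> to swap in the requested page number.
CURPAGE_RE = re.compile(r"(currentPage=)(\d+)(&|$)")

# JSON_RE peels the jsonp_reviews_list(...) wrapper off the taobao
# response so group(1) can go straight into loads().
JSON_RE = re.compile(r"^\s*jsonp_reviews_list\((.*)\)\s*;?\s*$", re.S)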
def crawl():
    company_id = 19
    url = "https://efinance.cmbchinaucs.com/Handler/ActionPage.aspx?targetAction=GetProjectList_Index"
    headers = {
        'Host': "efinance.cmbchinaucs.com",
        'Connection': "keep-alive",
        'Content-Length': "33",
        'Cache-Control': "max-age=0",
        'Accept': "text/plain, */*",
        'Origin': "https://efinance.cmbchinaucs.com",
        'X-Requested-With': "XMLHttpRequest",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded",
        'Referer': "https://efinance.cmbchinaucs.com/",
        'Accept-Encoding': "gzip,deflate",
        'Accept-Language': "zh-CN,zh;q=0.8,en;q=0.6",
        'Cookie': "ASP.NET_SessionId=woqbxpemqp3kk4syvfbkxtzw"
    }

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = post(url, data={"targetAction": "GetProjectList_Index"}, headers=headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        print loans_json
    except:
        logger.error("url: %s crawl failed:%s", url, traceback.format_exc())
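# post() is not defined in this file. A minimal Python 2 sketch of the
# assumed helper, built on urllib2 -- the repo's real implementation may
# differ (e.g. decompress gzip responses, retry on failure):
import urllib
import urllib2

def post(url, data, headers):
    # form-encode the payload and return the raw response body
    req = urllib2.Request(url, urllib.urlencode(data), headers)
    return urllib2.urlopen(req).read()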
def crawl():
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {'Referer': "https://list.lufax.com/list/listing/fuying", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                original_id = str(loans_json["data"][i]["productId"])
                online_ids_set.add(original_id)
                if original_id in db_ids_set:
                    update_ids_set.add(original_id)
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = loans_json["data"][i]["productNameDisplay"]
                    loan_obj.rate = str(float(loans_json["data"][i]["interestRate"]) * 100)
                    period = str(loans_json["data"][i]["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = loans_json["data"][i]["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(loans_json["data"][i]["price"]))
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus freshly crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s crawl failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    request_headers = {'Referer': "https://www.qian360.com/tl/select.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        if loans_json["list"]:
            for i in range(0, len(loans_json["list"])):
                # skip loans that are not open for investment
                if int(loans_json["list"][i]["status"]) != 1:
                    continue
                original_id = str(loans_json["list"][i]["borrowId"])
                online_ids_set.add(original_id)
                if original_id in db_ids_set:
                    update_ids_set.add(original_id)
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = loans_json["list"][i]["name"]
                    loan_obj.rate = str(loans_json["list"][i]["apr"])
                    loan_obj.period = str(loans_json["list"][i]["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(loans_json["list"][i]["account"]))
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus freshly crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s crawl failed:%s", url, traceback.format_exc())
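# Both loan crawlers drive the same Loan model. A skeleton of the interface
# they rely on -- attribute and method names are taken from the call sites;
# bodies and unit constants are placeholders, not the real implementation:
class Loan(object):
    PERIOD_UNIT_DAY = "天"    # assumed value
    PERIOD_UNIT_MONTH = "月"  # assumed value

    def __init__(self, company_id, original_id=None):
        self.company_id = company_id
        self.original_id = original_id
        self.href = self.title = self.rate = None
        self.period = self.period_unit = self.repayment = None
        self.borrow_amount = self.schedule = self.cast = None

    def db_create(self, db):
        # insert a new loan row (status=0, i.e. online)
        pass

    def db_update(self, db):
        # refresh schedule/cast on an existing row
        pass

    def db_offline(self, db, original_ids):
        # flag loans that vanished from the site as offline
        pass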
def crawl(self):
    try:
        self.data = self.crawl_page(self.url)
        if FLAGS.debug_parser:
            import pdb
            pdb.set_trace()

        # check tmall: a near-empty taobao page that redirects to detail.tmall.com
        if not self.is_tmall and len(self.data) < 256 and self.url.find('item.taobao.com') > 0 \
                and self.data.find("window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0:
            self.data = self.crawl_page(self.url.replace('item.taobao.com', 'detail.tmall.com'))

        if self.check_offline():
            self.is_offline = True

        self.html_obj = parse_html(self.data, encoding="gb18030")
        title = self.html_obj.xpath("//html/head/title/text()")
        if title and title[0].find(u"转卖") > 0:
            self.is_offline = True

        self.detailDiv = self.html_obj.xpath("//div[@id='detail']")
        self.buyButton = self.html_obj.xpath("//a[@id='J_LinkBuy']")
        self.originPrice = self.html_obj.xpath("//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
        if not self.originPrice:
            self.originPrice = self.html_obj.xpath("//strong[@class='J_originalPrice']/text()")
        #self.bidPrice = self.html_obj.xpath("//li[contains(concat(' ',normalize-space(@class),' '),' detail-price ')]/strong/text()")
        self.bidPrice = self.html_obj.xpath("//input[@name='current_price']/@value")

        self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
        if not len(self.thumbImages):
            try:
                # try to load thumb images from the tmall page (the url is in @style)
                self.thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                # taobao pages lazy-load thumbs: @src moved to @data-src
                if not len(self.thumbImages):
                    self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
            except:
                logger.warn("No thumbs found %s", self.item_id)

        tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
        tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
        if not self.is_tmall and tmalllogo:
            self.is_tmall = True

        if self.is_tmall:
            self.cid = get_val(self.data, "categoryId").split('&')[0]
            apiItemInfoUrl = get_val(self.data, "initApi").replace(r'\/', "/")
            self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
            try:
                self.tmallInitApijson = loads(self.tmallInitApi.decode('gb18030').encode('utf8'))
            except:
                logger.info("parse tmall api json failed %s : %s", self.item_id, traceback.format_exc())
            if self.tmallInitApijson:
                try:
                    self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
                except:
                    logger.warn("try to get volume from api failed %s", self.item_id)
            if self.volume < 0:
                try:
                    self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                except:
                    logger.warn("Can not parse item volume %s", self.item_id)
            # stock: icTotalQuantity
            """
            reviewInfoUrl = get_val(self.data, "apiMallReviews").replace(r'\/', "/")
            reviewInfoData = self.crawl_page(reviewInfoUrl)
            m = RATECOUNT_RE.match(reviewInfoData)
            if m:
                self.reviewCount = m.group(1)
            else:
                self.reviewCount = None
            """
        else:
            self.cid = get_val(self.data, "cid")
            apiItemInfoVal = get_val(self.data, "apiItemInfo")
            if apiItemInfoVal:
                apiItemInfoUrl = apiItemInfoVal.replace(r'\/', "/")
                itemInfoData = self.crawl_page(apiItemInfoUrl)
                try:
                    self.volume = int(get_num_val(itemInfoData, 'quanity'))
                except:
                    self.volume = -1
            else:
                self.volume = -1
            #interval = get_val(data2, 'interval')
            # stock
            skudata = get_val(self.data, 'valItemInfo').replace(r'\/', "/")
            """
            reviewInfoUrl = get_val(self.data, "data-commonApi").replace(r'\/', "/")
            reviewInfoData = self.crawl_page(reviewInfoUrl)
            self.reviewCount = get_val(reviewInfoData, 'total')
            """
    except:
        logger.error("crawling %s unknown exception %s", self.item_id, traceback.format_exc(),
                     extra={'tags': ['crawlItemException', ]})
        raise
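# crawl() above also assumes IMAGESTYLE_RE, used to strip a thumb url out of
# a tmall <li> inline style. A hypothetical pattern matching that usage --
# subn(r'\g<1>', style) replaces the whole style string with the captured url:
import re

IMAGESTYLE_RE = re.compile(r".*url\(([^)]*)\).*", re.S)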