def main():
    urls = [
        # 'http://djzone.taobao.com',
        # 'http://glorylife.taobao.com/',
        # 'http://sffs.tmall.com/',
        # 'http://yazhou.tmall.com/shop/view_shop.htm?frm=yiyao',
        # 'http://nansin.tmall.com/',
        # 'http://store.taobao.com/shop/view_shop.htm?user_number_id=1597546113',
        # 'http://store.taobao.com/shop/view_shop.htm?user_number_id=397259828',
        # 'http://store.taobao.com/shop/view_shop.htm?user_number_id=1643225788',
        # 'http://peers.tmall.com',
        # 'http://ynshadi.tmall.com',
        # 'http://ali.tmall.com',
        # 'http://shop34757726.taobao.com',
        # 'http://shop59346695.taobao.com',
        # 'http://chowtaifook.tmall.com',
        # 'http://sanke.taobao.com',
        'http://store.taobao.com/shop/view_shop.htm?user_number_id=13987814',
    ]
    for url in urls:
        t = TopShop(url)
        d = t.getFullInfo()
        # d = t.getBasicInfo()
        try:
            print json.dumps(d, ensure_ascii=False, indent=4)  # .decode('utf-8')
            print ''
        except:
            # Fall back to a field-by-field GBK console dump when the JSON
            # output cannot be printed (e.g. console encoding errors).
            for (k, v) in d.items():
                print k, (v or '').decode('utf-8', 'ignore').encode('GBK', 'ignore')
        # saveTopShopData(d)
        del t
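# A minimal sketch (not in the original source) of writing one shop record to
# a UTF-8 JSON file instead of printing it, which sidesteps the GBK console
# fallback in main() above. The helper name and the default path 'shop.json'
# are only examples.
def dumpShopInfoToFile(d, path='shop.json'):
    import io
    import json
    out = json.dumps(d, ensure_ascii=False, indent=4)
    if isinstance(out, str):
        # In Python 2, json.dumps returns a byte string when the result is
        # pure ASCII; decode it so io.open can write unicode.
        out = out.decode('utf-8')
    f = io.open(path, 'w', encoding='utf-8')
    try:
        f.write(out)
    finally:
        f.close()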
def getShopItemsOverview(url, page=10, count=200, reqinterval=0.2):
    '''Crawl a shop's hot-sell item grid page by page.

    [field] siteId
    [field] shopId
    [field] userId
    [field] iid           item ID
    [field] itemName
    [field] itemLink
    [field] itemPic
    [field] itemPriceType
    [field] itemPrice
    [field] itemSales
    [field] itemRateNum
    '''
    metadata = []
    shopLink = TopShop(url).getBasicInfo()['shopLink']
    url = shopLink + '?search=y&viewType=grid&orderType=_hotsell'
    siteId = shopId = userId = None
    page_cursor = count_cursor = 0
    iid_list = []
    while 1:
        if page_cursor >= page:
            break
        # r = request(url, requiredPropertyRegx=r'siteId', retries=10)
        r = requests.get(url)
        s = r.content
        if not siteId:
            siteId = _match(s, REGX['siteId'])
            shopId = _match(s, REGX['shopId'])
            userId = _match(s, REGX['userId'])
        for itemcontent in _matchallitems(s)[1]:
            if count_cursor >= count:
                break
            info = {
                'iid': re.findall(r'id\s*\=\s*(\d+)', itemcontent),
                'itemName': re.findall(
                    r'\<div\s+class\s*\=\s*\"desc\"\s*\>\s*\<a\s+.+?\>\s*(.+?)\s*\<\/a\>',
                    itemcontent, re.S),
                'itemLink': re.findall(r'\<a\s+href\=\"(.+?)\"', itemcontent, re.S),
                'itemPic': re.findall(r'\<img\s+data-ks-lazyload\=\"(.+?)\"',
                                      itemcontent, re.S),
                'itemPriceType': re.findall(
                    r'\<div\s+class\s*\=\s*\"price\"\s*\>\s*\<span\>\s*(.+?)\<\/span\s*\>',
                    itemcontent, re.S),
                'itemPrice': re.findall(
                    r'\<div\s+class\s*\=\s*\"price\"\s*\>.+?\<strong\s*\>(\d+\.*\d*)',
                    itemcontent, re.S),
                'itemSales': re.findall(
                    r'\<div\s+class\s*\=\s*\"sales\-amount\"\s*\>.+?\<em\>\s*(\d+)\s*\<',
                    itemcontent, re.S),
                'itemRateNum': re.findall(
                    r'\<div\s+class\s*\=\s*\"rating\"\s*\>.+?\<a\s+.+?\>\D+(\d+).+?',
                    itemcontent, re.S),
            }
            # Keep the first regex hit re-encoded as utf-8, or None when nothing matched.
            for (k, v) in info.items():
                if len(v) > 0:
                    info[k] = v[0].decode(r.encoding, 'ignore').encode('utf-8', 'ignore')
                else:
                    info[k] = None
            # Skip items already collected on an earlier page.
            if iid_list.count(info['iid']) > 0:
                continue
            metadata.append([siteId, shopId, userId] + [
                info['iid'],
                info['itemName'],
                info['itemLink'],
                info['itemPic'],
                info['itemPriceType'],
                info['itemPrice'],
                info['itemSales'],
                info['itemRateNum'],
            ])
            iid_list.append(info['iid'])
            count_cursor += 1
        page_cursor += 1
        # Locate the "next page" link, first with a regex, then with BeautifulSoup.
        hasNext = re.findall(
            r'\<a\s+class\=\"J\_SearchAsync\s+next\"\s+href\=\"(\S+?)\".?\>', s)
        if not hasNext:
            soup = BeautifulSoup(s, fromEncoding=r.encoding)
            hasNext = soup.findAll('a', {'class': 'page-next'}) or \
                soup.findAll('a', {'class': re.compile(r'J\_SearchAsync\snext')})
            hasNext = [hasNext[0]['href']] if hasNext else []
        if len(hasNext) > 0:
            if url == hasNext[0]:
                break
            url = hasNext[0]
            # The href is taken from raw HTML, so unescape '&amp;' back to '&'.
            url = re.sub(r'&amp;', '&', url)
        else:
            break
        time.sleep(reqinterval)
    return metadata
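# A minimal usage sketch (not in the original source): fetch one page of
# hot-sell items for a shop and print a few columns. The shop URL is one of
# the examples from main(); the column indices follow the metadata rows
# built in getShopItemsOverview() above.
def _demoShopItemsOverview():
    rows = getShopItemsOverview('http://sanke.taobao.com', page=1, count=20)
    for row in rows:
        iid = row[3]
        itemName = row[4]
        itemPrice = row[8]
        itemSales = row[9]
        print iid, itemPrice, itemSales, itemName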