class ArmaniBag():
    '''Crawler for the women's bag listings of the Armani China store.

    ``bagPage()`` fills ``link_list`` from the three hard-coded listing
    tabs, ``bagItems()`` hands ``link_list`` to ``BagItemM`` and collects
    its ``items``, and ``outItems(path)`` writes the rows to a file.
    '''

    def __init__(self):
        # crawler settings
        self.crawler = RetryCrawler()

        # brand home page, also used as referer for the listing tabs
        self.home_url = 'http://www.armani.cn'
        self.price_url = ''
        self.refers = None

        # brand type
        self.brand_type = 'armani'

        # link_list: tuples queued for detail crawling; items: output rows
        self.link_list = []
        self.items = []

    def bagPage(self):
        '''Fetch every listing tab and queue the products found on it.

        Each queued entry is the 7-tuple
        ``(tab_name, tab_url, name, url, img, price, unit)`` that
        ``itemPage`` unpacks.
        '''
        tab_list = [
            ("giorgio armani", "http://www.armani.cn/cn/giorgioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"),
            ("emporio armani", "http://www.armani.cn/cn/emporioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"),
            ("armani jeans", "http://www.armani.cn/cn/armanijeans/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"),
        ]
        # groups: 1 = link, 2 = image attribute name, 3 = image url,
        # 4 = product name, 5 = price text
        p = re.compile(
            r'<div class="item hproduct".+?>.+?<a href="(.+?)".+?class="url">\s*<div class="hproductPhotoCont">\s*<img.+?(src|data-original)="(.+?)".*?/>\s*</div>\s*</a>\s*<div class="itemDesc">\s*<a.+?>\s*<h3.+?>(.+?)</h3>\s*</a>.+?<div class="itemPrice">.+?<span class="prezzoProdottoSaldo".*?>(.+?)</span>\s*</div>.+?</div>',
            flags=re.S)

        for tab_name, tab_data_url in tab_list:
            print('# tab: %s %s' % (tab_name, tab_data_url))
            tab_page = self.crawler.getData(tab_data_url, self.home_url)
            if not tab_page:
                # failed fetch: skip this tab instead of crashing in finditer
                continue

            for item in p.finditer(tab_page):
                i_url = self.home_url + item.group(1)
                # group 2 is only the attribute name; the url is group 3
                i_img = item.group(3)
                i_name = item.group(4).strip()
                s_price = item.group(5)
                print('%s %s %s %s' % (i_url, i_img, i_name, s_price))

                i_unit = ""
                if s_price.find("¥") != -1:
                    i_unit = "CNY"
                # always bound, even when the price carries no yen sign
                i_price = s_price.replace('¥', '').strip()

                if i_url:
                    self.link_list.append((tab_name, tab_data_url, i_name,
                                           i_url, i_img, i_price, i_unit))
                else:
                    i = BagItem(self.brand_type)
                    i.initItem(tab_name, '', i_name, i_price, i_unit, '',
                               i_url, i_img)
                    self.items.append(i.outItem())

    def bagItems(self):
        '''Crawl all queued links through ``BagItemM`` (at most 10 threads).'''
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(len(self.link_list), max_th))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and append its output row to ``items``.

        ``val`` is a 7-tuple as queued by ``bagPage``. Values found on the
        detail page override the ones taken from the listing.
        '''
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '':
            return

        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        m = re.search(r'<h2 class="productName">(.+?)</h2>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        # prefer the zoom image, fall back to the first thumbnail
        m = re.search(r'<div id="zoomImageWrapper">\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S)
        if not m:
            m = re.search(r'<div id="thumbsWrapper">.+?<div class="thumbElement".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S)
        if m:
            i_img = m.group(1)

        m = re.search(r'<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>', page, flags=re.S)
        if m:
            currency = m.group(1)
            # non-greedy so that only tags are removed, not the text between them
            i_price = re.sub(r'<.*?>', '', m.group(2))
            if currency.find("¥") != -1:
                i_unit = "CNY"
            else:
                i_unit = currency

        # default keeps i_size bound when the attributes block is missing
        i_size = ''
        m = re.search(r'<div class="attributes">(.+?)</div>', page, flags=re.S)
        if m:
            size_str = re.sub(r'<.*?>', '', m.group(1))
            i_size = re.sub(r'\s+', '', size_str)
            print(i_size)

        i_number = ''
        m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        # store the row itself, not the bound method
        self.items.append(i.outItem())

    def outItems(self, f):
        '''Write a header line followed by all rows to the file path ``f``.

        ``self.items`` is left untouched, so repeated calls do not stack
        header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class GivenchyBag():
    '''Crawler for the women's accessories look-books on givenchy.com.

    ``bagPage(url)`` queues products into ``link_list``, ``bagItems()``
    hands them to ``BagItemM``, ``outItems(path)`` writes the rows.
    '''

    def __init__(self):
        # crawler settings
        self.crawler = RetryCrawler()

        # brand home page
        self.home_url = 'http://www.givenchy.com'
        self.price_url = ''
        self.refers = None

        # brand type
        self.brand_type = 'givenchy'

        # link_list: tuples queued for detail crawling; items: output rows
        self.link_list = []
        self.items = []

    def bagPage(self, url):
        '''Walk the accessory sub-menus of ``url`` and queue every bag found.

        For each sub-menu tab the look-book fragment is fetched through the
        ``?ajax=true&fragment=true`` endpoint. Products whose name passes
        ``Common.isBag`` are queued as
        ``(tab_name, tab_url, name, url, img, number)``.
        '''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return

        tab_list = []
        m = re.search(r'<p>女装配饰</p>\s+<ul class="menu-level-4">(.+?)</ul>', page, flags=re.S)
        if m:
            tabs_list_info = m.group(1)
            p = re.compile(r'<li class="submenu-item">\s+<a.+?href="(.+?)">\s+<span lang="fr" class="lang-fr">(.+?)</span>(.+?)</a>\s+</li>', flags=re.S)
            for tab in p.finditer(tabs_list_info):
                tab_list.append((tab.group(2) + tab.group(3).strip(), tab.group(1)))

        # patterns are loop-invariant, compile them once
        item_re = re.compile(r'<li class="lookbook-item line" data-idlook="\d+">\s+<div class="disp-n">.+?<div class="look-info article">\s+<p>(.+?)</p>.+?<p class="look-ref">(.+?)</p>.+?</div>.+?</div>\s+<a href="(.+?)".+?>.+?<img .+?data-src="(.+?)".*?/>\s+</li>', flags=re.S)
        sku_re = re.compile(r'<span class="look-ref-sku">\s*<span.+?>(.+?)</span>\s*</span>', flags=re.S)

        for tab_name, tab_url in tab_list:
            print('# tab: %s %s' % (tab_name, tab_url))
            tab_page = self.crawler.getData(tab_url, url)
            if not tab_page:
                # failed fetch: skip the tab instead of crashing in re.search
                continue

            m = re.search(r'<div id="layer-1".+?>\s+<a href="(.+?)" class="layer-links">.+?</a>', tab_page, flags=re.S)
            if not m:
                continue

            ajax_url = self.home_url + m.group(1) + "?ajax=true&fragment=true"
            ajax_data = self.crawler.getData(ajax_url, tab_url)
            if not ajax_data:
                continue

            # the payload carries \uXXXX escapes; decode before matching
            r_data = ajax_data.decode("unicode-escape")
            if not r_data:
                continue

            m = re.search(r'"html":"(.+?)"}', r_data, flags=re.S)
            if not m:
                continue
            # undo the JSON "\/" escaping of slashes
            data_html = m.group(1).replace('\\/', '/')

            for item in item_re.finditer(data_html):
                i_url = self.home_url + item.group(3)
                i_img = item.group(4)
                s_number = item.group(2)
                i_name = re.sub(r'<.+?>', '', item.group(1)).strip()

                i_number = ''
                m_sku = sku_re.search(s_number)
                if m_sku:
                    i_number = m_sku.group(1)

                print('%s %s %s %s' % (i_url, i_img, i_name, i_number))
                if Common.isBag(i_name):
                    self.link_list.append((tab_name, tab_url, i_name, i_url,
                                           i_img, i_number))

    def bagItems(self):
        '''Crawl all queued links through ``BagItemM`` (at most 10 threads).'''
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(len(self.link_list), max_th))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def outItems(self, f):
        '''Write a header line followed by all rows to the file path ``f``.

        ``self.items`` is left untouched, so repeated calls do not stack
        header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class DolcegabbanaBag():
    '''Crawler for the bag listings of the Dolce & Gabbana China store.

    ``bagPage(url)`` reads the first listing page from HTML and all further
    pages from the JSON search API, queueing products in ``link_list``.
    ``bagItems()`` hands the queue to ``BagItemM``; ``outItems(path)``
    writes the rows.

    NOTE(review): this file defines a class with this name twice; the later
    definition is the one that stays bound to the name.
    '''

    def __init__(self):
        # crawler settings
        self.crawler = RetryCrawler()

        # brand home page
        self.home_url = 'http://www.dolcegabbana.com.cn'
        self.price_url = ''
        self.refers = None

        # brand type
        self.brand_type = 'dolcegabbana'

        # link_list: tuples queued for detail crawling; items: output rows
        self.link_list = []
        self.items = []

    def bagPage(self, url):
        '''Queue the products of listing ``url`` and of all following pages.

        Queued entries are 7-tuples
        ``(tab_name, referer, name, url, img, price, unit)``, the shape that
        both ``ajax_item`` produces and ``itemPage`` unpacks.
        '''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return

        serie_title = '包袋'
        p = re.compile(r'<li data-position="\d+\s*" class="product isAvailable".+?data-category="(.+?)".+?>\s*<div class="prodContent"><div class="imagesContainer".+?>.+?<img.+?data-original="(.+?)".+?>.+?</div>\s*<div class="\s*productDescription\s*">\s*<a href="(.+?)".+?><h2.+?>(.+?)</h2>\s*</a>\s*<div class="price">.+?<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>.+?</li>', flags=re.S)
        for item in p.finditer(page):
            tab_name = item.group(1).strip()
            i_img = item.group(2)
            i_url = item.group(3)
            i_name = item.group(4).strip()
            s_unit = item.group(5)
            s_price = item.group(6)

            i_unit = ""
            if s_unit.find("¥") != -1:
                i_unit = "CNY"
            i_price = re.sub(r'<.+?>', '', s_price).strip()
            print('%s %s %s %s %s %s' % (tab_name, i_img, self.home_url + i_url,
                                         i_name, i_unit, i_price))

            if i_url:
                # same 7-field layout as ajax_item/itemPage (the original
                # queued an extra leading serie_title here, giving 8 fields)
                self.link_list.append((tab_name, url, i_name,
                                       self.home_url + i_url, i_img,
                                       i_price, i_unit))
            else:
                i = BagItem(self.brand_type)
                i.initItem(serie_title, tab_name, i_name, i_price, i_unit, '',
                           i_url, i_img)
                self.items.append(i.outItem())

        # Pages 2..n come from the JSON search API. The query string uses
        # literal "&macro=", "&micro=" and "&macroMarchio=" parameters (they
        # had been turned into the characters for the HTML entities
        # &macr; / &micro;).
        ajax_url = "http://www.dolcegabbana.com.cn/yeti/api/DOLCEEGABBANA_CN/searchIndented.json?page=2&sortRule=PriorityDescending&format=full&authorlocalized=&macro=1147&micro=&color=&look=&size=&gender=D&season=P%2CE&department=&brand=&heel=&heeltype=&wedge=&washtype=&washcode=&colortype=&fabric=&waist=&family=&structure=&environment=&author=&textSearch=&minPrice=&maxPrice=&occasion=&salesline=&prints=&stone=&material=&agerange=&productsPerPage=20&gallery=&macroMarchio=&modelnames=&GroupBy=&style=&site=DOLCEEGABBANA&baseurl=http://www.dolcegabbana.com.cn/searchresult.asp"
        page_num = 2
        while True:
            a_url = re.sub(r'page=\d+&', 'page=%d&' % page_num, ajax_url)
            a_page = self.crawler.getData(a_url, url)
            # ajax_item returns True only while more pages remain
            if not self.ajax_item(a_page, url):
                break
            page_num += 1

    def ajax_item(self, page, refers):
        '''Queue the products of one JSON result page.

        Returns True when the payload reports a current page smaller than
        the total page count, False otherwise (including empty input and
        any error, which is printed).
        '''
        if not page:
            return False
        try:
            result = json.loads(page)
            api_result = result.get("ApiResult") or {}
            for item in api_result.get("Items") or []:
                tab_name, i_img, i_url, i_name, i_price = "", "", "", "", ""
                if "MicroCategory" in item:
                    tab_name = item["MicroCategory"].strip()

                # the image url needs both the product code and an image
                # type; without either no image is recorded
                item_code10 = item.get("DefaultCode10")
                image_types = item.get("ImageTypes")
                if item_code10 and image_types:
                    if "12_f" in image_types:
                        image_type = "12_f"
                    else:
                        image_type = max(image_types)
                    i_img = "http://cdn.yoox.biz/55/%s_%s.jpg" % (item_code10, image_type)

                if "SingleSelectLink" in item:
                    i_url = self.home_url + item["SingleSelectLink"].strip()
                if "TitleAttribute" in item:
                    i_name = item["TitleAttribute"].strip()
                if "FullPrice" in item:
                    i_price = '{0:,}'.format(int(item["FullPrice"]))
                i_unit = "CNY"
                print('%s %s %s %s %s %s' % (tab_name, i_name, i_url, i_img,
                                             i_price, i_unit))

                if i_url:
                    self.link_list.append((tab_name, refers, i_name, i_url,
                                           i_img, i_price, i_unit))
                else:
                    i = BagItem(self.brand_type)
                    i.initItem('', tab_name, i_name, i_price, i_unit, '',
                               i_url, i_img)
                    self.items.append(i.outItem())

            r_page = result.get("Page") or {}
            if "CurrentSearchPage" in r_page and "TotalPages" in r_page:
                if int(r_page["CurrentSearchPage"]) < int(r_page["TotalPages"]):
                    return True
            return False
        except Exception as e:
            # best effort: a bad payload ends the pagination
            print(e)
            return False

    def bagItems(self):
        '''Crawl all queued links through ``BagItemM`` (at most 10 threads).'''
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(len(self.link_list), max_th))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and print the resulting row.

        ``val`` is a 7-tuple as queued by ``bagPage``/``ajax_item``. The row
        is only printed, it is not appended to ``items``.
        '''
        item_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '':
            return

        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        m = re.search(r'<div id="itemTechSheet">\s*<h1>(.+?)</h1>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(r'<div id="itemTechSheet">.+?<div class="price">(.+?)</div>', page, flags=re.S)
        if m:
            s_price = m.group(1).strip()
            if s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = re.sub(r'<.+?>', '', s_price).replace('¥', '').strip()

        m = re.search(r'<div id="itemImagesBox".*?>\s*<img.+?class="mainImage" src="(.+?)">', page, flags=re.S)
        if m:
            i_img = m.group(1)

        # the size entry is labelled with either of two words
        i_size = ''
        m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺寸.+?)</li>', page, flags=re.S)
        if not m:
            m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>', page, flags=re.S)
        if m:
            i_size = m.group(1)

        i_number = ''
        m = re.search(r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S)
        if m:
            # "label:code" -> code; tolerate a missing colon
            parts = m.group(1).split(':')
            i_number = (parts[1] if len(parts) > 1 else parts[0]).strip()

        i = BagItem(self.brand_type)
        i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print('# itemPage: %s' % (i.outItem(),))

    def outItems(self, f):
        '''Write a header line followed by all rows to the file path ``f``.

        ``self.items`` is left untouched, so repeated calls do not stack
        header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class TMCrawler():
    '''Crawler for the item listing of a TMall shop.

    ``getPage`` walks the paginated shop search result, parses every listed
    item through ``Item.parserTM`` and stores it with
    ``MysqlAccess.insert_parser_item_info``.
    '''

    def __init__(self):
        # crawler settings
        self.crawler = RetryCrawler()

        # db access
        self.mysqlAccess = MysqlAccess()

        # site home page
        self.home_url = 'http://www.taobao.com'
        self.refers = None

        # link_list / items are initialised here but not filled by this class
        self.link_list = []
        self.items = []

        self.begin_time = Common.now()

    @staticmethod
    def _page_url(url, page_no):
        '''Return ``url`` rewritten to address result page ``page_no``.'''
        if url.find('pageNo=') == -1:
            return re.sub(r'&tsearch=y',
                          '&pageNo=%d&tsearch=y#anchor' % page_no, url)
        return re.sub(r'&pageNo=\d+&', '&pageNo=%d&' % page_no, url)

    def getPage(self, url, shop_home_url):
        '''Crawl every result page of the shop search starting at ``url``.

        The page count is read from the first response. Each listed item is
        parsed and inserted into the database, with a 2 second pause after
        every item and before every page request.
        '''
        i_url = url
        refers = shop_home_url
        result_s = self.get_asyn_data(i_url, refers, shop_home_url)

        max_page = 0
        m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>', result_s, flags=re.S)
        if m:
            max_page = int(m.group(1))
        print('# page num: %s' % max_page)

        # loop-invariant, compiled once (no DOTALL flag, as before)
        item_re = re.compile(r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>')

        page_no = 1
        while page_no <= max_page:
            m = re.search(r'<div class="J_TItems">(.+?)<div class="pagination">', result_s, flags=re.S)
            if m:
                for found in item_re.finditer(m.group(1)):
                    item_id = found.group(1)
                    url_s = found.group(2)
                    item_name = Common.htmlDecode(found.group(3).strip())
                    price = found.group(5).strip()
                    # protocol-relative links get an explicit scheme
                    if url_s.find('http') == -1:
                        item_url = 'http:' + url_s
                    else:
                        item_url = url_s

                    print('### item ###')
                    print('# item val: %s %s %s %s' % (item_id, item_name,
                                                       price, item_url))
                    parsed = Item()
                    parsed.parserTM((item_id, item_name, price, item_url,
                                     i_url, self.begin_time))
                    item_sql = parsed.outItemSql()
                    print('# item info: %s' % (item_sql,))
                    self.mysqlAccess.insert_parser_item_info(item_sql)
                    time.sleep(2)

            # advance BEFORE building the next url; building it with the old
            # counter re-fetched page 1 and never reached the last page
            page_no += 1
            if page_no > max_page:
                break
            refers = i_url
            i_url = self._page_url(refers, page_no)
            time.sleep(2)
            result_s = self.get_asyn_data(i_url, refers, shop_home_url)

    def get_asyn_data(self, i_url, refers, shop_home_url):
        '''Return the HTML carried by the shop's asynchronous search call.

        The page at ``i_url`` names the async endpoint in its
        ``J_ShopAsynSearchURL`` input; that endpoint is requested as JSONP
        and the escaped payload is returned with ``\\"`` unescaped. An empty
        string is returned when any step yields nothing.
        '''
        page = self.crawler.getData(i_url, refers)
        if not page:
            return ''

        m = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>', page, flags=re.S)
        if not m:
            return ''

        # timestamp plus a random 3-digit suffix, in the _ksTS parameter
        ts = '?_ksTS=%s&callback=jsonp135&' % (
            str(int(time.time() * 1000)) + '_' + str(random.randint(100, 999)))
        a_url = shop_home_url + Common.htmlDecode(m.group(1))
        asyn_url = re.sub(r'\?', ts, a_url)
        result = self.crawler.getData(asyn_url, i_url)
        if not result:
            return ''

        m = re.search(r'jsonp135\("(.+?)"\)', result, flags=re.S)
        if not m:
            return ''
        return re.sub(r'\\"', '"', m.group(1))

    def getItems(self):
        '''Placeholder: the threaded item crawl is not implemented here.'''
        return None

    def itemPage(self, val):
        '''Print the queued entry ``val``.

        NOTE(review): the original printed eight names that were never
        bound (NameError); the layout of ``val`` is not visible in this
        file, so it is printed as a whole.
        '''
        print('# itemPage : %s' % (val,))
class BottegavenetaBag():
    '''Crawler for the bag listings on bottegaveneta.com.

    ``bagPage(url)`` queues products into ``link_list``, ``bagItems()``
    hands them to ``BagItemM``, ``outItems(path)`` writes the rows.
    '''

    def __init__(self):
        # crawler settings
        self.crawler = RetryCrawler()

        # brand home page
        self.home_url = 'http://www.bottegaveneta.com'
        self.price_url = ''
        self.refers = None

        # brand type
        self.brand_type = 'bottegaveneta'

        # link_list: tuples queued for detail crawling; items: output rows
        self.link_list = []
        self.items = []

    def _queue(self, url, i_name, i_url, i_img):
        '''Queue a product with a link, or emit a row directly without one.'''
        print('%s %s %s' % (i_url, i_img, i_name))
        if i_url:
            self.link_list.append(('', url, i_name, i_url, i_img))
        else:
            i = BagItem(self.brand_type)
            # same positional layout as every other initItem call in this
            # file: serie, tag, name, price, unit, size, url, img
            i.initItem('', '', i_name, '', '', '', i_url, i_img)
            self.items.append(i.outItem())

    def bagPage(self, url):
        '''Queue the products of listing ``url``.

        Products appear either as rendered ``slot_N`` blocks or as lazy
        slots whose data sits HTML-escaped in a ``data-slot`` attribute.
        Queued entries are ``('', referer, name, url, img)``.
        '''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return

        p = re.compile(r'<div id="slot_\d+".+?<a.+?href="(.+?)".+?>\s*<img.+?src="(.+?)".+?/>\s*<div class="iteminfo">\s*<div class="headgroup">\s*<div class="extra">\s*<h1 class="modelname">(.+?)</h1>', flags=re.S)
        for item in p.finditer(page):
            self._queue(url, item.group(3), item.group(1), item.group(2))

        p = re.compile(r'<div class="slot lazySlot".+?data-slot="(.+?)".+?>', flags=re.S)
        for item in p.finditer(page):
            # the attribute value is HTML-escaped; restore the quotes so the
            # "Key":"value" patterns below can match
            data_info_str = item.group(1).replace('&quot;', '"')
            i_url, i_img, i_name = '', '', ''
            m = re.search(r'"Link":"(.+?)"', data_info_str, flags=re.S)
            if m:
                i_url = m.group(1)
            m = re.search(r'"ModelName":"(.+?)",', data_info_str, flags=re.S)
            if m:
                i_name = m.group(1)
            self._queue(url, i_name, i_url, i_img)

    def bagItems(self):
        '''Crawl all queued links through ``BagItemM`` (at most 10 threads).'''
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(len(self.link_list), max_th))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and print the resulting row.

        ``val`` is a 5-tuple as queued by ``bagPage``. The row is only
        printed, it is not appended to ``items``.
        '''
        serie_title, refers, i_name, i_url, i_img = val
        if i_url == '':
            return

        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        m = re.search(r'<h1 class="producTitle".+?>\s*<div class="modelName".+?>\s*<span class="modelName">(.+?)</span>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        # prefer the main image, fall back to the background image
        m = re.search(r'<div class="mainImage".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S)
        if not m:
            m = re.search(r'<section id="bgItem">\s*<img.+?src="(.+?)".*?/>\s*</section>', page, flags=re.S)
        if m:
            i_img = m.group(1)

        # one "label:value;" segment per dimension that is present; the
        # width pattern additionally requires the closing outer </div>
        i_size = ''
        head = r'<div class="localizedAttributes">.*?<div class="'
        body = r'">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>'
        for css_class, tail in (('height', ''), ('depth', ''),
                                ('length_of_strap', ''),
                                ('width', r'\s*</div>')):
            m = re.search(head + css_class + body + tail, page, flags=re.S)
            if m:
                i_size += m.group(1) + ":" + m.group(2) + ";"

        # was a bare expression (NameError); must start out empty
        i_number = ''
        m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img,
                   i_number)
        print('# itemPage: %s' % (i.outItem(),))

    def outItems(self, f):
        '''Write a header line followed by all rows to the file path ``f``.

        ``self.items`` is left untouched, so repeated calls do not stack
        header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class BossBag():
    '''Crawler for the handbag listings of the Hugo Boss China store.

    ``bagPage(url)`` queues products into ``link_list``, ``bagItems()``
    hands them to ``BagItemM``, ``outItems(path)`` writes the rows.
    '''

    # Spellings of the yen sign accepted in price text. The original tested
    # the same '¥' twice; presumably one of them was the HTML entity, so the
    # entity and the full-width form are accepted as well.
    _YEN_MARKS = ('&yen;', '¥', '￥')

    def __init__(self):
        # crawler settings
        self.crawler = RetryCrawler()

        # brand home page
        self.home_url = 'http://store.hugoboss.cn'
        self.price_url = ''
        self.refers = None

        # brand type
        self.brand_type = 'boss'

        # link_list: tuples queued for detail crawling; items: output rows
        self.link_list = []
        self.items = []

    def _parse_price(self, s_price):
        '''Return ``(price, 'CNY')`` for yen-marked text, otherwise None.'''
        if not any(mark in s_price for mark in self._YEN_MARKS):
            return None
        for mark in self._YEN_MARKS:
            s_price = s_price.replace(mark, '')
        return s_price.strip(), "CNY"

    def bagPage(self, url):
        '''Fetch listing ``url`` and queue its products under tab 手袋.'''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return
        tab_name = '手袋'
        self.parse_item(page, tab_name, url)

    def parse_item(self, tab_page, tab_name, tab_url):
        '''Cut the product list out of ``tab_page`` and parse its entries.'''
        items_info = ''
        m = re.search(r'<div class="productlist-widget">.+?<div class="container">\s*<ul.+?>(.+?)</ul>', tab_page, flags=re.S)
        if m:
            items_info = m.group(1)
        self.run_items(items_info, tab_name, tab_url)

    def run_items(self, items_info, tab_name, tab_url):
        '''Queue every product entry found in the HTML ``items_info``.

        Queued entries are
        ``(tab_name, tab_url, name, absolute_url, img, price, unit)``.
        Price and unit stay empty when the text carries no yen sign.
        '''
        p = re.compile(r'<li class="productlist-item ">\s*<div class="product-image".+?>\s*<a.+?><img src="(.+?)".+?/>\s*</a>\s*</div>.+?<div class="product-title">\s*<a href="(.+?)".+?>(.+?)</a>\s*</div>.+?<p>\s*<span class="product-price">(.+?)</span>\s*</p>\s*</li>', flags=re.S)
        for item in p.finditer(items_info):
            i_img, i_url, i_name, s_price = item.group(1, 2, 3, 4)
            i_price, i_unit = self._parse_price(s_price) or ('', '')
            print('%s %s %s %s %s' % (self.home_url + i_url, i_img, i_name,
                                      i_price, i_unit))
            if i_url:
                self.link_list.append((tab_name, tab_url, i_name,
                                       self.home_url + i_url, i_img,
                                       i_price, i_unit))
            else:
                i = BagItem(self.brand_type)
                i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url,
                           i_img)
                self.items.append(i.outItem())

    def isBag(self, name):
        '''Return True when ``name`` contains a bag keyword and no clothing keyword.'''
        bag_info = ["包", "袋", "皮夹", "钱夹"]
        other_info = ["裤", "T恤", "衬衫", "礼服", "上衣"]
        if not any(word in name for word in bag_info):
            return False
        return not any(word in name for word in other_info)

    def bagItems(self):
        '''Crawl all queued links through ``BagItemM`` (at most 10 threads).'''
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(len(self.link_list), max_th))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and print the resulting row.

        ``val`` is a 7-tuple as queued by ``run_items``. The row is only
        printed, it is not appended to ``items``.
        '''
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '':
            return

        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        m = re.search(r'<h1 class="product-name">(.+?)</h1>', page, flags=re.S)
        if m:
            # collapse internal runs of whitespace
            i_name = ' '.join(m.group(1).strip().split())

        m = re.search(r'<div class="product-prices">.+?<dd class="saleprice">(.+?)</dd>.+?</div>', page, flags=re.S)
        if m:
            parsed = self._parse_price(m.group(1).strip())
            if parsed:
                i_price, i_unit = parsed

        m = re.search(r'<div class="image">.+?<div class="container".*?>\s*<table.+?>\s*<tr>\s*<td>\s*<a.+?>\s*<img.+?class="thumb".+?big="(.+?)".*?/>\s*</a>\s*</td>', page, flags=re.S)
        if m:
            i_img = m.group(1)

        i_size = ''
        m = re.search(r'<div class="tabpage inc".+?>.+?<span.*?>(尺寸大小.+?)</span>', page, flags=re.S)
        if m:
            # "label:value" -> value; stays empty when there is no colon
            parts = m.group(1).split(':')
            if len(parts) > 1:
                i_size = parts[1]
        if i_size == '':
            m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>', page, flags=re.S)
            if m:
                i_size = re.sub(r'<.+?>', '', m.group(1))

        i_number = ''
        m = re.search(r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print('# itemPage: %s' % (i.outItem(),))

    def outItems(self, f):
        '''Write a header line followed by all rows to the file path ``f``.

        ``self.items`` is left untouched, so repeated calls do not stack
        header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class Item():
    '''One TMall item together with the shop and brand data of its page.

    ``parserTM`` takes the listing values, fetches the item page and fills
    the attributes; ``outItemSql`` returns them as one tuple.
    '''

    def __init__(self):
        # crawler
        self.crawler = RetryCrawler()

        # shop
        self.shop_type = '1'    # shop type
        self.seller_id = ''     # seller id
        self.seller_name = ''   # seller name
        self.shop_id = ''       # shop id
        self.shop_name = ''     # shop name
        self.shop_url = ''      # shop url

        # referer used for the item page request
        self.refers = ''

        # deal-record crawl parameters
        self.deal_pageSize = 15
        self.deal_maxPages = 100
        self.deal_bufferdays = 3    # look back three days

        # rating crawl parameters
        self.rate_pageSize = 20
        self.rate_maxPages = 100

        # per-item state
        self.initItem()

    def initItem(self):
        '''Reset all per-item attributes to their empty defaults.'''
        # crawl bookkeeping
        self.crawling_time = Common.now()
        self.crawling_begintime = ''    # start time of this crawl
        self.crawling_beginDate = ''    # start date of this crawl
        self.crawling_beginHour = ''    # start hour of this crawl

        # item attributes
        self.item_id = ''           # item id
        self.item_name = ''         # item name
        self.item_price = ''        # item price
        self.item_url = ''          # item url
        self.item_spuId = ''        # SPU id
        self.item_sellCount = 0     # monthly sales count
        self.brand_name = ''
        self.brand_id = ''
        self.category_id = ''

        # item pages
        self.item_page = None       # item main page
        self.item_urls = []         # item page urls
        self.item_pages = {}        # item pages

        # deal records
        self.deal_url = ''
        self.deal_stopCrawl = False
        self.deal_deadLine = 0.0    # latest deal time of the previous crawl
        self.deal_deadLine2 = 0.0   # earliest deal time of this crawl

    def TMItem(self):
        '''Fetch ``item_url`` and fill seller, shop, brand and sales fields.

        Does nothing when ``item_url`` is empty.

        Raises:
            Common.InvalidPageException: the item page could not be fetched.
        '''
        if self.item_url == '':
            return

        page = self.crawler.getData(self.item_url, self.refers)
        if not page:
            raise Common.InvalidPageException(
                "# TMItem: not find item page,itemid:%s,item_url:%s"
                % (str(self.item_id), self.item_url))

        m = re.search(r'sellerId:"(\d+)",', page, flags=re.S)
        if m:
            self.seller_id = m.group(1)

        m = re.search(r'shopId:"(\d+)",', page, flags=re.S)
        if m:
            self.shop_id = m.group(1)

        m = re.search(r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>', page, flags=re.S)
        if m:
            self.shop_url = Common.fix_url(m.group(1))
            self.shop_name = m.group(2).strip()

        m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S)
        if not m:
            return
        TShop_s = m.group(1).strip()

        m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S)
        if m:
            self.brand_name = Common.htmlDecode(m.group(1).strip())

        m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S)
        if m:
            self.brand_id = m.group(1)

        m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S)
        if m:
            self.category_id = m.group(1)

        m = re.search(r'"sellerNickName":"(.+?)",', TShop_s, flags=re.S)
        if m:
            self.seller_name = Common.urlDecode(m.group(1).strip())

        m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S)
        if m:
            # "&timestamp" had been turned into the character for the HTML
            # entity &times; followed by "tamp"
            ts = "&callback=setMdskip&timestamp=%s" % str(int(time.time() * 1000))
            initapi_url = (Common.fix_url(m.group(1).strip()) + ts
                           + "&ref=%s" % Common.urlCode(self.refers))
            init_page = self.crawler.getData(initapi_url, self.item_url)
            # "or": a None page must not reach re.search
            if not init_page or init_page == '':
                print('# init page is null..')
            else:
                m = re.search(r'"sellCountDO":{"sellCount":(\d+),', init_page, flags=re.S)
                if m:
                    # kept as the matched digit string
                    self.item_sellCount = m.group(1)

    def parserTM(self, val):
        '''Load the listing values from ``val`` and crawl the item page.

        ``val`` is ``(item_id, item_name, item_price, item_url, refers,
        crawling_begintime)``; the begin time is passed to
        ``time.localtime`` to derive the crawl date and hour.
        '''
        (self.item_id, self.item_name, self.item_price, self.item_url,
         self.refers, self.crawling_begintime) = val
        begin = time.localtime(self.crawling_begintime)
        # start date of this crawl
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        # start hour of this crawl
        self.crawling_beginHour = time.strftime("%H", begin)
        self.TMItem()

    def outItemSql(self):
        '''Return the crawled values as one tuple for the item insert.'''
        return (Common.time_s(self.crawling_time), self.item_id,
                self.item_name, self.item_price, self.item_sellCount,
                self.item_url, self.seller_id, self.seller_name,
                self.shop_id, self.shop_name, self.shop_url, self.brand_id,
                self.brand_name, self.category_id, self.crawling_beginDate,
                self.crawling_beginHour)
class DolcegabbanaBag():
    '''Crawler for the bag listings of the Dolce & Gabbana China store.

    ``bagPage(url)`` reads the first listing page from HTML and all further
    pages from the JSON search API, queueing products in ``link_list``.
    ``bagItems()`` hands the queue to ``BagItemM``; ``outItems(path)``
    writes the rows.

    NOTE(review): this file defines a class with this name twice; this later
    definition is the one that stays bound to the name.
    '''

    def __init__(self):
        # crawler settings
        self.crawler = RetryCrawler()

        # brand home page
        self.home_url = 'http://www.dolcegabbana.com.cn'
        self.price_url = ''
        self.refers = None

        # brand type
        self.brand_type = 'dolcegabbana'

        # link_list: tuples queued for detail crawling; items: output rows
        self.link_list = []
        self.items = []

    def bagPage(self, url):
        '''Queue the products of listing ``url`` and of all following pages.

        Queued entries are 7-tuples
        ``(tab_name, referer, name, url, img, price, unit)``, the shape that
        both ``ajax_item`` produces and ``itemPage`` unpacks.
        '''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return

        serie_title = '包袋'
        p = re.compile(r'<li data-position="\d+\s*" class="product isAvailable".+?data-category="(.+?)".+?>\s*<div class="prodContent"><div class="imagesContainer".+?>.+?<img.+?data-original="(.+?)".+?>.+?</div>\s*<div class="\s*productDescription\s*">\s*<a href="(.+?)".+?><h2.+?>(.+?)</h2>\s*</a>\s*<div class="price">.+?<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>.+?</li>', flags=re.S)
        for item in p.finditer(page):
            tab_name = item.group(1).strip()
            i_img = item.group(2)
            i_url = item.group(3)
            i_name = item.group(4).strip()
            s_unit = item.group(5)
            s_price = item.group(6)

            i_unit = ""
            if s_unit.find("¥") != -1:
                i_unit = "CNY"
            i_price = re.sub(r'<.+?>', '', s_price).strip()
            print('%s %s %s %s %s %s' % (tab_name, i_img, self.home_url + i_url,
                                         i_name, i_unit, i_price))

            if i_url:
                # same 7-field layout as ajax_item/itemPage (the original
                # queued an extra leading serie_title here, giving 8 fields)
                self.link_list.append((tab_name, url, i_name,
                                       self.home_url + i_url, i_img,
                                       i_price, i_unit))
            else:
                i = BagItem(self.brand_type)
                i.initItem(serie_title, tab_name, i_name, i_price, i_unit, '',
                           i_url, i_img)
                self.items.append(i.outItem())

        # Pages 2..n come from the JSON search API. The query string uses
        # literal "&macro=", "&micro=" and "&macroMarchio=" parameters (they
        # had been turned into the characters for the HTML entities
        # &macr; / &micro;).
        ajax_url = "http://www.dolcegabbana.com.cn/yeti/api/DOLCEEGABBANA_CN/searchIndented.json?page=2&sortRule=PriorityDescending&format=full&authorlocalized=&macro=1147&micro=&color=&look=&size=&gender=D&season=P%2CE&department=&brand=&heel=&heeltype=&wedge=&washtype=&washcode=&colortype=&fabric=&waist=&family=&structure=&environment=&author=&textSearch=&minPrice=&maxPrice=&occasion=&salesline=&prints=&stone=&material=&agerange=&productsPerPage=20&gallery=&macroMarchio=&modelnames=&GroupBy=&style=&site=DOLCEEGABBANA&baseurl=http://www.dolcegabbana.com.cn/searchresult.asp"
        page_num = 2
        while True:
            a_url = re.sub(r'page=\d+&', 'page=%d&' % page_num, ajax_url)
            a_page = self.crawler.getData(a_url, url)
            # ajax_item returns True only while more pages remain
            if not self.ajax_item(a_page, url):
                break
            page_num += 1

    def ajax_item(self, page, refers):
        '''Queue the products of one JSON result page.

        Returns True when the payload reports a current page smaller than
        the total page count, False otherwise (including empty input and
        any error, which is printed).
        '''
        if not page:
            return False
        try:
            result = json.loads(page)
            api_result = result.get("ApiResult") or {}
            for item in api_result.get("Items") or []:
                tab_name, i_img, i_url, i_name, i_price = "", "", "", "", ""
                if "MicroCategory" in item:
                    tab_name = item["MicroCategory"].strip()

                # the image url needs both the product code and an image
                # type; without either no image is recorded
                item_code10 = item.get("DefaultCode10")
                image_types = item.get("ImageTypes")
                if item_code10 and image_types:
                    if "12_f" in image_types:
                        image_type = "12_f"
                    else:
                        image_type = max(image_types)
                    i_img = "http://cdn.yoox.biz/55/%s_%s.jpg" % (item_code10, image_type)

                if "SingleSelectLink" in item:
                    i_url = self.home_url + item["SingleSelectLink"].strip()
                if "TitleAttribute" in item:
                    i_name = item["TitleAttribute"].strip()
                if "FullPrice" in item:
                    i_price = '{0:,}'.format(int(item["FullPrice"]))
                i_unit = "CNY"
                print('%s %s %s %s %s %s' % (tab_name, i_name, i_url, i_img,
                                             i_price, i_unit))

                if i_url:
                    self.link_list.append((tab_name, refers, i_name, i_url,
                                           i_img, i_price, i_unit))
                else:
                    i = BagItem(self.brand_type)
                    i.initItem('', tab_name, i_name, i_price, i_unit, '',
                               i_url, i_img)
                    self.items.append(i.outItem())

            r_page = result.get("Page") or {}
            if "CurrentSearchPage" in r_page and "TotalPages" in r_page:
                if int(r_page["CurrentSearchPage"]) < int(r_page["TotalPages"]):
                    return True
            return False
        except Exception as e:
            # best effort: a bad payload ends the pagination
            print(e)
            return False

    def bagItems(self):
        '''Crawl all queued links through ``BagItemM`` (at most 10 threads).'''
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(len(self.link_list), max_th))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and print the resulting row.

        ``val`` is a 7-tuple as queued by ``bagPage``/``ajax_item``. The row
        is only printed, it is not appended to ``items``.
        '''
        item_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '':
            return

        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        m = re.search(r'<div id="itemTechSheet">\s*<h1>(.+?)</h1>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(r'<div id="itemTechSheet">.+?<div class="price">(.+?)</div>', page, flags=re.S)
        if m:
            s_price = m.group(1).strip()
            if s_price.find("¥") != -1:
                i_unit = "CNY"
            i_price = re.sub(r'<.+?>', '', s_price).replace('¥', '').strip()

        m = re.search(r'<div id="itemImagesBox".*?>\s*<img.+?class="mainImage" src="(.+?)">', page, flags=re.S)
        if m:
            i_img = m.group(1)

        # the size entry is labelled with either of two words
        i_size = ''
        m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺寸.+?)</li>', page, flags=re.S)
        if not m:
            m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>', page, flags=re.S)
        if m:
            i_size = m.group(1)

        i_number = ''
        m = re.search(r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S)
        if m:
            # "label:code" -> code; tolerate a missing colon
            parts = m.group(1).split(':')
            i_number = (parts[1] if len(parts) > 1 else parts[0]).strip()

        i = BagItem(self.brand_type)
        i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print('# itemPage: %s' % (i.outItem(),))

    def outItems(self, f):
        '''Write a header line followed by all rows to the file path ``f``.

        ``self.items`` is left untouched, so repeated calls do not stack
        header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class FerragamoBag():
    '''Scraper for the women's handbag listing of www.ferragamo.cn.

    Typical flow: ``bagPage(url)`` parses the listing into ``link_list``,
    ``bagItems()`` hands those links to ``BagItemM`` workers and collects
    their records in ``items``, and ``outItems(path)`` writes the records
    to a text file.
    '''

    def __init__(self):
        # Crawler used for every HTTP fetch.
        self.crawler = RetryCrawler()
        # Brand home page; also prepended to relative product links.
        self.home_url = 'http://www.ferragamo.cn'
        self.price_url = ''
        self.refers = None
        # Brand identifier handed to BagItem / BagItemM.
        self.brand_type = 'ferragamo'
        # Queue of (tab_name, tab_url, name, item_url, img, price, unit).
        self.link_list = []
        # Output records; outItems() joins them with newlines.
        self.items = []

    def _parse_price(self, s_price):
        '''Return ``(price, unit)`` for a raw price snippet.

        Both the half-width and the full-width yen sign are removed; the
        unit is ``'CNY'`` when either was present and ``''`` otherwise.
        '''
        unit = ''
        for sign in ('¥', '¥'):
            if sign in s_price:
                unit = 'CNY'
                s_price = s_price.replace(sign, '')
        return s_price.strip(), unit

    def bagPage(self, url):
        '''Fetch the listing page at ``url`` and queue every product on it.'''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return
        tab_name = '女士手袋'
        self.parse_item(page, tab_name, url)

    def parse_item(self, tab_page, tab_name, tab_url):
        '''Cut the product grid out of ``tab_page`` and pass it to run_items.'''
        m = re.search(
            r'<div class="view-content">.+?<div class="page-wrapper-product">(.+?)</script>\s*</div>\s*</div>',
            tab_page, flags=re.S)
        if m:
            self.run_items(m.group(1), tab_name, tab_url)

    def run_items(self, items_info, tab_name, tab_url):
        '''Append one ``link_list`` tuple per product tile in ``items_info``.'''
        p = re.compile(
            r'<div class="large.+?columns.+?">\s*<a href="(.+?)">\s*<img.*?src="(.+?)".*?/><span class="prodcaption">(.+?)</br>(.+?)</span>\s*</a>',
            flags=re.S)
        for item in p.finditer(items_info):
            # The href group is (.+?), so it can never be empty here.
            i_url = self.home_url + item.group(1)
            i_img, i_name = item.group(2), item.group(3)
            i_price, i_unit = self._parse_price(item.group(4))
            print(' '.join((i_url, i_img, i_name, i_price, i_unit)))
            self.link_list.append(
                (tab_name, tab_url, i_name, i_url, i_img, i_price, i_unit))

    def bagItems(self):
        '''Crawl every queued link with up to 10 ``BagItemM`` workers.'''
        if not self.link_list:
            # Nothing queued: do not build a worker pool of size zero.
            return
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(max_th, len(self.link_list)))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and print the resulting record.

        ``val`` is a ``link_list`` tuple. Values found on the detail page
        override the ones taken from the listing. The record is only
        printed, not stored; ``bagItems`` is the path that fills ``items``.
        '''
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '':
            return
        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        m = re.search(r'<div class="product-title">(.+?)</div>', page, flags=re.S)
        if m:
            # Collapse internal runs of whitespace to single spaces.
            i_name = ' '.join(m.group(1).strip().split())

        m = re.search(r'<div class="product-prices">(.+?)</div>', page, flags=re.S)
        if m:
            i_price, unit = self._parse_price(m.group(1))
            if unit:
                i_unit = unit

        m = re.search(
            r'<div class="item-list">\s*<ul.+?>\s*<li class="first">.+?<img.+?src="(.+?)".*?/>',
            page, flags=re.S)
        if m:
            i_img = m.group(1)

        i_size, i_number = '', ''
        m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S)
        if m:
            i_size, i_number = m.group(1).strip(), m.group(2).strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print('# itemPage: %s' % i.outItem())

    def outItems(self, f):
        '''Write the header line followed by every record to file ``f``.

        ``self.items`` is left untouched, so repeated exports do not
        accumulate header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class DiorBag():
    '''Scraper for the handbag collections on www.dior.cn.

    ``bagPage(url)`` walks every collection tab and queues its products in
    ``link_list``; ``bagItems()`` crawls the queue with ``BagItemM``
    workers; ``outItems(path)`` writes the collected records.
    '''

    def __init__(self):
        # Crawler used for every HTTP fetch.
        self.crawler = RetryCrawler()
        # Brand home page; prepended to the site-relative links and images.
        self.home_url = 'http://www.dior.cn'
        self.price_url = ''
        self.refers = None
        # Brand identifier handed to BagItem / BagItemM.
        self.brand_type = 'dior'
        # Queue of (tab_name, tab_url, name, item_url, img) tuples.
        self.link_list = []
        # Output records; outItems() joins them with newlines.
        self.items = []

    def bagPage(self, url):
        '''Collect the products of every collection tab found at ``url``.'''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return

        tab_list = []
        m = re.search(r'<ul class="tabsList collections">(.+?)</ul>', page, flags=re.S)
        if m:
            p = re.compile(
                r'<li class=".+?">\s+<a href="(.+?)" data-magento_call_page="(.+?)".+?>(.+?)</a>\s+</li>',
                flags=re.S)
            for tab in p.finditer(m.group(1)):
                # (display name, ajax data URL, URL used as referer later)
                tab_list.append((tab.group(3).strip(),
                                 self.home_url + tab.group(2),
                                 url + tab.group(1)))

        item_re = re.compile(
            r'<li class="li-product.+?>\s+<a href="(.*?)" class="linkProduct">.+?<img src="(.+?)".+?/>.+?<span class="description".+?>.+?<span class="title">(.+?)</span>.+?</span>\s+</a>\s+</li>',
            flags=re.S)
        for tab_name, tab_data_url, tab_url in tab_list:
            print(' '.join(('# tab:', tab_name, tab_data_url, tab_url)))
            tab_page = self.crawler.getData(tab_data_url, url)
            if not tab_page:
                # A failed fetch must not abort the remaining tabs.
                continue
            for item in item_re.finditer(tab_page):
                href = item.group(1)
                i_img = self.home_url + item.group(2)
                i_name = item.group(3)
                # Test the raw href: once prefixed it is never empty.
                i_url = self.home_url + href if href else ''
                print(' '.join((i_url, i_img, i_name)))
                if i_url:
                    self.link_list.append(
                        (tab_name, tab_url, i_name, i_url, i_img))
                else:
                    # No detail page to visit: store the listing data as is.
                    i = BagItem(self.brand_type)
                    i.initItem(tab_name, '', i_name, '', '', '', i_url, i_img)
                    self.items.append(i.outItem())

    def bagItems(self):
        '''Crawl every queued link with up to 10 ``BagItemM`` workers.'''
        if not self.link_list:
            # Nothing queued: do not build a worker pool of size zero.
            return
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(max_th, len(self.link_list)))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and append its record to ``items``.

        ``val`` is a ``link_list`` tuple
        ``(serie_title, refers, name, url[, img])``; the optional fifth
        element is used as the image when the page shows no thumbnail.
        '''
        serie_title, refers, i_name, i_url = val[:4]
        i_img = val[4] if len(val) > 4 else ''
        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        i_title, i_size, i_price, i_unit, i_number = '', '', '', '', ''

        m = re.search(r'<h2 class="" itemprop="name">(.+?)<br />(.+?)</h2>', page, flags=re.S)
        if m:
            i_title = m.group(1).strip()

        m = re.search(
            r'<li class="firstThumbnails">\s+<a href="#" class="active".+?>\s+<img src="(.+?)" alt="" />\s+</a>\s+</li>',
            page, flags=re.S)
        if m:
            i_img = self.home_url + m.group(1)

        m = re.search(
            r'<div class="modText">\s+<h4.+?>说明</h4>\s+<p>(.+?)</p>\s+</div>',
            page, flags=re.S)
        if m:
            i_desc = m.group(1)
            # The size is either one <br />-terminated line or the tail.
            m = re.search(r'尺寸:(.+?)<br />', i_desc, flags=re.S)
            if not m:
                m = re.search(r'尺寸:(.+?)$', i_desc, flags=re.S)
            if m:
                i_size = m.group(1).strip()

        m = re.search(
            r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>',
            page, flags=re.S)
        if m:
            parts = m.group(1).split('-')
            # Keep the segment after the first dash; skip when none exists.
            if len(parts) > 1:
                i_number = parts[1].strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size,
                   i_url, i_img, i_number)
        self.items.append(i.outItem())
        print(' '.join(('# itemPage :', serie_title, i_title, i_name, i_price,
                        i_unit, i_size, i_url, i_img)))

    def outItems(self, f):
        '''Write the header line followed by every record to file ``f``.

        ``self.items`` is left untouched, so repeated exports do not
        accumulate header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class FerragamoBag: """A class of ferragamo bag""" def __init__(self): # 抓取设置 # self.crawler = MyCrawler() self.crawler = RetryCrawler() # 品牌官网链接 self.home_url = "http://www.ferragamo.cn" self.price_url = "" self.refers = None # 品牌type self.brand_type = "ferragamo" # 抓取商品列表 self.link_list = [] self.items = [] def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == "": return tab_name = "女士手袋" self.parse_item(page, tab_name, url) def parse_item(self, tab_page, tab_name, tab_url): m = re.search( r'<div class="view-content">.+?<div class="page-wrapper-product">(.+?)</script>\s*</div>\s*</div>', tab_page, flags=re.S, ) if m: items_info = m.group(1) self.run_items(items_info, tab_name, tab_url) def run_items(self, items_info, tab_name, tab_url): p = re.compile( r'<div class="large.+?columns.+?">\s*<a href="(.+?)">\s*<img.*?src="(.+?)".*?/><span class="prodcaption">(.+?)</br>(.+?)</span>\s*</a>', flags=re.S, ) for item in p.finditer(items_info): i_url, i_img, i_name, s_price = item.group(1), item.group(2), item.group(3), item.group(4) i_price, i_unit = "", "" if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace("¥", "").replace("¥", "").strip() if i_url and i_url != "": print self.home_url + i_url, i_img, i_name, i_price, i_unit self.link_list.append((tab_name, tab_url, i_name, self.home_url + i_url, i_img, i_price, i_unit)) else: print self.home_url + i_url, i_img, i_name, i_price, i_unit i = BagItem(self.brand_type) i.initItem(tab_name, "", i_name, i_price, i_unit, "", i_url, i_img) self.items.append(i.outItem()) def bagItems(self): """ i = 0 for link in self.link_list: self.itemPage(link) i += 1 if i == 1: break """ max_th = 10 if len(self.link_list) > max_th: m_itemsObj = BagItemM(self.home_url, self.brand_type, max_th) else: m_itemsObj = BagItemM(self.home_url, self.brand_type, len(self.link_list)) m_itemsObj.createthread() m_itemsObj.putItems(self.link_list) m_itemsObj.run() 
self.items.extend(m_itemsObj.items) def itemPage(self, val): serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == "": return page = self.crawler.getData(i_url, refers) if not page or page == "": return m = re.search(r'<div class="product-title">(.+?)</div>', page, flags=re.S) if m: i_name = " ".join(m.group(1).strip().split()) m = re.search(r'<div class="product-prices">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace("¥", "").replace("¥", "").strip() m = re.search( r'<div class="item-list">\s*<ul.+?>\s*<li class="first">.+?<img.+?src="(.+?)".*?/>', page, flags=re.S ) if m: i_img = m.group(1) i_size = "" i_number = "" m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S) if m: i_size, i_number = m.group(1).strip(), m.group(2).strip() i = BagItem(self.brand_type) i.initItem(serie_title, "", i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print "# itemPage:", i.outItem() # self.items.append(i.outItem()) def outItems(self, f): s = "#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号" with open(f, "w") as f_item: self.items.insert(0, s) f_item.write("\n".join(self.items))
class taobaoSearch():
    '''Scraper for Taobao search-result pages.

    ``getPage(url)`` walks up to 10 result pages, reads the auction list
    embedded in each page's ``g_page_config`` script and stores one row
    per auction through ``MysqlAccess.insert_item``.
    '''

    def __init__(self):
        # Crawler used for every HTTP fetch.
        self.crawler = RetryCrawler()
        # Database access object used by getPage().
        self.mysqlAccess = MysqlAccess()
        # Site home page; used as the referer of the first request.
        self.home_url = 'http://www.taobao.com'
        self.refers = None
        # Kept for interface parity with the bag scrapers; unused here.
        self.link_list = []
        self.items = []
        # Brand names; not referenced by any method of this class.
        self.brand_names = [
            "艾沃", "保宁", "红太阳", "常润", "厨邦", "大华", "大同", "大王", "德馨斋", "德阳",
            "东古", "凤球唛", "福临门", "高真", "古龙", "冠生园", "广味源", "广祥泰", "龟甲万", "国味威",
            "海鸥", "海天", "好伴", "禾然", "和田宽", "恒顺", "湖西岛", "湖羊", "黄花园", "南食召",
            "吉成", "济美", "加加", "金冠园", "金兰", "金狮", "有味家", "金苏", "荆楚", "景山",
            "居易", "聚百鲜", "科沁万佳", "孔膳坊", "快鹿", "阆中", "老才臣", "食味的初相", "老蔡", "老恒和",
            "老潘头", "李锦记", "利民", "六月鲜", "春峰", "龙菲", "秋生饭堂", "龙牌", "隆昌", "楼茂记",
            "鲁花自然鲜", "云上", "禄荣", "麻旺", "美富达", "美极", "美味源", "蒙古真", "渔山隐", "米吉真味",
            "酿一村", "盘溪", "彭万春", "浦源", "奇峰", "千禾", "千鹤屋", "粮赞", "钱万隆", "清净园",
            "清香园", "仁昌记", "三不加", "悦意", "三和四美", "博爱酵园", "山古坊", "膳府", "膳之堂", "盛田",
            "四海", "寺冈", "苏美", "太太乐", "泰康", "唐人基", "唐世家", "淘大", "腾安", "同珍",
            "妥甸", "拓东", "丸天", "万通", "万字", "味事达", "五天", "犀浦", "仙家", "先市",
            "鲜渡", "咸亨", "香满园", "小东字", "笑厨", "新宇", "星湖", "徐同泰", "薛泰丰", "扬名",
            "尧记", "肴易食", "一统原创", "一休屋", "伊例家", "宜赤必", "优和", "鱼味鲜", "禹谟", "玉堂",
            "御酿坊", "缘木记", "粤香园", "灶基", "詹王", "张家三嫂", "长寿结", "珍极", "正信", "正阳河",
            "至味", "致美斋", "中邦", "中冷泉", "中调", "珠江桥", "梓山", "自然集", "佐参王", "佐香园",
            "中坝", "天府", "南吉", "清湖", "味华", "佐餐王", "一品江南", "金顶", "玉河", "巧媳妇",
            "齐鲁", "梁山好汉", "王家园子", "食圣", "山口", "川鹰", "德通", "新汶", "四海", "德馨斋",
            "玉兔", "灯塔", "仙鹤", "宏林", "贵族王中王", "万和", "口珍", "同福永", "威极", "嘉美乐",
            "天浩圆", "铁鸟", "恒裕", "周太", "海鸥", "太阳岛", "百花", "小神厨", "龙菲", "太和",
            "天一", "美乐", "三汇", "通海", "黑珍珠", "百乐", "吉鹤村", "岭桥", "瓦缸", "味莼园",
            "百花串", "锦酿", "福香居", "铁石", "石桥", "清华", "味邦", "光华", "罕王", "营宝",
            "非常", "大有丰", "沙陀", "味味晓", "云晓", "巧妈妈", "振龙", "乾缘", "稻香园", "一品斋",
            "孔雀", "武大郎", "绿芳", "天赐", "益彰", "建洛", "天口", "一品江南", "机轮", "溢美堂",
            "山乡", "榕江", "嘉乐美", "万路通", "肖大妈", "争荣", "仙源", "敬义泰", "昆湖", "鼎兴",
            "临江寺", "迈进", "玉和", "通德", "民天", "胡玉美", "楼茂记", "鼎丰", "古灯", "槐茂",
            "榕城", "BB", "汉记", "松城", "森江", "美狮", "龙华", "启航", "隆邦", "新汶",
            "四海", "龙之味", "北康", "金玉兰", "小二黑", "吉成",
        ]

    def _extract_auctions(self, page):
        '''Return the auction dicts embedded in a search-result ``page``.

        The list lives at ``mods.itemlist.data.auctions`` inside the
        ``g_page_config`` JSON object. An empty list is returned when the
        script block is absent, a level is missing, or the JSON is invalid.
        '''
        m = re.search(r'<script>\s+g_page_config = ({.+?});.+?</script>', page, flags=re.S)
        if not m:
            return []
        page_config_s = re.sub(r'\n+', '', m.group(1))
        try:
            node = json.loads(page_config_s)
        except ValueError as e:
            # One malformed page must not abort the whole crawl.
            print('# invalid g_page_config: %s' % e)
            return []
        for key in ("mods", "itemlist", "data", "auctions"):
            if not isinstance(node, dict) or key not in node:
                return []
            node = node[key]
        return node

    def getPage(self, url):
        '''Crawl up to 10 result pages for ``url`` and store every auction.

        Result pages are addressed by appending ``&bcoffset=1&s=<offset>``
        with a step of 48. The method sleeps 4 seconds after each page.
        '''
        position = 1
        i = 1
        i_url = url
        refers = self.home_url
        max_page = 10
        size_page = 48
        while i <= max_page:
            fetched_url = i_url
            page = self.crawler.getData(fetched_url, refers)
            # The page just requested becomes the referer of the next one.
            refers = fetched_url
            i_url = url + '&bcoffset=1&s=%s' % str(i * size_page)
            i += 1
            if not page:
                # Report the URL that actually failed, not the next one.
                print('not find data url: %s' % fetched_url)
                time.sleep(4)
                continue

            for item in self._extract_auctions(page):
                # Fall back to the running position when no id is present.
                item_id = position
                m = re.search(r'id=(\d+)', item["detail_url"], flags=re.S)
                if m:
                    item_id = m.group(1)
                # Keep only the leading number of the sales text, if any.
                item_sales = item["view_sales"]
                m = re.search(r'(\d+)', item["view_sales"], flags=re.S)
                if m:
                    item_sales = m.group(1)
                detail_url = "http:" + item["detail_url"]
                shop_url = "http:" + item["shopLink"]
                fields = (Common.time_s(Common.now()), position, item_id,
                          item["raw_title"], item["view_price"], item_sales,
                          item["user_id"], item["nick"], detail_url, shop_url)
                print(' '.join(['%s' % field for field in fields]))
                self.mysqlAccess.insert_item((
                    Common.time_s(Common.now()), str(item_id), str(position),
                    str(item["raw_title"]), str(item["view_price"]),
                    str(item_sales), detail_url, item["user_id"],
                    str(item["nick"]), shop_url))
                position += 1
            time.sleep(4)

    def getItems(self):
        '''Placeholder: the threaded item crawl is not enabled for Taobao.'''
        max_th = 10

    def itemPage(self, val):
        '''Print ``val``; no detail-page parsing is implemented here.'''
        print('# itemPage : %s' % (val,))

    def outItems(self, f):
        '''Write the header line followed by every record to file ``f``.

        ``self.items`` is left untouched, so repeated exports do not
        accumulate header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class ArmaniBag():
    '''Scraper for the women's bag listings of the Armani lines on armani.cn.

    ``bagPage()`` parses the three hard-coded listing pages into
    ``link_list``; ``bagItems()`` crawls the queue with ``BagItemM``
    workers; ``outItems(path)`` writes the collected records.
    '''

    def __init__(self):
        # Crawler used for every HTTP fetch.
        self.crawler = RetryCrawler()
        # Brand home page; prepended to the site-relative product links.
        self.home_url = 'http://www.armani.cn'
        self.price_url = ''
        self.refers = None
        # Brand identifier handed to BagItem / BagItemM.
        self.brand_type = 'armani'
        # Queue of (tab_name, tab_url, name, item_url, img, price, unit).
        self.link_list = []
        # Output records; outItems() joins them with newlines.
        self.items = []

    def bagPage(self):
        '''Queue every product shown on the three Armani listing pages.'''
        tab_list = [
            ("giorgio armani", "http://www.armani.cn/cn/giorgioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"),
            ("emporio armani", "http://www.armani.cn/cn/emporioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"),
            ("armani jeans", "http://www.armani.cn/cn/armanijeans/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"),
        ]
        # Groups: 1 href, 2 image attribute name, 3 image URL, 4 name, 5 price.
        p = re.compile(
            r'<div class="item hproduct".+?>.+?<a href="(.+?)".+?class="url">\s*<div class="hproductPhotoCont">\s*<img.+?(src|data-original)="(.+?)".*?/>\s*</div>\s*</a>\s*<div class="itemDesc">\s*<a.+?>\s*<h3.+?>(.+?)</h3>\s*</a>.+?<div class="itemPrice">.+?<span class="prezzoProdottoSaldo".*?>(.+?)</span>\s*</div>.+?</div>',
            flags=re.S)
        for tab_name, tab_data_url in tab_list:
            print(' '.join(('# tab:', tab_name, tab_data_url)))
            tab_page = self.crawler.getData(tab_data_url, self.home_url)
            if not tab_page:
                # A failed fetch must not abort the remaining lines.
                continue
            for item in p.finditer(tab_page):
                i_url = self.home_url + item.group(1)
                # Group 3 is the image URL; group 2 is only the attribute name.
                i_img = item.group(3)
                i_name = item.group(4).strip()
                s_price = item.group(5)
                print(' '.join((i_url, i_img, i_name, s_price)))
                i_unit = "CNY" if s_price.find("¥") != -1 else ""
                i_price = s_price.replace('¥', '').strip()
                self.link_list.append(
                    (tab_name, tab_data_url, i_name, i_url, i_img, i_price, i_unit))

    def bagItems(self):
        '''Crawl every queued link with up to 10 ``BagItemM`` workers.'''
        if not self.link_list:
            # Nothing queued: do not build a worker pool of size zero.
            return
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(max_th, len(self.link_list)))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and append its record to ``items``.

        ``val`` is a ``link_list`` tuple. Values found on the detail page
        override the ones taken from the listing.
        '''
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '':
            return
        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        m = re.search(r'<h2 class="productName">(.+?)</h2>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        # Prefer the zoom image; fall back to the first thumbnail.
        m = re.search(
            r'<div id="zoomImageWrapper">\s*<img.+?src="(.+?)".*?/>\s*</div>',
            page, flags=re.S)
        if not m:
            m = re.search(
                r'<div id="thumbsWrapper">.+?<div class="thumbElement".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>',
                page, flags=re.S)
        if m:
            i_img = m.group(1)

        m = re.search(
            r'<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>',
            page, flags=re.S)
        if m:
            currency = m.group(1)
            # Non-greedy, so only the tags are removed and not the text
            # between an opening and a closing tag.
            i_price = re.sub(r'<.*?>', '', m.group(2))
            i_unit = "CNY" if currency.find("¥") != -1 else currency

        i_size = ''
        m = re.search(r'<div class="attributes">(.+?)</div>', page, flags=re.S)
        if m:
            size_str = re.sub(r'<.*?>', '', m.group(1))
            i_size = re.sub(r'\s+', '', size_str)
            print(i_size)

        i_number = ''
        m = re.search(
            r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>',
            page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        # Store the rendered record, not the bound method.
        self.items.append(i.outItem())

    def outItems(self, f):
        '''Write the header line followed by every record to file ``f``.

        ``self.items`` is left untouched, so repeated exports do not
        accumulate header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class Item():
    '''One Tmall product together with the shop and brand data of its page.

    ``parserTM(val)`` loads the identifying fields and runs ``TMItem()``,
    which fetches the product page and fills the seller, shop, brand,
    category and monthly-sales attributes. ``outItemSql()`` returns the
    result as a tuple.
    '''

    def __init__(self):
        # Crawler used for every HTTP fetch.
        self.crawler = RetryCrawler()

        # Shop data
        self.shop_type = '1'     # shop type
        self.seller_id = ''      # seller id
        self.seller_name = ''    # seller name
        self.shop_id = ''        # shop id
        self.shop_name = ''      # shop name
        self.shop_url = ''       # shop URL

        # Referer sent with the product-page request
        self.refers = ''

        # Deal-record crawl parameters
        self.deal_pageSize = 15
        self.deal_maxPages = 100
        self.deal_bufferdays = 3  # look back 3 days

        # Review crawl parameters
        self.rate_pageSize = 20
        self.rate_maxPages = 100

        # Per-item state
        self.initItem()

    def initItem(self):
        '''Reset every per-item attribute to its empty default.'''
        # Crawl bookkeeping
        self.crawling_time = Common.now()
        self.crawling_begintime = ''   # start time of this crawl
        self.crawling_beginDate = ''   # date of this crawl
        self.crawling_beginHour = ''   # hour of this crawl

        # Product attributes
        self.item_id = ''         # product id
        self.item_name = ''       # product name
        self.item_price = ''      # product price
        self.item_url = ''        # product URL
        self.item_spuId = ''      # SPU id
        self.item_sellCount = 0   # monthly sales count
        self.brand_name = ''
        self.brand_id = ''
        self.category_id = ''

        # Product page
        self.item_page = None     # product home page

        # Product HTML URLs
        self.item_urls = []       # list of product links

        # Product HTML pages
        self.item_pages = {}      # product pages

        # Deal records
        self.deal_url = ''
        self.deal_stopCrawl = False
        self.deal_deadLine = 0.0   # latest deal time seen by the previous crawl
        self.deal_deadLine2 = 0.0  # earliest deal time seen by this crawl

    def _mdskip_suffix(self):
        '''Return the callback/timestamp query suffix for the initApi URL.'''
        return "&callback=setMdskip&timestamp=%s" % str(int(time.time() * 1000))

    def TMItem(self):
        '''Fetch the product page and extract shop, brand and sales data.

        Does nothing when ``item_url`` is empty.

        Raises:
            Common.InvalidPageException: the product page came back empty.
        '''
        if self.item_url == '':
            return
        page = self.crawler.getData(self.item_url, self.refers)
        if not page:
            raise Common.InvalidPageException("# TMItem: not find item page,itemid:%s,item_url:%s"%(str(self.item_id), self.item_url))

        m = re.search(r'sellerId:"(\d+)",', page, flags=re.S)
        if m:
            self.seller_id = m.group(1)
        m = re.search(r'shopId:"(\d+)",', page, flags=re.S)
        if m:
            self.shop_id = m.group(1)
        m = re.search(r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>', page, flags=re.S)
        if m:
            self.shop_url, self.shop_name = Common.fix_url(m.group(1)), m.group(2).strip()

        # Everything else is read from the TShop.Setup(...) call argument.
        m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S)
        if not m:
            return
        TShop_s = m.group(1).strip()
        m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S)
        if m:
            self.brand_name = Common.htmlDecode(m.group(1).strip())
        m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S)
        if m:
            self.brand_id = m.group(1)
        m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S)
        if m:
            self.category_id = m.group(1)
        m = re.search(r'"sellerNickName":"(.+?)",', TShop_s, flags=re.S)
        if m:
            self.seller_name = Common.urlDecode(m.group(1).strip())
        m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S)
        if m:
            initapi_url = Common.fix_url(m.group(1).strip()) + self._mdskip_suffix() + "&ref=%s" % Common.urlCode(self.refers)
            init_page = self.crawler.getData(initapi_url, self.item_url)
            if not init_page:
                # Covers both None and ''; re.search(None) would raise.
                print('# init page is null..')
            else:
                m = re.search(r'"sellCountDO":{"sellCount":(\d+),', init_page, flags=re.S)
                if m:
                    self.item_sellCount = m.group(1)

    def parserTM(self, val):
        '''Load ``val`` and crawl the product page.

        ``val`` is ``(item_id, item_name, item_price, item_url, refers,
        crawling_begintime)``; the last element is passed to
        ``time.localtime``.
        '''
        self.item_id, self.item_name, self.item_price, self.item_url, self.refers, self.crawling_begintime = val
        begin = time.localtime(self.crawling_begintime)
        # Date and hour at which this crawl started.
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        self.crawling_beginHour = time.strftime("%H", begin)
        self.TMItem()

    def outItemSql(self):
        '''Return the crawled fields as a tuple.'''
        return (Common.time_s(self.crawling_time), self.item_id, self.item_name,
                self.item_price, self.item_sellCount, self.item_url,
                self.seller_id, self.seller_name, self.shop_id, self.shop_name,
                self.shop_url, self.brand_id, self.brand_name, self.category_id,
                self.crawling_beginDate, self.crawling_beginHour)
class LouisvuittonBag():
    '''Scraper for the women's handbag categories on www.louisvuitton.cn.

    ``bagPage(url)`` reads the handbag sub-menu, follows the "next"
    pagination of every category and queues the products that
    ``Common.isBag`` accepts; ``bagItems()`` crawls the queue with
    ``BagItemM`` workers; ``outItems(path)`` writes the records.
    '''

    def __init__(self):
        # Crawler used for every HTTP fetch.
        self.crawler = RetryCrawler()
        # Brand home page; prepended to the site-relative links.
        self.home_url = 'http://www.louisvuitton.cn'
        self.price_url = ''
        self.refers = None
        # Brand identifier handed to BagItem / BagItemM.
        self.brand_type = 'louisvuitton'
        # Queue of (tab_name, tab_url, name, item_url, price, unit) tuples.
        self.link_list = []
        # Output records; outItems() joins them with newlines.
        self.items = []

    def bagPage(self, url):
        '''Queue the bag products of every handbag category found at ``url``.'''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return

        tab_list_info = ''
        m = re.search(r'<li class="mega-menu-item">\s*<div.+?data-megaMenu="women".*?>\s*<span class="onlyML">手袋</span>.+?</div>\s*<div class="mega-menu-content">.+?<ul class="level3">(.+?)</ul>', page, flags=re.S)
        if m:
            tab_list_info = m.group(1).strip()

        tab_list = []
        p = re.compile(r'<li class="mm-block">.+?<a.+?href="(.+?)">\s*<p class="mm-push-p">(.+?)</p>\s*</a>.+?</li>', flags=re.S)
        for tab_info in p.finditer(tab_list_info):
            tab_name = tab_info.group(2).strip()
            tab_url = self.home_url + tab_info.group(1)
            tab_list.append((tab_name, tab_url))
            print(' '.join((tab_name, tab_url)))

        next_re = re.compile(r'<button.+?class="pl-next".+?data-next="(.+?)">', flags=re.S)
        item_re = re.compile(r'<a id="sku_.*?" href="(.+?)".+?>.+?<div class="description">\s*<div class="productName toMinimize">(.+?)</div>\s*<div class="productPrice.+?data-htmlContent="(.+?)">\s*</div>\s*</div>', flags=re.S)
        for tab_name, tab_url in tab_list:
            refers = url
            print(' '.join(('# tab:', tab_name, tab_url)))
            tab_page = self.crawler.getData(tab_url, refers)
            # Follow "next" buttons; the page parsed below is the last one
            # that was fetched successfully.
            while tab_page:
                m = next_re.search(tab_page)
                if not m:
                    break
                refers = tab_url
                tab_url = re.sub(r'/to-\d+', '', tab_url) + "/to-%s" % m.group(1)
                next_page = self.crawler.getData(tab_url, refers)
                if not next_page:
                    break
                tab_page = next_page
            if not tab_page:
                # A failed fetch must not abort the remaining categories.
                continue

            for item in item_re.finditer(tab_page):
                # The href group is (.+?), so it can never be empty here.
                i_url = self.home_url + item.group(1)
                i_name, s_price = item.group(2), item.group(3)
                print(' '.join((i_url, i_name, s_price)))
                i_unit = "CNY" if s_price.find("¥") != -1 else ""
                i_price = s_price.replace('¥', '').strip()
                if Common.isBag(i_name):
                    self.link_list.append(
                        (tab_name, tab_url, i_name, i_url, i_price, i_unit))

    def bagItems(self):
        '''Crawl every queued link with up to 10 ``BagItemM`` workers.'''
        if not self.link_list:
            # Nothing queued: do not build a worker pool of size zero.
            return
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(max_th, len(self.link_list)))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and print the resulting record.

        ``val`` is a ``link_list`` tuple. Values found on the detail page
        override the ones taken from the listing. The record is only
        printed, not stored; ``bagItems`` is the path that fills ``items``.
        '''
        serie_title, refers, i_name, i_url, i_price, i_unit = val
        if i_url == '':
            return
        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        # Defaults for fields that the page may not provide.
        i_img, i_size, i_number = '', '', ''

        m = re.search(r'<div class="productName title" id="productName">\s*<h1 itemprop="name">(.+?)</h1>\s*</div>', page, flags=re.S)
        if m:
            i_name = m.group(1).strip()

        m = re.search(r'<table class="priceButton">\s*<tr>\s*<td class="priceValue price-sheet">(.+?)</td>', page, flags=re.S)
        if m:
            i_price = m.group(1).strip()
            if i_price.find("¥") != -1:
                i_unit = "CNY"

        m = re.search(r'<noscript>\s*<img src="(.+?)".+?itemprop="image".*?/>\s*</noscript', page, flags=re.S)
        if m:
            i_img = m.group(1)

        m = re.search(r'<div class="textClientInfo exp_content".*?>\s*<div class="innerContent functional-text">(.+?)</div>', page, flags=re.S)
        if m:
            s_content = m.group(1).replace(' ', '').strip()
            # The size is the text up to and including the first marker
            # found; without a marker the size stays empty.
            for marker in ('宽)', '高)'):
                if s_content.find(marker) != -1:
                    s_size = s_content.split(marker)[0]
                    i_size = re.sub('<.+?>', '', s_size) + marker
                    break

        m = re.search(r'<h2 class="sku reading-and-link-text">(.+?)</h2>', page, flags=re.S)
        if not m:
            m = re.search(r'<meta itemprop="identifier" content="sku:(.+?)"/>', page, flags=re.S)
        if m:
            i_number = m.group(1).strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print('# itemPage: %s' % i.outItem())

    def outItems(self, f):
        '''Write the header line followed by every record to file ``f``.

        ``self.items`` is left untouched, so repeated exports do not
        accumulate header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class BossBag():
    '''Scraper for the handbag listing of store.hugoboss.cn.

    ``bagPage(url)`` parses the listing into ``link_list``;
    ``bagItems()`` crawls the queue with ``BagItemM`` workers;
    ``outItems(path)`` writes the collected records.
    '''

    def __init__(self):
        # Crawler used for every HTTP fetch.
        self.crawler = RetryCrawler()
        # Brand home page; prepended to the site-relative product links.
        self.home_url = 'http://store.hugoboss.cn'
        self.price_url = ''
        self.refers = None
        # Brand identifier handed to BagItem / BagItemM.
        self.brand_type = 'boss'
        # Queue of (tab_name, tab_url, name, item_url, img, price, unit).
        self.link_list = []
        # Output records; outItems() joins them with newlines.
        self.items = []

    def _parse_price(self, s_price):
        '''Return ``(price, unit)`` for a raw price snippet.

        Both the half-width and the full-width yen sign are removed; the
        unit is ``'CNY'`` when either was present and ``''`` otherwise.
        '''
        unit = ''
        for sign in ('¥', '¥'):
            if sign in s_price:
                unit = 'CNY'
                s_price = s_price.replace(sign, '')
        return s_price.strip(), unit

    def bagPage(self, url):
        '''Fetch the listing page at ``url`` and queue every product on it.'''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return
        tab_name = '手袋'
        self.parse_item(page, tab_name, url)

    def parse_item(self, tab_page, tab_name, tab_url):
        '''Cut the product list out of ``tab_page`` and pass it to run_items.'''
        items_info = ''
        m = re.search(
            r'<div class="productlist-widget">.+?<div class="container">\s*<ul.+?>(.+?)</ul>',
            tab_page, flags=re.S)
        if m:
            items_info = m.group(1)
        self.run_items(items_info, tab_name, tab_url)

    def run_items(self, items_info, tab_name, tab_url):
        '''Append one ``link_list`` tuple per product entry in ``items_info``.'''
        p = re.compile(
            r'<li class="productlist-item ">\s*<div class="product-image".+?>\s*<a.+?><img src="(.+?)".+?/>\s*</a>\s*</div>.+?<div class="product-title">\s*<a href="(.+?)".+?>(.+?)</a>\s*</div>.+?<p>\s*<span class="product-price">(.+?)</span>\s*</p>\s*</li>',
            flags=re.S)
        for item in p.finditer(items_info):
            i_img = item.group(1)
            # The href group is (.+?), so it can never be empty here.
            i_url = self.home_url + item.group(2)
            i_name = item.group(3)
            i_price, i_unit = self._parse_price(item.group(4))
            print(' '.join((i_url, i_img, i_name, i_price, i_unit)))
            self.link_list.append(
                (tab_name, tab_url, i_name, i_url, i_img, i_price, i_unit))

    def isBag(self, name):
        '''Return True when ``name`` contains a bag keyword and no clothing keyword.'''
        bag_info = ["包", "袋", "皮夹", "钱夹"]
        other_info = ["裤", "T恤", "衬衫", "礼服", "上衣"]
        if not any(name.find(b_info) != -1 for b_info in bag_info):
            return False
        return not any(name.find(o_info) != -1 for o_info in other_info)

    def bagItems(self):
        '''Crawl every queued link with up to 10 ``BagItemM`` workers.'''
        if not self.link_list:
            # Nothing queued: do not build a worker pool of size zero.
            return
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(max_th, len(self.link_list)))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and print the resulting record.

        ``val`` is a ``link_list`` tuple. Values found on the detail page
        override the ones taken from the listing. The record is only
        printed, not stored; ``bagItems`` is the path that fills ``items``.
        '''
        serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val
        if i_url == '':
            return
        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        m = re.search(r'<h1 class="product-name">(.+?)</h1>', page, flags=re.S)
        if m:
            # Collapse internal runs of whitespace to single spaces.
            i_name = ' '.join(m.group(1).strip().split())

        m = re.search(
            r'<div class="product-prices">.+?<dd class="saleprice">(.+?)</dd>.+?</div>',
            page, flags=re.S)
        if m:
            i_price, unit = self._parse_price(m.group(1))
            if unit:
                i_unit = unit

        m = re.search(
            r'<div class="image">.+?<div class="container".*?>\s*<table.+?>\s*<tr>\s*<td>\s*<a.+?>\s*<img.+?class="thumb".+?big="(.+?)".*?/>\s*</a>\s*</td>',
            page, flags=re.S)
        if m:
            i_img = m.group(1)

        i_size = ''
        m = re.search(
            r'<div class="tabpage inc".+?>.+?<span.*?>(尺寸大小.+?)</span>',
            page, flags=re.S)
        if m:
            parts = m.group(1).split(':')
            # Take the text after the colon; without one, use the fallback.
            if len(parts) > 1:
                i_size = parts[1]
        if i_size == '':
            m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>', page, flags=re.S)
            if m:
                i_size = re.sub(r'<.+?>', '', m.group(1))

        i_number = ''
        m = re.search(
            r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>',
            page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print('# itemPage: %s' % i.outItem())

    def outItems(self, f):
        '''Write the header line followed by every record to file ``f``.

        ``self.items`` is left untouched, so repeated exports do not
        accumulate header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class DiorBag():
    '''Scraper for the handbag collections on www.dior.cn.

    ``bagPage(url)`` walks every collection tab and queues its products in
    ``link_list``; ``bagItems()`` crawls the queue with ``BagItemM``
    workers; ``outItems(path)`` writes the collected records.
    '''

    def __init__(self):
        # Crawler used for every HTTP fetch.
        self.crawler = RetryCrawler()
        # Brand home page; prepended to the site-relative links and images.
        self.home_url = 'http://www.dior.cn'
        self.price_url = ''
        self.refers = None
        # Brand identifier handed to BagItem / BagItemM.
        self.brand_type = 'dior'
        # Queue of (tab_name, tab_url, name, item_url, img) tuples.
        self.link_list = []
        # Output records; outItems() joins them with newlines.
        self.items = []

    def _collect_tabs(self, page, url):
        '''Return ``(name, data_url, tab_url)`` for each collection tab in ``page``.'''
        m = re.search(r'<ul class="tabsList collections">(.+?)</ul>', page, flags=re.S)
        if not m:
            return []
        p = re.compile(r'<li class=".+?">\s+<a href="(.+?)" data-magento_call_page="(.+?)".+?>(.+?)</a>\s+</li>', flags=re.S)
        return [(tab.group(3).strip(), self.home_url+tab.group(2), url+tab.group(1))
                for tab in p.finditer(m.group(1))]

    def bagPage(self, url):
        '''Collect the products of every collection tab found at ``url``.'''
        page = self.crawler.getData(url, self.home_url)
        if not page:
            return

        p = re.compile(r'<li class="li-product.+?>\s+<a href="(.*?)" class="linkProduct">.+?<img src="(.+?)".+?/>.+?<span class="description".+?>.+?<span class="title">(.+?)</span>.+?</span>\s+</a>\s+</li>', flags=re.S)
        for tab_name, tab_data_url, tab_url in self._collect_tabs(page, url):
            print(' '.join(('# tab:', tab_name, tab_data_url, tab_url)))
            tab_page = self.crawler.getData(tab_data_url, url)
            if not tab_page:
                # A failed fetch must not abort the remaining tabs.
                continue
            for item in p.finditer(tab_page):
                href, i_name = item.group(1), item.group(3)
                i_img = self.home_url + item.group(2)
                # Test the raw href: once prefixed it is never empty.
                i_url = ''
                if href:
                    i_url = self.home_url + href
                print(' '.join((i_url, i_img, i_name)))
                if i_url:
                    self.link_list.append((tab_name, tab_url, i_name, i_url, i_img))
                    continue
                # No detail page to visit: store the listing data as is.
                i = BagItem(self.brand_type)
                i.initItem(tab_name, '', i_name, '', '', '', i_url, i_img)
                self.items.append(i.outItem())

    def bagItems(self):
        '''Crawl every queued link with up to 10 ``BagItemM`` workers.'''
        if not self.link_list:
            # Nothing queued: do not build a worker pool of size zero.
            return
        max_th = 10
        m_itemsObj = BagItemM(self.home_url, self.brand_type, min(max_th, len(self.link_list)))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def itemPage(self, val):
        '''Fetch one product page and append its record to ``items``.

        ``val`` is a ``link_list`` tuple
        ``(serie_title, refers, name, url[, img])``; the optional fifth
        element is used as the image when the page shows no thumbnail.
        '''
        serie_title, refers, i_name, i_url = val[:4]
        i_img = val[4] if len(val) > 4 else ''
        page = self.crawler.getData(i_url, refers)
        if not page:
            return

        i_title, i_size, i_price, i_unit, i_number = '', '', '', '', ''

        m = re.search(r'<h2 class="" itemprop="name">(.+?)<br />(.+?)</h2>', page, flags=re.S)
        if m:
            i_title = m.group(1).strip()

        m = re.search(r'<li class="firstThumbnails">\s+<a href="#" class="active".+?>\s+<img src="(.+?)" alt="" />\s+</a>\s+</li>', page, flags=re.S)
        if m:
            i_img = self.home_url + m.group(1)

        m = re.search(r'<div class="modText">\s+<h4.+?>说明</h4>\s+<p>(.+?)</p>\s+</div>', page, flags=re.S)
        if m:
            i_desc = m.group(1)
            # The size is either one <br />-terminated line or the tail.
            m = re.search(r'尺寸:(.+?)<br />', i_desc, flags=re.S)
            if not m:
                m = re.search(r'尺寸:(.+?)$', i_desc, flags=re.S)
            if m:
                i_size = m.group(1).strip()

        m = re.search(r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>', page, flags=re.S)
        if m:
            parts = m.group(1).split('-')
            # Keep the segment after the first dash; skip when none exists.
            if len(parts) > 1:
                i_number = parts[1].strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        self.items.append(i.outItem())
        print(' '.join(('# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img)))

    def outItems(self, f):
        '''Write the header line followed by every record to file ``f``.

        ``self.items`` is left untouched, so repeated exports do not
        accumulate header lines.
        '''
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + list(self.items)))
class ChanelBag(): '''A class of chanel bag''' def __init__(self): # 抓取设置 #self.crawler = MyCrawler() self.crawler = RetryCrawler() # 品牌官网链接 self.home_url = 'http://www.chanel.com' self.price_url = 'http://ws.chanel.com/pricing/zh_CN/fashion/%s/?i_client_interface=fsh_v3_misc&i_locale=zh_CN&format=json&callback=localJsonpPricingCallback' self.refers = None # 品牌type self.brand_type = 'chanel' # 抓取商品列表 self.link_list = [] self.items = [] def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return f_url = 'http://www.chanel.com/zh_CN/fashion/products/handbags/g.metiers-d-art-paris-salzburg-2014-15.c.15A.html' m = re.search(r'<div class="category-container category-black".+?>.+?<ul>\s*<li>\s*<a class="no-select" target="_blank" href="(.+?)" data-cat="查看系列">', page, flags=re.S) if m: f_url = self.home_url + m.group(1) page = self.crawler.getData(f_url, url) if not page or page == '': return tab_list = [] m = re.search(r'<div class="mosaic-nav">\s*<ul class="nav">(.+?)</ul>', page, flags=re.S) if m: tab_list_info = m.group(1) p = re.compile(r'<li class="no-select nav-item">\s*<a.+?href="(.+?)">\s*<h2>(.+?)</h2>\s*</a>', flags=re.S) for tab_info in p.finditer(tab_list_info): tab_url, tab_name = tab_info.group(1), re.sub(r'<.+?>','',tab_info.group(2).replace(' ',' ').strip()) if tab_url == '#': tab_list.append((tab_name, f_url)) else: tab_list.append((tab_name, self.home_url+tab_url)) refers = url for tab in tab_list: tab_name, tab_url = tab print '# tab:',tab_name, tab_url tab_page = self.crawler.getData(tab_url, refers) refers = tab_url m = re.search(r'"products":\s*(\[.+?\])\s*},\s*"deeplink"', page, flags=re.S) if m: prods = m.group(1) js_items= json.loads(prods) for js_item in js_items: i_title = js_item["title"].replace(' ',' ') items = js_item["items"] for item in items: if not item.has_key("title"): continue i_name = item["title"].replace(' ',' ') i_url = self.home_url + item["href"] print tab_name, i_title, i_name, 
i_url self.link_list.append((tab_name, i_title, tab_url, i_name, i_url)) def bagItems(self): #for link in self.links: self.itemPage(link) max_th = 10 if len(self.link_list) > max_th: m_itemsObj = BagItemM(self.home_url,self.brand_type, max_th) else: m_itemsObj = BagItemM(self.home_url,self.brand_type, len(self.link_list)) m_itemsObj.createthread() m_itemsObj.putItems(self.link_list) m_itemsObj.run() self.items.extend(m_itemsObj.items) def itemPage(self, val): serie_title, i_title, refers, i_name, i_url = val page = self.crawler.getData(i_url, refers) if not page or page == '': return i_name, i_img, ref_price, i_size, i_price, i_unit, i_number = '', '', '', '', '', '', '' m = re.search(r'<div class="product-details details".*?>.+?<p class="description info.*?">(.+?)</p>', page, flags=re.S) if m: i_name = re.sub(r'<.+?>', '', m.group(1)).strip() else: m = re.search(r'<title>(.+?)</title>', page, flags=re.S) if m: i_name = m.group(1).split('-')[0].strip() m = re.search(r'<div class="productimage.*?"><img src="(.+?)".*?/>', page, flags=re.S) if m: i_img = self.home_url + m.group(1) p = re.compile(r'<p class="size info">(.+?)</p>', flags=re.S) for size in p.finditer(page): if self.item_size != '': i_size += '-' + size.group(1) else: i_size = size.group(1) #m = re.search(r'<div class="ref info">\s*<p>(.+?)</p>', page, flags=re.S) #if m: p = re.compile(r'<div class="ref info">\s*<p>(.+?)</p>', flags=re.S) for number in p.finditer(page): item_number = number.group(1) if self.item_number != '': self.item_number += '-' + item_number else: self.item_number = item_number refs = item_number.split(' ')[:-1] ref_price = ''.join(refs) p_url = self.price_url %ref_price data = self.crawler.getData(p_url, i_url) if not data or data == '': return # 抽取json报文 r = re.search(r'localJsonpPricingCallback\(\[(.+?)\]\)', data, flags=re.S) if r: price, unit = '', '' try: js_data = json.loads(r.group(1)) price, unit = js_data["price"]["amount"], js_data["price"]["currency-symbol"] except 
Exception as e: m = re.search(r'"amount":"(.+?)"', data, flags=re.S) if m: price = m.group(1) m = re.search(r'"currency-symbol":"(.+?)"', data, flags=re.S) if m: unit = m.group(1) if self.item_price != '': if price: i_price += '-' + price else: if price: i_price = price if unit: i_unit = unit i = BagItem(self.brand_type) i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) #print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items))
class BottegavenetaBag(): '''A class of bottegaveneta bag''' def __init__(self): # 抓取设置 #self.crawler = MyCrawler() self.crawler = RetryCrawler() # 品牌官网链接 self.home_url = 'http://www.bottegaveneta.com' self.price_url = '' self.refers = None # 品牌type self.brand_type = 'bottegaveneta' # 抓取商品列表 self.link_list = [] self.items = [] def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return p = re.compile( r'<div id="slot_\d+".+?<a.+?href="(.+?)".+?>\s*<img.+?src="(.+?)".+?/>\s*<div class="iteminfo">\s*<div class="headgroup">\s*<div class="extra">\s*<h1 class="modelname">(.+?)</h1>', flags=re.S) for item in p.finditer(page): i_url, i_img, i_name = item.group(1), item.group(2), item.group(3) print i_url, i_img, i_name if i_url and i_url != '': self.link_list.append(('', url, i_name, i_url, i_img)) else: i = BagItem(self.brand_type) i.initItem('', url, i_name, i_url, i_img) self.items.append(i.outItem()) p = re.compile(r'<div class="slot lazySlot".+?data-slot="(.+?)".+?>', flags=re.S) for item in p.finditer(page): data_info = item.group(1) data_info_str = data_info.replace('"', '"') i_url, i_img, i_name = '', '', '' m = re.search(r'"Link":"(.+?)"', data_info_str, flags=re.S) if m: i_url = m.group(1) m = re.search(r'"ModelName":"(.+?)",', data_info_str, flags=re.S) if m: i_name = m.group(1) print i_url, i_img, i_name if i_url and i_url != '': self.link_list.append(('', url, i_name, i_url, i_img)) else: i = BagItem(self.brand_type) i.initItem('', url, i_name, i_url, i_img) self.items.append(i.outItem()) def bagItems(self): """ i = 0 for link in self.link_list: self.itemPage(link) i += 1 if i == 1: break """ max_th = 10 if len(self.link_list) > max_th: m_itemsObj = BagItemM(self.home_url, self.brand_type, max_th) else: m_itemsObj = BagItemM(self.home_url, self.brand_type, len(self.link_list)) m_itemsObj.createthread() m_itemsObj.putItems(self.link_list) m_itemsObj.run() self.items.extend(m_itemsObj.items) def itemPage(self, val): 
serie_title, refers, i_name, i_url, i_img = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search( r'<h1 class="producTitle".+?>\s*<div class="modelName".+?>\s*<span class="modelName">(.+?)</span>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search( r'<div class="mainImage".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: i_img = m.group(1) else: m = re.search( r'<section id="bgItem">\s*<img.+?src="(.+?)".*?/>\s*</section>', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search( r'<div class="localizedAttributes">.*?<div class="height">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r'<div class="localizedAttributes">.*?<div class="depth">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r'<div class="localizedAttributes">.*?<div class="length_of_strap">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r'<div class="localizedAttributes">.*?<div class="width">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" i_number m = re.search( r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img, i_number) print '# itemPage:', i.outItem() #self.items.append(i.outItem()) #print '# itemPage :', serie_title, i_name, i_url, i_img, i_size def outItems(self, f): s = 
'#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items))
class GivenchyBag(): '''A class of givenchy bag''' def __init__(self): # 抓取设置 #self.crawler = MyCrawler() self.crawler = RetryCrawler() # 品牌官网链接 self.home_url = 'http://www.givenchy.com' self.price_url = '' self.refers = None # 品牌type self.brand_type = 'givenchy' # 抓取商品列表 self.link_list = [] self.items = [] def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return tab_list = [] m = re.search(r'<p>女装配饰</p>\s+<ul class="menu-level-4">(.+?)</ul>', page, flags=re.S) if m: tabs_list_info = m.group(1) p = re.compile( r'<li class="submenu-item">\s+<a.+?href="(.+?)">\s+<span lang="fr" class="lang-fr">(.+?)</span>(.+?)</a>\s+</li>', flags=re.S) for tab in p.finditer(tabs_list_info): tab_list.append( (tab.group(2) + tab.group(3).strip(), tab.group(1))) for tab in tab_list: tab_name, tab_url = tab print '# tab:', tab_name, tab_url tab_page = self.crawler.getData(tab_url, url) m = re.search( r'<div id="layer-1".+?>\s+<a href="(.+?)" class="layer-links">.+?</a>', tab_page, flags=re.S) if m: ajax_url = self.home_url + m.group( 1) + "?ajax=true&fragment=true" ajax_data = self.crawler.getData(ajax_url, tab_url) if ajax_data: #data = json.loads(ajax_data) #if data and data.has_key("html"): # print data["html"].decode("unicode-escape") r_data = ajax_data.decode("unicode-escape") if r_data: m = re.search(r'"html":"(.+?)"}', r_data, flags=re.S) if m: data_html = m.group(1).replace("\/", "/") #print data_html #break p = re.compile( r'<li class="lookbook-item line" data-idlook="\d+">\s+<div class="disp-n">.+?<div class="look-info article">\s+<p>(.+?)</p>.+?<p class="look-ref">(.+?)</p>.+?</div>.+?</div>\s+<a href="(.+?)".+?>.+?<img .+?data-src="(.+?)".*?/>\s+</li>', flags=re.S) for item in p.finditer(data_html): i_url, i_img, s_number, i_name = self.home_url + item.group( 3), item.group(4), item.group(2), re.sub( r'<.+?>', '', item.group(1)).strip() i_number = '' m = re.search( r'<span 
class="look-ref-sku">\s*<span.+?>(.+?)</span>\s*</span>', s_number, flags=re.S) if m: i_number = m.group(1) print i_url, i_img, i_name, i_number if Common.isBag(i_name): self.link_list.append( (tab_name, tab_url, i_name, i_url, i_img, i_number)) #i = BagItem(self.home_url, self.brand_type) #i.initItem(tab_name, '', i_name, '', '', '', i_url, i_img, i_number) #self.items.append(i.outItem()) def bagItems(self): #for link in self.link_list: self.itemPage(link) max_th = 10 if len(self.link_list) > max_th: m_itemsObj = BagItemM(self.home_url, self.brand_type, max_th) else: m_itemsObj = BagItemM(self.home_url, self.brand_type, len(self.link_list)) m_itemsObj.createthread() m_itemsObj.putItems(self.link_list) m_itemsObj.run() self.items.extend(m_itemsObj.items) def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items))
class YslBag(): '''A class of ysl bag''' def __init__(self): # 抓取设置 #self.crawler = MyCrawler() self.crawler = RetryCrawler() # 品牌官网链接 self.home_url = 'http://www.ysl.com' self.price_url = '' self.refers = None # 品牌type self.brand_type = 'ysl' # 抓取商品列表 self.link_list = [] self.items = [] def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return tab_list = [] m = re.search(r'<ul class="shopGender sep"><li class="shopWoman level1".+?>(.+?)</li><li class="shopMan level1"', page, flags=re.S) if m: tab_list_info = m.group(1) p = re.compile(r'<li data-dept=".+?".*?><a href="(.+?)">(.+?)</a>.*?</li>', flags=re.S) for tab_info in p.finditer(tab_list_info): tab_url, tab_name = tab_info.group(1), tab_info.group(2).strip() print tab_url, tab_name tab_list.append((tab_name, tab_url)) if tab_name == '手提袋': break #tab_list = [("手提袋","http://www.ysl.com/wy/shop-product/%E5%A5%B3%E5%A3%AB/%E6%89%8B%E6%8F%90%E8%A2%8B")] i = 0 for tab in tab_list: refers = url tab_name, tab_url = tab print '# tab:',tab_name, tab_url tab_page = self.crawler.getData(tab_url, refers) self.parse_item(tab_page,tab_name,tab_url) m = re.search(r'<div class="pagnum".+?>(.+?)</div>', tab_page, flags=re.S) if m: pagelist_info = m.group(1) p = re.compile(r'<a href="(.+?)">\d+</a>', flags=re.S) page_list = [] for page in p.finditer(pagelist_info): page_info = self.crawler.getData(page.group(1), tab_url) #print page.group(1) self.parse_item(page_info,tab_name,tab_url) def parse_item(self, tab_page, tab_name, tab_url): items_info = '' m = re.search(r'<div id="productsContainer".+?>(.+?)</div>\s*</div>\s*</div>', tab_page, flags=re.S) if m: items_info = m.group(1) self.run_items(items_info, tab_name, tab_url) else: m = re.search(r'<div class="productsFromGrid.+?>(.+?)</div>\s*</div>\s*</div>\s*</div>', tab_page, flags=re.S) if m: items_info = m.group(1) self.run_items(items_info, tab_name, tab_url) def run_items(self, items_info, tab_name, tab_url): p = 
re.compile(r'<.+?>\s*<a.+?>\s*<img.+?src="(.+?)".*?>\s*</a><a href="(.+?)">.+?<div class="infoDescription">(.+?)</div>\s*(<div class="infoPrice">.+?</div>).+?</a>\s*<.+?>', flags=re.S) for item in p.finditer(items_info): i_img, i_url, s_name, price_info = item.group(1),item.group(2),item.group(3),item.group(4) i_name = re.sub(r'<.+?>','',s_name) i_price, i_unit = '', '' m = re.search(r'<div.+?class="newprice">(.+?)</div>', price_info, flags=re.S) if m: s_price = re.sub(r'<.+?>','',m.group(1)) if s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').strip() if i_url and i_url != '': if Common.isBag(i_name) or Common.isBag(unquote(i_url)): print self.home_url+i_url, i_img, i_name, i_price, i_unit self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_img,i_price,i_unit)) else: if Common.isBag(i_name) or Common.isBag(unquote(i_url)): print self.home_url+i_url, i_img, i_name, i_price, i_unit i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem()) def bagItems(self): """ i = 0 for link in self.link_list: self.itemPage(link) i += 1 if i == 1: break """ max_th = 10 if len(self.link_list) > max_th: m_itemsObj = BagItemM(self.home_url, self.brand_type, max_th) else: m_itemsObj = BagItemM(self.home_url, self.brand_type, len(self.link_list)) m_itemsObj.createthread() m_itemsObj.putItems(self.link_list) m_itemsObj.run() self.items.extend(m_itemsObj.items) def itemPage(self, val): serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<div id="itemInfo">.+?<h1><span class="customItemDescription" itemprop="name">(.+?)</span></h1>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search(r'<div id="itemPrice".+?><div.*?class="newprice">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() s_price = re.sub(r'<.+?>','',s_price) if 
s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').strip() else: i_price = s_price m = re.search(r'<div id="mainImageContainer"><img.+?src="(.+?)".*?/></div>', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search(r'<div class="itemDimensions">.+?<span class="dimensions">(.+?)</span></div>', page, flags=re.S) if m: i_size = m.group(1) i_number m = re.search(r'<div class="styleIdDescription">货号.+?<span.*?>(.+?)</span></div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem() #self.items.append(i.outItem()) def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items))
class TMCrawler(): '''A class of TMall shop''' def __init__(self): # 抓取设置 #self.crawler = MyCrawler() self.crawler = RetryCrawler() # db self.mysqlAccess = MysqlAccess() # mysql access # 品牌官网链接 self.home_url = 'http://www.taobao.com' self.refers = None # 抓取商品列表 self.link_list = [] self.items = [] self.begin_time = Common.now() def getPage(self, url, shop_home_url): position = 1 i = 1 max_page = 0 asyn_url = '' i_url = url refers = shop_home_url result_s = self.get_asyn_data(i_url, refers, shop_home_url) m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>', result_s, flags=re.S) if m: max_page = int(m.group(1)) print '# page num:', max_page while i <= max_page: m = re.search(r'<div class="J_TItems">(.+?)<div class="pagination">', result_s, flags=re.S) if m: items_s = m.group(1) p = re.compile(r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>') j = 1 for item in p.finditer(items_s): item_id, url_s, item_name, price_symbol, price = item.group(1), item.group(2), Common.htmlDecode(item.group(3).strip()), item.group(4).strip(), item.group(5).strip() if url_s.find('http') == -1: item_url = 'http:' + url_s else: item_url = url_s print '### item ###' print '# item val:', item_id, item_name, price, item_url item = Item() item.parserTM((item_id, item_name, price, item_url, i_url, self.begin_time)) print '# item info:',item.outItemSql() self.mysqlAccess.insert_parser_item_info(item.outItemSql()) time.sleep(2) refers = i_url if i_url.find('pageNo=') == -1: i_url = re.sub(r'&tsearch=y','&pageNo=%d&tsearch=y#anchor' % i, refers) else: i_url = re.sub(r'&pageNo=\d+&','&pageNo=%d&' % i, refers) i += 1 time.sleep(2) result_s = self.get_asyn_data(i_url, refers, shop_home_url) def get_asyn_data(self, i_url, refers, shop_home_url): result = '' result_s = '' page = 
self.crawler.getData(i_url, refers) m = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>', page, flags=re.S) if m: ts = '?_ksTS=%s&callback=jsonp135&' % (str(int(time.time()*1000)) + '_' + str(random.randint(100,999))) a_url = shop_home_url + Common.htmlDecode(m.group(1)) asyn_url = re.sub('\?', ts, a_url) result = self.crawler.getData(asyn_url, i_url) m = re.search(r'jsonp135\("(.+?)"\)', result, flags=re.S) if m: result_s = re.sub(r'\\"', '"', m.group(1)) return result_s def getItems(self): #for link in self.link_list: self.itemPage(link) max_th = 10 #if len(self.link_list) > max_th: # m_itemsObj = BagItemM(self.home_url,self.brand_type, max_th) #else: # m_itemsObj = BagItemM(self.home_url,self.brand_type, len(self.link_list)) #m_itemsObj.createthread() #m_itemsObj.putItems(self.link_list) #m_itemsObj.run() #self.items.extend(m_itemsObj.items) def itemPage(self, val): print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img
class ChanelBag(): '''A class of chanel bag''' def __init__(self): # 抓取设置 #self.crawler = MyCrawler() self.crawler = RetryCrawler() # 品牌官网链接 self.home_url = 'http://www.chanel.com' self.price_url = 'http://ws.chanel.com/pricing/zh_CN/fashion/%s/?i_client_interface=fsh_v3_misc&i_locale=zh_CN&format=json&callback=localJsonpPricingCallback' self.refers = None # 品牌type self.brand_type = 'chanel' # 抓取商品列表 self.link_list = [] self.items = [] def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return f_url = 'http://www.chanel.com/zh_CN/fashion/products/handbags/g.metiers-d-art-paris-salzburg-2014-15.c.15A.html' m = re.search( r'<div class="category-container category-black".+?>.+?<ul>\s*<li>\s*<a class="no-select" target="_blank" href="(.+?)" data-cat="查看系列">', page, flags=re.S) if m: f_url = self.home_url + m.group(1) page = self.crawler.getData(f_url, url) if not page or page == '': return tab_list = [] m = re.search(r'<div class="mosaic-nav">\s*<ul class="nav">(.+?)</ul>', page, flags=re.S) if m: tab_list_info = m.group(1) p = re.compile( r'<li class="no-select nav-item">\s*<a.+?href="(.+?)">\s*<h2>(.+?)</h2>\s*</a>', flags=re.S) for tab_info in p.finditer(tab_list_info): tab_url, tab_name = tab_info.group(1), re.sub( r'<.+?>', '', tab_info.group(2).replace(' ', ' ').strip()) if tab_url == '#': tab_list.append((tab_name, f_url)) else: tab_list.append((tab_name, self.home_url + tab_url)) refers = url for tab in tab_list: tab_name, tab_url = tab print '# tab:', tab_name, tab_url tab_page = self.crawler.getData(tab_url, refers) refers = tab_url m = re.search(r'"products":\s*(\[.+?\])\s*},\s*"deeplink"', page, flags=re.S) if m: prods = m.group(1) js_items = json.loads(prods) for js_item in js_items: i_title = js_item["title"].replace(' ', ' ') items = js_item["items"] for item in items: if not item.has_key("title"): continue i_name = item["title"].replace(' ', ' ') i_url = self.home_url + item["href"] print tab_name, i_title, 
i_name, i_url self.link_list.append( (tab_name, i_title, tab_url, i_name, i_url)) def bagItems(self): #for link in self.links: self.itemPage(link) max_th = 10 if len(self.link_list) > max_th: m_itemsObj = BagItemM(self.home_url, self.brand_type, max_th) else: m_itemsObj = BagItemM(self.home_url, self.brand_type, len(self.link_list)) m_itemsObj.createthread() m_itemsObj.putItems(self.link_list) m_itemsObj.run() self.items.extend(m_itemsObj.items) def itemPage(self, val): serie_title, i_title, refers, i_name, i_url = val page = self.crawler.getData(i_url, refers) if not page or page == '': return i_name, i_img, ref_price, i_size, i_price, i_unit, i_number = '', '', '', '', '', '', '' m = re.search( r'<div class="product-details details".*?>.+?<p class="description info.*?">(.+?)</p>', page, flags=re.S) if m: i_name = re.sub(r'<.+?>', '', m.group(1)).strip() else: m = re.search(r'<title>(.+?)</title>', page, flags=re.S) if m: i_name = m.group(1).split('-')[0].strip() m = re.search(r'<div class="productimage.*?"><img src="(.+?)".*?/>', page, flags=re.S) if m: i_img = self.home_url + m.group(1) p = re.compile(r'<p class="size info">(.+?)</p>', flags=re.S) for size in p.finditer(page): if self.item_size != '': i_size += '-' + size.group(1) else: i_size = size.group(1) #m = re.search(r'<div class="ref info">\s*<p>(.+?)</p>', page, flags=re.S) #if m: p = re.compile(r'<div class="ref info">\s*<p>(.+?)</p>', flags=re.S) for number in p.finditer(page): item_number = number.group(1) if self.item_number != '': self.item_number += '-' + item_number else: self.item_number = item_number refs = item_number.split(' ')[:-1] ref_price = ''.join(refs) p_url = self.price_url % ref_price data = self.crawler.getData(p_url, i_url) if not data or data == '': return # 抽取json报文 r = re.search(r'localJsonpPricingCallback\(\[(.+?)\]\)', data, flags=re.S) if r: price, unit = '', '' try: js_data = json.loads(r.group(1)) price, unit = js_data["price"]["amount"], js_data["price"][ "currency-symbol"] 
except Exception as e: m = re.search(r'"amount":"(.+?)"', data, flags=re.S) if m: price = m.group(1) m = re.search(r'"currency-symbol":"(.+?)"', data, flags=re.S) if m: unit = m.group(1) if self.item_price != '': if price: i_price += '-' + price else: if price: i_price = price if unit: i_unit = unit i = BagItem(self.brand_type) i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) #print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items))