class BagItemM(MyThread): '''A class of jhs item thread manager''' def __init__(self, home_url, brand_type, thread_num=10): # parent construct MyThread.__init__(self, thread_num) # db self.mysqlAccess = MysqlAccess() # mysql access # thread lock self.mutex = threading.Lock() self.home_url = home_url self.brand_type = brand_type # activity items self.items = [] # give up item, retry too many times self.giveup_items = [] def push_back(self, L, v): if self.mutex.acquire(1): L.append(v) self.mutex.release() def putItem(self, _item): self.put_q((0, _item)) def putItems(self, _items): for _item in _items: self.put_q((0, _item)) # To crawl retry def crawlRetry(self, _data): if not _data: return _retry, _val = _data _retry += 1 if _retry < Config.crawl_retry: _data = (_retry, _val) self.put_q(_data) else: self.push_back(self.giveup_items, _val) print "# retry too many times, no get item:", _val # To crawl item def crawl(self): while True: _data = None try: try: # 取队列消息 _data = self.get_q() except Empty as e: # 队列为空,退出 #print '# queue is empty', e break _val = _data[1] item = BagItem(self.home_url, self.brand_type) item.antPage(_val) self.push_back(self.items, item.outItem()) sql = item.outTuple() self.mysqlAccess.insert_item(sql) # 延时 time.sleep(0.1) # 通知queue, task结束 self.queue.task_done() except Exception as e: print 'Unknown exception crawl item :', e Common.traceback_log() self.crawlRetry(_data) # 通知queue, task结束 self.queue.task_done() time.sleep(5)
class taobaoSearch(): '''A class of taobao search page''' def __init__(self): # 抓取设置 #self.crawler = MyCrawler() self.crawler = RetryCrawler() # db self.mysqlAccess = MysqlAccess() # mysql access # 品牌官网链接 self.home_url = 'http://www.taobao.com' self.refers = None # 抓取商品列表 self.link_list = [] self.items = [] self.brand_names = ["艾沃","保宁","红太阳","常润","厨邦","大华","大同","大王","德馨斋","德阳","东古","凤球唛","福临门","高真","古龙","冠生园","广味源","广祥泰","龟甲万","国味威","海鸥","海天","好伴","禾然","和田宽","恒顺","湖西岛","湖羊","黄花园","南食召","吉成","济美","加加","金冠园","金兰","金狮","有味家","金苏","荆楚","景山","居易","聚百鲜","科沁万佳","孔膳坊","快鹿","阆中","老才臣","食味的初相","老蔡","老恒和","老潘头","李锦记","利民","六月鲜","春峰","龙菲","秋生饭堂","龙牌","隆昌","楼茂记","鲁花自然鲜","云上","禄荣","麻旺","美富达","美极","美味源","蒙古真","渔山隐","米吉真味","酿一村","盘溪","彭万春","浦源","奇峰","千禾","千鹤屋","粮赞","钱万隆","清净园","清香园","仁昌记","三不加","悦意","三和四美","博爱酵园","山古坊","膳府","膳之堂","盛田","四海","寺冈","苏美","太太乐","泰康","唐人基","唐世家","淘大","腾安","同珍","妥甸","拓东","丸天","万通","万字","味事达","五天","犀浦","仙家","先市","鲜渡","咸亨","香满园","小东字","笑厨","新宇","星湖","徐同泰","薛泰丰","扬名","尧记","肴易食","一统原创","一休屋","伊例家","宜赤必","优和","鱼味鲜","禹谟","玉堂","御酿坊","缘木记","粤香园","灶基","詹王","张家三嫂","长寿结","珍极","正信","正阳河","至味","致美斋","中邦","中冷泉","中调","珠江桥","梓山","自然集","佐参王","佐香园","中坝","天府","南吉","清湖","味华","佐餐王","一品江南","金顶","玉河","巧媳妇","齐鲁","梁山好汉","王家园子","食圣","山口","川鹰","德通","新汶","四海","德馨斋","玉兔","灯塔","仙鹤","宏林","贵族王中王","万和","口珍","同福永","威极","嘉美乐","天浩圆","铁鸟","恒裕","周太","海鸥","太阳岛","百花","小神厨","龙菲","太和","天一","美乐","三汇","通海","黑珍珠","百乐","吉鹤村","岭桥","瓦缸","味莼园","百花串","锦酿","福香居","铁石","石桥","清华","味邦","光华","罕王","营宝","非常","大有丰","沙陀","味味晓","云晓","巧妈妈","振龙","乾缘","稻香园","一品斋","孔雀","武大郎","绿芳","天赐","益彰","建洛","天口","一品江南","机轮","溢美堂","山乡","榕江","嘉乐美","万路通","肖大妈","争荣","仙源","敬义泰","昆湖","鼎兴","临江寺","迈进","玉和","通德","民天","胡玉美","楼茂记","鼎丰","古灯","槐茂","榕城","BB","汉记","松城","森江","美狮","龙华","启航","隆邦","新汶","四海","龙之味","北康","金玉兰","小二黑","吉成"] def getPage(self, url): position = 1 i = 1 i_url = url refers = self.home_url max_page = 10 size_page = 48 while i <= max_page: page = self.crawler.getData(i_url, refers) refers = i_url i_url = url + '&bcoffset=1&s=%s' % str(i*size_page) i += 1 if not page or page == '': print 'not find data url:',i_url time.sleep(4) continue m = re.search(r'<script>\s+g_page_config = ({.+?});.+?</script>', page, flags=re.S) if m: page_config = m.group(1) page_config_s = re.sub(r'\n+','',page_config) data = json.loads(page_config_s) if data.has_key("mods"): if data["mods"].has_key("itemlist"): itemlist = data["mods"]["itemlist"] if itemlist.has_key("data"): itemlist_data = itemlist["data"] if itemlist_data.has_key("auctions"): for item in itemlist_data["auctions"]: item_id = position m = re.search(r'id=(\d+)', item["detail_url"], flags=re.S) if m: item_id = m.group(1) item_sales = item["view_sales"] m = re.search(r'(\d+)', item["view_sales"], flags=re.S) if m: item_sales = m.group(1) print Common.time_s(Common.now()), position, item_id, item["raw_title"], item["view_price"], item_sales, item["user_id"], item["nick"], "http:" + item["detail_url"], "http:" + item["shopLink"] self.mysqlAccess.insert_item((Common.time_s(Common.now()), str(item_id), str(position), str(item["raw_title"]), str(item["view_price"]), str(item_sales), "http:" + item["detail_url"], item["user_id"], str(item["nick"]), "http:" + item["shopLink"])) position += 1 time.sleep(4) def getItems(self): #for link in self.link_list: self.itemPage(link) max_th = 10 #if len(self.link_list) > max_th: # m_itemsObj = BagItemM(self.home_url,self.brand_type, max_th) #else: # m_itemsObj = BagItemM(self.home_url,self.brand_type, len(self.link_list)) #m_itemsObj.createthread() #m_itemsObj.putItems(self.link_list) #m_itemsObj.run() #self.items.extend(m_itemsObj.items) def itemPage(self, val): print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items))