class JHSBrand():
    '''A class of JHS category channel'''

    def __init__(self, m_type):
        # crawl settings
        self.crawler = RetryCrawler()
        # DB
        self.mysqlAccess = MysqlAccess()  # mysql access
        # cat queue
        self.cat_queue = JHSQ('cat', 'main')
        # act queue
        self.act_queue = JHSQ('act', 'main')
        self.work = JHSWorker()
        # default categories
        #self.category_list = [("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav","女装","1000")]
        self.category_list = [
            ("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav","女装","1000"),
            ("http://ju.taobao.com/jusp/nanzhuangpindao/tp.htm#J_FixedNav","男装","7000"),
            ("http://ju.taobao.com/jusp/xiebaopindao/tp.htm#J_FixedNav","鞋包","3000"),
            ("http://ju.taobao.com/jusp/neiyipindao/tp.htm#J_FixedNav","内衣","4000"),
            ("http://ju.taobao.com/jusp/zhubaoshipin/tp.htm#J_FixedNav","饰品","42000"),
            ("http://ju.taobao.com/jusp/yundongpindao/tp.htm#J_FixedNav","运动","38000"),
            ("http://ju.taobao.com/jusp/meizhuangpindao/tp.htm#J_FixedNav","美妆","2000"),
            ("http://ju.taobao.com/jusp/tongzhuangpindao/tp.htm#J_FixedNav","童装","23000"),
            ("http://ju.taobao.com/jusp/shipinpindao/tp.htm#J_FixedNav","零食","5000"),
            ("http://ju.taobao.com/jusp/muyingpindao/tp.htm#J_FixedNav","母婴","6000"),
            ("http://ju.taobao.com/jusp/baihuopindao/tp.htm#J_FixedNav","百货","37000"),
            ("http://ju.taobao.com/jusp/chepinpindao/tp.htm#J_FixedNav","汽车","36000"),
            ("http://ju.taobao.com/jusp/jiadianpindao/tp.htm#J_FixedNav","家电","34000"),
            ("http://ju.taobao.com/jusp/shumapindao/tp.htm#J_FixedNav","数码","43000"),
            ("http://ju.taobao.com/jusp/jiajunewpindao/tp.htm#J_FixedNav","家装","225000"),
            ("http://ju.taobao.com/jusp/jiajupindao/tp.htm#J_FixedNav","家纺","35000")
        ]
        # page
        self.site_page = None
        # crawl start time
        self.begin_time = Common.now()
        # distributed-host flag
        self.m_type = m_type

    def antPage(self):
        try:
            # only the master host seeds the redis queues
            if self.m_type == 'm':
                category_list = self.mysqlAccess.selectJhsGroupItemCategory()
                if not category_list or len(category_list) == 0:
                    category_list = self.category_list
                if category_list and len(category_list) > 0:
                    cate_val_list = []
                    for cate in category_list:
                        cate_val_list.append((cate[0], cate[2], cate[1], Config.ju_home_today, Config.JHS_GroupItem))
                    # clear the category redis queue
                    self.cat_queue.clearQ()
                    # fill the category redis queue
                    self.cat_queue.putlistQ(cate_val_list)
                    # clear the act redis queue
                    self.act_queue.clearQ()
                    print '# category queue end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                else:
                    print '# not find category...'

            # activity JSON for the categories
            obj = 'cat'
            crawl_type = 'main'
            # ids of activities that have not started yet
            val = (Common.time_s(Common.now()),)
            acts = self.mysqlAccess.selectJhsActNotStart(val)
            brandact_id_list = []
            if acts:
                for act in acts:
                    brandact_id_list.append(str(act[1]))
            _val = (self.begin_time, brandact_id_list)
            self.work.process(obj, crawl_type, _val)
            # activity data
            act_val_list = self.work.items
            print '# act nums:', len(act_val_list)
            # push to the act redis queue
            self.act_queue.putlistQ(act_val_list)
            print '# act queue end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

            if self.m_type == 'm':
                val = (Common.add_hours(self.begin_time, -2), Common.add_hours(self.begin_time, -2), Common.add_hours(self.begin_time, -1))
                # remove activities that ended in the last hour from Redis
                _acts = self.mysqlAccess.selectJhsActEndLastOneHour(val)
                print '# end acts num:', len(_acts)
                self.work.delAct(_acts)
                # remove items that ended in the last hour from Redis
                _items = self.mysqlAccess.selectJhsItemEndLastOneHour(val)
                print '# end items num:', len(_items)
                self.work.delItem(_items)
        except Exception as e:
            print '# antpage error :', e
            Common.traceback_log()

    # group-item channel categories
    def categoryListTEMP(self):
        page = self.crawler.getData(Config.ju_home_today, Config.ju_home)
        if not page or page == '':
            print '# not get today page'
            return []
        category_list = []
        m = re.search(r'<div class="J_CatLeft layout-left">.+?<table>(.+?)</table>.+?</div>', page, flags=re.S)
        if m:
            category_list = self.categoryListType1(m.group(1))
        else:
            m = re.search(r'<div class="catbg">\s+<div class="ju-wrapper">\s+<div class="cat-menu-h".+?>.+?<ul class="clearfix">(.+?)</ul>', page, flags=re.S)
            if m:
                category_list = self.categoryListType2(m.group(1))
        return category_list

    def categoryListType1(self, page):
        category_list = []
        m = re.search(r'<tr class="h2">.+?</tr>(.+?)<tr class="h2">', page, flags=re.S)
        if m:
            cate_list = m.group(1)
            p = re.compile(r'<a.+?href="(.+?)".+?>(.+?)</a>', flags=re.S)
            for cate in p.finditer(cate_list):
                category_list.append((cate.group(1), cate.group(2).strip()))
        return category_list

    def categoryListType2(self, page):
        category_list = []
        p = re.compile(r'<a.+?href="(.+?)".+?>(.+?)</a>', flags=re.S)
        for cate in p.finditer(page):
            category_list.append((cate.group(1), cate.group(2).strip()))
        return category_list

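# Usage sketch (an assumption, not part of the original source): how a
# scheduler might drive JHSBrand. Only the master host ('m') clears and
# re-seeds the redis category/act queues; every host then pushes the
# not-yet-started activities it parses. The helper name run_jhs_brand and
# the 's' worker flag are hypothetical.
def run_jhs_brand(m_type='s'):
    ant = JHSBrand(m_type)  # 'm' on the master host, anything else on workers
    ant.antPage()
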
class JHSActPosition():
    '''A class of brand position'''

    def __init__(self):
        # mysql
        self.mysqlAccess = MysqlAccess()
        # crawl settings
        #self.crawler = TBCrawler()
        self.crawler = RetryCrawler()
        # page template parsing
        self.brand_temp = JHSBrandTEMP()
        # JSON data fetcher
        self.jsonpage = Jsonpage()
        # brand deals on the home page
        self.home_brands = {}
        # top promotion slots on the brand deals page
        self.top_brands = {}
        # page content
        self.ju_home_page = ''   # Juhuasuan home page
        self.ju_brand_page = ''  # Juhuasuan brand deals page
        # crawl start time
        self.begin_time = Common.now()

    def antPage(self):
        try:
            # fetch the brand deals on the home page
            page = self.crawler.getData(Config.ju_home, Config.tmall_home)
            hb = JHSHomeBrand()
            hb.antPage(page)
            if hb.home_brands == {} or not hb.home_brands:
                page = self.crawler.getData(Config.ju_home_today, Config.ju_home)
                hb.antPage(page)
            self.home_brands = hb.home_brands
            page_datepath = 'act/position/' + time.strftime("%Y/%m/%d/%H/", time.localtime(self.begin_time))
            Config.writefile(page_datepath, 'home.htm', page)
            #print '# home activities:', self.home_brands

            # fetch the brand deals list page
            page = self.crawler.getData(Config.ju_brand_home, Config.ju_home)
            self.activityList(page)
        except Exception as e:
            print '# exception err in antPage info:', e
            Common.traceback_log()

    # brand deals list
    def activityList(self, page):
        if not page or page == '':
            raise Common.InvalidPageException("# brand activityList: not get JHS brand home.")
        self.ju_brand_page = page
        # save the html file
        page_datepath = 'act/marketing/' + time.strftime("%Y/%m/%d/%H/", time.localtime(self.begin_time))
        Config.writefile(page_datepath, 'brand.htm', self.ju_brand_page)
        self.top_brands = self.brand_temp.activityTopbrandTemp(page)
        # list of data API URLs
        b_url_valList = self.brand_temp.activityListTemp(page)
        if b_url_valList != []:
            # data fetched from the API
            bResult_list = []
            json_valList = []
            for b_url_val in b_url_valList:
                b_url, f_name, f_catid = b_url_val
                json_valList.append((b_url, Config.ju_brand_home, (f_catid, f_name)))
            bResult_list = self.jsonpage.get_json(json_valList)
            act_valList = []
            if bResult_list and bResult_list != []:
                a_val = (Config.JHS_Brand, '', self.begin_time,)
                act_valList = self.jsonpage.parser_brandjson(bResult_list, a_val)
                if act_valList != []:
                    print '# get brand act num:', len(act_valList)
                    self.run_brandAct(act_valList)
                else:
                    print '# err: not get brandjson parser val list.'
        else:
            print '# err: not find activity json data URL list.'

    def run_brandAct(self, act_valList):
        repeatact_num = 0
        # number of activities
        act_num = 0
        # sql values of the activities to save
        act_sql_list = []
        # id dict used to de-duplicate activities
        brandact_id_dict = {}
        print '# brand activities start:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # multi-threaded; cap the number of concurrent threads
        if len(act_valList) > Config.act_max_th:
            m_Obj = JHSActM(5, Config.act_max_th)
        else:
            m_Obj = JHSActM(5, len(act_valList))
        m_Obj.putItems(act_valList)
        m_Obj.createthread()
        m_Obj.run()
        item_list = m_Obj.items
        for b in item_list:
            act_num += 1
            brandact_id, brandact_name, brandact_url, brandact_sign, val = b
            if int(brandact_sign) == 3:
                continue
            # de-duplicate
            if brandact_id_dict.has_key(str(brandact_id)):
                repeatact_num += 1
                print '# repeat brand act. activity id:%s name:%s' % (brandact_id, brandact_name)
            else:
                brandact_id_dict[str(brandact_id)] = brandact_name
                if self.home_brands.has_key(str(brandact_id)):
                    val = val + (self.home_brands[str(brandact_id)]["position"], self.home_brands[str(brandact_id)]["datatype"], self.home_brands[str(brandact_id)]["typename"])
                elif self.home_brands.has_key(brandact_url):
                    val = val + (self.home_brands[brandact_url]["position"], self.home_brands[brandact_url]["datatype"], self.home_brands[brandact_url]["typename"])
                else:
                    val = val + (None, None, None)
                if self.top_brands.has_key(str(brandact_id)):
                    val = val + (self.top_brands[str(brandact_id)]["position"], self.top_brands[str(brandact_id)]["datatype"])
                elif self.top_brands.has_key(brandact_url):
                    val = val + (self.top_brands[brandact_url]["position"], self.top_brands[brandact_url]["datatype"])
                else:
                    val = val + (None, None)
                act_sql_list.append(val)
        print '# brand activities end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # write the brand activity position info to the DB in batches
        actsql_list = []
        for sql in act_sql_list:
            actsql_list.append(sql)
            if len(actsql_list) >= Config.act_max_arg:
                self.mysqlAccess.insertJhsActPosition(actsql_list)
                actsql_list = []
        if len(actsql_list) > 0:
            self.mysqlAccess.insertJhsActPosition(actsql_list)
        print '# Find act num:', act_num
        print '# Repeat brand activity num:', repeatact_num

class Channel():
    '''A class of XC channel'''

    def __init__(self):
        # crawl settings
        self.crawler = XCCrawler()
        self.retrycrawler = RetryCrawler()
        self.crawling_time = Common.now()  # current crawl time
        self.crawling_time_s = Common.time_s(self.crawling_time)
        self.crawling_begintime = ''  # start time of this crawl
        self.crawling_beginDate = ''  # date of this crawl
        self.crawling_beginHour = ''  # hour of this crawl
        # channel info
        self.platform = '携程-pc'  # platform the channel belongs to
        self.channel_id = ''       # channel id
        self.channel_url = ''      # channel url
        self.channel_name = ''     # channel name
        self.channel_type = ''     # channel type
        # geographic info of the channel
        self.province_id = 0       # province/state id
        self.province_name = ''    # province/state name
        # raw page data
        self.channel_page = ''     # html of the channel page
        self.channel_pages = {}    # data requested from within the channel page
        # channel items
        self.channel_items = []
        # channel list
        self.channel_list = []

    # initialize for one channel-page crawl
    def init(self, channel_id, channel_url, channel_type, begin_time):
        self.channel_id = channel_id
        self.channel_url = channel_url
        self.channel_type = channel_type
        self.crawling_begintime = begin_time
        self.crawling_beginDate = time.strftime("%Y-%m-%d", time.localtime(self.crawling_begintime))
        self.crawling_beginHour = time.strftime("%H", time.localtime(self.crawling_begintime))

    def config(self):
        self.channelPage()
        if self.channel_type == 1:
            self.spot()
        #elif self.channel_type == 2:
        else:
            Common.log('# not find this channel type...')

    def spot(self):
        if self.channel_page:
            m = re.search(r'<div class="cate_select">(.+?)</div>', self.channel_page, flags=re.S)
            if m:
                cate_select = m.group(1)
                c_list = []
                p = re.compile(r'<a.+?class="select">(.+?)</a>', flags=re.S)
                for c in p.finditer(cate_select):
                    c_list.append(re.sub(r'<.+?>', '', c.group(1)).strip())
                self.channel_name = '-'.join(c_list)
            i_p = 1
            i_page = 1
            m_page = 1
            page_main = ''
            m = re.search(r'<div id="base_bd">.+?<div class="bg_miancolor">.+?<div class="vacation_bd">(.+?)<div class="vacation_bd bottom_seo">', self.channel_page, flags=re.S)
            if m:
                page_main = m.group(1)
            else:
                page_main = self.channel_page
            Common.log(i_page)
            i_p = self.get_items(page_main, i_p)
            m = re.search(r'<span class="c_page2_numtop">(.+?)</span>', self.channel_page, flags=re.S)
            if m:
                m_page_info = m.group(1)
                m = re.search(r'\d+/(\d+)', m_page_info, flags=re.S)
                if m:
                    m_page = int(m.group(1))
            page_url = self.channel_url[0:-1] + 'P%s/'
            while i_page < m_page:
                i_page += 1
                p_url = page_url % str(i_page)
                Common.log(i_page)
                page = self.retrycrawler.getData(p_url, self.channel_url)
                i_p = self.get_items(page, i_p)

    def get_items(self, page_main, i_p):
        if page_main:
            p = re.compile(r'<div class="searchresult_product04">\s+<div class="search_ticket_caption basefix">\s+<a href="(.+?)".+?>\s+<img src="(.+?)".+?/>.+?<div class="search_ticket_title">\s+<h2>\s+<a.+?>(.+?)</a>.+?</h2>\s+<div class="adress">(.+?)</div>\s+<div class="exercise">(.*?)</div>', flags=re.S)
            for info in p.finditer(page_main):
                if int(self.channel_type) == 1:
                    i_url = Config.xc_piao_home + info.group(1)
                else:
                    i_url = Config.xc_home + info.group(1)
                i_img, i_name, i_area, i_desc = info.group(2), info.group(3).strip(), info.group(4).strip(), info.group(5).strip()
                i_book = 1
                i_id = 0
                if i_url != '':
                    m = re.search(r't(\d+).html', i_url)
                    if m:
                        i_id = m.group(1)
                val = (self.channel_id, self.channel_name, self.channel_url, self.channel_type, (i_book, i_id, i_url, i_img, i_name, i_desc, i_area, i_p, self.crawling_begintime))
                self.channel_items.append(val)
                i_p += 1
        return i_p

    def channelList(self):
        self.channelPage()
        if self.channel_page:
            m = re.search(r'<ul class="search_cate">\s+<li class="cate_content.+?">\s+<span class="b">.+?<span class="area_box">(.+?)</span>', self.channel_page, flags=re.S)
            if m:
                area_infos = m.group(1)
                p = re.compile(r'<a href="(.+?)".+?>(.+?)</a>', flags=re.S)
                for area in p.finditer(area_infos):
                    channel_url, c_name = Config.xc_piao_home + area.group(1), area.group(2)
                    channel_id = 0
                    channel_name = ''
                    if channel_url:
                        m = re.search(r'D(\d+)', channel_url)
                        if m:
                            channel_id = m.group(1)
                    if c_name:
                        m = re.search(r'(.+?)\(', c_name, flags=re.S)
                        if m:
                            channel_name = m.group(1).strip()
                        else:
                            channel_name = c_name.strip()
                    if int(channel_id) != 0 and channel_url:
                        self.channel_list.append((channel_id, channel_name, channel_url, str(self.channel_type), str(self.province_id), self.province_name))

    def channelPage(self):
        if self.channel_url:
            refers = Config.xc_home
            if int(self.channel_type) == 1:
                refers = Config.xc_piao_home
            data = self.crawler.getData(self.channel_url, refers)
            if not data or data == '':
                raise Common.InvalidPageException("# channelPage:not find channel page,channel_id:%s,channel_url:%s" % (str(self.channel_id), self.channel_url))
            self.channel_page = data
            self.channel_pages['channel-home'] = (self.channel_url, data)

    def antPage(self, val):
        channel_id, channel_url, channel_type, begin_time = val
        self.init(channel_id, channel_url, channel_type, begin_time)
        self.config()

    def antChannelList(self, val):
        self.channel_url, self.channel_type, self.province_id, self.province_name = val
        self.channelList()

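# Usage sketch (an assumption, not part of the original source): the XC
# Channel object is driven in two passes. antChannelList() takes
# (channel_url, channel_type, province_id, province_name) and fills
# channel_list; antPage() takes (channel_id, channel_url, channel_type,
# begin_time) and fills channel_items. The list_url argument and the helper
# name crawl_xc_province are hypothetical.
def crawl_xc_province(list_url, province_id, province_name):
    lister = Channel()
    lister.antChannelList((list_url, 1, province_id, province_name))  # type 1: ticket/spot channels
    begin_time = Common.now()
    items = []
    for channel_id, channel_name, channel_url, c_type, p_id, p_name in lister.channel_list:
        c = Channel()
        c.antPage((channel_id, channel_url, 1, begin_time))
        items.extend(c.channel_items)
    return items
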
class JHSGroupItem():
    '''A class of JHS group item channel'''

    def __init__(self, m_type):
        # distributed-host flag
        self.m_type = m_type
        # crawl settings
        self.crawler = RetryCrawler()
        # cat queue
        self.cat_queue = JHSQ('groupitemcat', 'main')
        self.worker = JHSGroupItemWorker()
        # default categories
        #self.category_list = [("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav","女装","1000")]
        self.category_list = [
            ("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav","女装","1000"),
            ("http://ju.taobao.com/jusp/nanzhuangpindao/tp.htm#J_FixedNav","男装","7000"),
            ("http://ju.taobao.com/jusp/xiebaopindao/tp.htm#J_FixedNav","鞋包","3000"),
            ("http://ju.taobao.com/jusp/neiyipindao/tp.htm#J_FixedNav","内衣","4000"),
            ("http://ju.taobao.com/jusp/zhubaoshipin/tp.htm#J_FixedNav","饰品","42000"),
            ("http://ju.taobao.com/jusp/yundongpindao/tp.htm#J_FixedNav","运动","38000"),
            ("http://ju.taobao.com/jusp/meizhuangpindao/tp.htm#J_FixedNav","美妆","2000"),
            ("http://ju.taobao.com/jusp/tongzhuangpindao/tp.htm#J_FixedNav","童装","23000"),
            ("http://ju.taobao.com/jusp/shipinpindao/tp.htm#J_FixedNav","零食","5000"),
            ("http://ju.taobao.com/jusp/muyingpindao/tp.htm#J_FixedNav","母婴","6000"),
            ("http://ju.taobao.com/jusp/baihuopindao/tp.htm#J_FixedNav","百货","37000"),
            ("http://ju.taobao.com/jusp/chepinpindao/tp.htm#J_FixedNav","汽车","36000"),
            ("http://ju.taobao.com/jusp/jiadianpindao/tp.htm#J_FixedNav","家电","34000"),
            ("http://ju.taobao.com/jusp/shumapindao/tp.htm#J_FixedNav","数码","43000"),
            ("http://ju.taobao.com/jusp/jiajunewpindao/tp.htm#J_FixedNav","家装","225000"),
            ("http://ju.taobao.com/jusp/jiajupindao/tp.htm#J_FixedNav","家纺","35000")
        ]
        # page
        self.site_page = None
        # crawl start time
        self.crawling_time = Common.now()  # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

    def antPage(self):
        try:
            # only the master host seeds the redis queue
            if self.m_type == 'm':
                category_list = self.worker.scanCategories()
                if not category_list or len(category_list) == 0:
                    category_list = self.category_list
                if category_list and len(category_list) > 0:
                    # clear the category redis queue
                    self.cat_queue.clearQ()
                    # fill the category redis queue
                    self.cat_queue.putlistQ(category_list)
                    print '# groupitem category queue end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                else:
                    print '# groupitem not find category...'

            obj = 'groupitemcat'
            crawl_type = 'main'
            self.worker.process(obj, crawl_type, Config.ju_home_today)
            items = self.worker.items
            print '# all parser items num:', len(items)
            # find newly listed items
            self.get_newitems(items)
            if self.m_type == 'm':
                # remove ended items from Redis
                #self.worker.scanEndItems()
                self.worker.scanEndItemsLasthour()
        except Exception as e:
            print '# antpage error :', e
            Common.traceback_log()

    # find newly listed items and crawl their details
    def get_newitems(self, items):
        result_items = []
        for item in items:
            item_status, item_val, o_val = item
            item_juid = item_val[1]
            result_items.append({"item_juId": str(item_juid), "val": o_val, "r_val": item_val})
        new_item_list = self.worker.selectNewItems(result_items)
        print '# new items num:', len(new_item_list)
        # crawl the newly listed items
        itemcrawl_type = 'new'
        # extra info
        a_val = (self.begin_time,)
        items = self.crawlNewItems(new_item_list, itemcrawl_type, a_val)
        # save the newly listed item info to Redis
        new_items = []
        for item in items:
            iteminfoSql = item
            item_juid = iteminfoSql[1]
            new_items.append({"item_juId": item_juid, "r_val": iteminfoSql})
        self.worker.putItemDB(new_items)

    # crawl the details of newly listed items
    def crawlNewItems(self, _new_items, itemcrawl_type, a_val):
        print '# crawl Group Items start:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # multi-threaded; cap the number of concurrent threads
        max_th = Config.item_max_th
        if len(_new_items) > max_th:
            m_itemsObj = JHSGroupItemCrawlerM(itemcrawl_type, max_th, a_val)
        else:
            m_itemsObj = JHSGroupItemCrawlerM(itemcrawl_type, len(_new_items), a_val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(_new_items)
        m_itemsObj.run()
        _items = m_itemsObj.items
        print '# insert new item num:', len(_items)
        print '# crawl Group Items end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return _items

    # group-item channel categories
    def categoryListTEMP(self):
        page = self.crawler.getData(Config.ju_home_today, Config.ju_home)
        if not page or page == '':
            print '# not get today page'
            return []
        category_list = []
        m = re.search(r'<div class="J_CatLeft layout-left">.+?<table>(.+?)</table>.+?</div>', page, flags=re.S)
        if m:
            category_list = self.categoryListType1(m.group(1))
        else:
            m = re.search(r'<div class="catbg">\s+<div class="ju-wrapper">\s+<div class="cat-menu-h".+?>.+?<ul class="clearfix">(.+?)</ul>', page, flags=re.S)
            if m:
                category_list = self.categoryListType2(m.group(1))
        return category_list

    def categoryListType1(self, page):
        category_list = []
        m = re.search(r'<tr class="h2">.+?</tr>(.+?)<tr class="h2">', page, flags=re.S)
        if m:
            cate_list = m.group(1)
            p = re.compile(r'<a.+?href="(.+?)".+?>(.+?)</a>', flags=re.S)
            for cate in p.finditer(cate_list):
                category_list.append((cate.group(1), cate.group(2).strip()))
        return category_list

    def categoryListType2(self, page):
        category_list = []
        p = re.compile(r'<a.+?href="(.+?)".+?>(.+?)</a>', flags=re.S)
        for cate in p.finditer(page):
            category_list.append((cate.group(1), cate.group(2).strip()))
        return category_list

    def getAjaxurlList(self, page_val, refers):
        url_list = []
        page, c_name, c_id = page_val
        p = re.compile(r'<.+?data-ajaxurl="(.+?)".+?>(.+?)</div>', flags=re.S)
        i = 0
        for a_info in p.finditer(page):
            c_subNav = c_name
            a_url = a_info.group(1).replace('amp;', '')
            info = a_info.group(2)
            m = re.search(r'<span class="l-f-tbox">(.+?)</span>', info, flags=re.S)
            if m:
                c_subNav = m.group(1).strip()
            a_val = (c_id, c_name, refers, c_subNav)
            url_list.append((a_url, refers, a_val))
            i += 1
        return url_list

class Channel():
    '''A class of TC channel'''

    def __init__(self):
        # crawl settings
        self.crawler = TCCrawler()
        self.retrycrawler = RetryCrawler()
        self.crawling_time = Common.now()  # current crawl time
        self.crawling_time_s = Common.time_s(self.crawling_time)
        self.crawling_begintime = ''  # start time of this crawl
        self.crawling_beginDate = ''  # date of this crawl
        self.crawling_beginHour = ''  # hour of this crawl
        # channel info
        self.platform = '同程-pc'  # platform the channel belongs to
        self.channel_id = ''       # channel id
        self.channel_url = ''      # channel url
        self.channel_name = ''     # channel name
        self.channel_type = ''     # channel type
        # raw page data
        self.channel_page = ''     # html of the channel page
        self.channel_pages = {}    # data requested from within the channel page
        # channel items
        self.channel_items = []
        # channel list
        self.channel_list = []

    # initialize for one channel-page crawl
    def init(self, channel_id, channel_url, channel_type, begin_time):
        self.channel_id = channel_id
        self.channel_url = channel_url
        self.channel_type = channel_type
        self.crawling_begintime = begin_time
        self.crawling_beginDate = time.strftime("%Y-%m-%d", time.localtime(self.crawling_begintime))
        self.crawling_beginHour = time.strftime("%H", time.localtime(self.crawling_begintime))

    def config(self):
        self.channelPage()
        if self.channel_type == 1:
            self.spot()
        #elif self.channel_type == 2:
        else:
            Common.log('# not find this channel type...')

    def spot(self):
        if self.channel_page:
            m = re.search(r'<title>(.+?)</title>', self.channel_page, flags=re.S)
            if m:
                self.channel_name = m.group(1)
            keyword, pid, cid, cyid = '', 0, 0, 0
            m = re.search(r'<span id="hdKeyWord">(.*?)</span>', self.channel_page, flags=re.S)
            if m:
                keyword = m.group(1)
            m = re.search(r'<span id="hdPid">(.*?)</span>', self.channel_page, flags=re.S)
            if m:
                pid = int(m.group(1))
            m = re.search(r'<span id="hdCid">(.*?)</span>', self.channel_page, flags=re.S)
            if m:
                cid = int(m.group(1))
            m = re.search(r'<span id="hdCyid">(.*?)</span>', self.channel_page, flags=re.S)
            if m:
                cyid = int(m.group(1))
            i_p = 1
            i_page = 1
            m_page = 1
            page_main = ''
            m = re.search(r'<div class="scenery_main" id="sceneryListInfo">(.+?)<div id="pageNum_box" class="s_pager none">', self.channel_page, flags=re.S)
            if m:
                page_main = m.group(1)
            else:
                page_main = self.channel_page
            Common.log(i_page)
            i_p = self.get_items(page_main, i_p)
            m = re.search(r'<input type="hidden" id="txt_AllpageNumber" value="(.+?)">', page_main, flags=re.S)
            if m:
                m_page = int(m.group(1))
            page_url = 'http://www.ly.com/scenery/SearchList.aspx?&action=getlist&page=%d&kw=&pid=%d&cid=%d&cyid=%d&theme=0&grade=0&money=0&sort=0&paytype=0&ismem=0&istuan=0&isnow=0&spType=&isyiyuan=0&lbtypes=&IsNJL=0&classify=0'
            while i_page < m_page:
                i_page += 1
                p_url = page_url % (i_page, pid, cid, cyid)
                Common.log(i_page)
                page = self.retrycrawler.getData(p_url, self.channel_url)
                i_p = self.get_items(page, i_p)

    def get_items(self, page_main, i_p):
        if page_main:
            p = re.compile(r'<div class="scenery_list(.+?)">\s*<div class="s_info"><div class="img_con"><a class="a_img".+?href="(.+?)"><img.+?src="(.+?)".*?></a></div><div class="info_con"><dl class="info_top"><dt><a class="fir_name".+?>(.+?)</a>.+?<span class="s_level">(.*?)</span>.+?<dd class="scenery_area"><span>(.+?)</span>.+?</dl></div></div>', flags=re.S)
            for info in p.finditer(page_main):
                all_info, i_info, i_url, i_img, i_name, i_level, i_area = info.group(), info.group(1), (Config.tc_home + info.group(2)), info.group(3), info.group(4), re.sub(r'<.+?>', '', info.group(5)), ' '.join(info.group(6).split())
                i_book = 1
                i_desc = ''
                m = re.search(r'<dd class="scenery_desc"><p>(.+?)</p>', all_info, flags=re.S)
                if m:
                    i_desc = m.group(1)
                if i_info.find('nobook') != -1:
                    i_book = 0
                if i_desc == '':
                    m = re.search(r'<dd class="scenery_state">(.+?)<a', all_info, flags=re.S)
                    if m:
                        i_desc = m.group(1)
                i_id = 0
                if i_url != '':
                    m = re.search(r'BookSceneryTicket_(\d+).html', i_url)
                    if m:
                        i_id = m.group(1)
                val = (self.channel_id, self.channel_name, self.channel_url, self.channel_type, (i_book, i_id, i_url, i_img, i_name, i_desc, i_level, i_area, i_p, self.crawling_begintime))
                self.channel_items.append(val)
                #if i_p == 1: Common.log(val)
                i_p += 1
        return i_p

    def channelList(self):
        self.channelPage()
        if self.channel_page:
            city_list = self.moreCity(self.channel_page, 'city')
            for city in city_list:
                city_url, city_name, province_id, city_id, dcity_id = city
                if city_url:
                    province_name, city_name = '', ''
                    city_page = self.crawler.getData(city_url, self.channel_url)
                    if city_page:
                        m = re.search(r'<div class="search_screen_dl"><dl action="province">.+?<div class="right"><a href=".+?" class="current" tvalue="(\d+)" title="(.+?)">', city_page, flags=re.S)
                        if m:
                            province_id, province_name = m.group(1), m.group(2)
                        m = re.search(r'<div class="search_screen_dl">.+?<dl action="city">.+?<div class="right"><a href=".+?" class="current" tvalue="(\d+)" title="(.+?)">', city_page, flags=re.S)
                        if m:
                            city_id, city_name = m.group(1), m.group(2)
                        channel_list = self.moreCity(city_page, 'district')
                        if channel_list and len(channel_list) > 0:
                            for channel in channel_list:
                                channel_url, channel_name, p_id, c_id, cy_id = channel
                                self.channel_list.append((str(cy_id), channel_name, channel_url, str(self.channel_type), city_id, city_name, province_id, province_name))

    def moreCity(self, page, action_key):
        city_list = []
        if page:
            p_id, c_id, cy_id = 0, 0, 0
            m = re.search(r'<span id="hdPid">(.*?)</span>', page, flags=re.S)
            if m:
                p_id = int(m.group(1))
            m = re.search(r'<span id="hdCid">(.*?)</span>', page, flags=re.S)
            if m:
                c_id = int(m.group(1))
            m = re.search(r'<span id="hdCyid">(.*?)</span>', page, flags=re.S)
            if m:
                cy_id = int(m.group(1))
            p_url = 'http://www.ly.com/scenery/scenerysearchlist_%d_%d__0_0_0_%d_0_0_0.html'
            m = re.search(r'<div class="search_screen_box" id="searchScreenBox">.+?<dl action="%s".+?>.+?<dd>.+?<div class="right">(.+?)</div></dd>' % action_key, page, flags=re.S)
            if m:
                city_infos = m.group(1)
                p = re.compile(r'<a.+?tvalue="(\d+)" title="(.+?)">', flags=re.S)
                for city in p.finditer(city_infos):
                    city_id, city_name = int(city.group(1)), city.group(2)
                    if action_key == 'city':
                        c_id = city_id
                    elif action_key == 'district':
                        cy_id = city_id
                    city_url, city_name = p_url % (p_id, c_id, cy_id), city.group(2)
                    city_list.append((city_url, city_name, p_id, c_id, cy_id))
        return city_list

    def channelPage(self):
        if self.channel_url:
            data = self.crawler.getData(self.channel_url, Config.tc_home)
            if not data or data == '':
                raise Common.InvalidPageException("# channelPage:not find channel page,channel_id:%s,channel_url:%s" % (str(self.channel_id), self.channel_url))
            self.channel_page = data
            self.channel_pages['channel-home'] = (self.channel_url, data)

    def antPage(self, val):
        channel_id, channel_url, channel_type, begin_time = val
        self.init(channel_id, channel_url, channel_type, begin_time)
        self.config()

    def antChannelList(self, val):
        self.channel_url, self.channel_type = val
        self.channelList()
