class JHSActPosition(): '''A class of brand position''' def __init__(self): # mysql self.mysqlAccess = MysqlAccess() # 抓取设置 #self.crawler = TBCrawler() self.crawler = RetryCrawler() # 页面模板解析 self.brand_temp = JHSBrandTEMP() # 获取Json数据 self.jsonpage = Jsonpage() # 首页的品牌团列表 self.home_brands = {} # 品牌团页面的最上面推广位 self.top_brands = {} # 页面信息 self.ju_home_page = '' # 聚划算首页 self.ju_brand_page = '' # 聚划算品牌团页面 # 抓取开始时间 self.begin_time = Common.now() def antPage(self): try: # 获取首页的品牌团 page = self.crawler.getData(Config.ju_home, Config.tmall_home) hb = JHSHomeBrand() hb.antPage(page) if hb.home_brands == {} or not hb.home_brands: page = self.crawler.getData(Config.ju_home_today, Config.ju_home) hb.antPage(page) self.home_brands = hb.home_brands page_datepath = 'act/position/' + time.strftime("%Y/%m/%d/%H/", time.localtime(self.begin_time)) Config.writefile(page_datepath,'home.htm',page) #print '# home activities:', self.home_brands # 获取品牌团列表页数据 page = self.crawler.getData(Config.ju_brand_home, Config.ju_home) self.activityList(page) except Exception as e: print '# exception err in antPage info:',e Common.traceback_log() # 品牌团列表 def activityList(self, page): if not page or page == '': raise Common.InvalidPageException("# brand activityList: not get JHS brand home.") self.ju_brand_page = page # 保存html文件 page_datepath = 'act/marketing/' + time.strftime("%Y/%m/%d/%H/", time.localtime(self.begin_time)) Config.writefile(page_datepath,'brand.htm',self.ju_brand_page) # 数据接口URL list self.top_brands = self.brand_temp.activityTopbrandTemp(page) b_url_valList = self.brand_temp.activityListTemp(page) if b_url_valList != []: # 从接口中获取的数据列表 bResult_list = [] json_valList = [] for b_url_val in b_url_valList: b_url, f_name, f_catid = b_url_val json_valList.append((b_url,Config.ju_brand_home,(f_catid,f_name))) bResult_list = self.jsonpage.get_json(json_valList) act_valList = [] if bResult_list and bResult_list != []: a_val = (Config.JHS_Brand,'',self.begin_time,) act_valList = self.jsonpage.parser_brandjson(bResult_list,a_val) if act_valList != []: print '# get brand act num:',len(act_valList) self.run_brandAct(act_valList) else: print '# err: not get brandjson parser val list.' else: print '# err: not find activity json data URL list.' def run_brandAct(self, act_valList): repeatact_num = 0 # 活动数量 act_num = 0 # 需要保存活动sql列表 act_sql_list = [] # 用于活动去重id dict brandact_id_dict = {} print '# brand activities start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 多线程 控制并发的线程数 if len(act_valList) > Config.act_max_th: m_Obj = JHSActM(5, Config.act_max_th) else: m_Obj = JHSActM(5, len(act_valList)) m_Obj.putItems(act_valList) m_Obj.createthread() m_Obj.run() item_list = m_Obj.items for b in item_list: act_num += 1 brandact_id,brandact_name,brandact_url,brandact_sign,val = b if int(brandact_sign) == 3: continue # 去重 if brandact_id_dict.has_key(str(brandact_id)): repeatact_num += 1 print '# repeat brand act. activity id:%s name:%s'%(brandact_id, brandact_name) else: brandact_id_dict[str(brandact_id)] = brandact_name if self.home_brands.has_key(str(brandact_id)): val = val + (self.home_brands[str(brandact_id)]["position"],self.home_brands[str(brandact_id)]["datatype"],self.home_brands[str(brandact_id)]["typename"]) elif self.home_brands.has_key(brandact_url): val = val + (self.home_brands[brandact_url]["position"],self.home_brands[brandact_url]["datatype"],self.home_brands[brandact_url]["typename"]) else: val = val + (None,None,None) if self.top_brands.has_key(str(brandact_id)): val = val + (self.top_brands[str(brandact_id)]["position"],self.top_brands[str(brandact_id)]["datatype"]) elif self.top_brands.has_key(brandact_url): val = val + (self.top_brands[brandact_url]["position"],self.top_brands[brandact_url]["datatype"]) else: val = val + (None,None) act_sql_list.append(val) print '# brand activities end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 品牌团活动位置信息入库 # 保存 actsql_list = [] for sql in act_sql_list: actsql_list.append(sql) if len(actsql_list) >= Config.act_max_arg: self.mysqlAccess.insertJhsActPosition(actsql_list) actsql_list = [] if len(actsql_list) > 0: self.mysqlAccess.insertJhsActPosition(actsql_list) print '# Find act num:', act_num print '# Repeat brand activity num:', repeatact_num