def __init__(self):
    """Create the crawlers, stamp the creation time and reset channel state.

    Everything describing the channel itself starts out empty and is
    expected to be filled in after construction.
    """
    # Crawl settings: a plain crawler and a retrying one.
    self.crawler = TCCrawler()
    self.retrycrawler = RetryCrawler()

    # Creation time, raw and as rendered by Common.time_s.
    now = Common.now()
    self.crawling_time = now
    self.crawling_time_s = Common.time_s(now)

    # Start time / date / hour of this crawl run.
    self.crawling_begintime = self.crawling_beginDate = self.crawling_beginHour = ''

    # Channel information: platform label, then id / url / name / type.
    self.platform = '同程-pc'
    self.channel_id = self.channel_url = self.channel_name = self.channel_type = ''

    # Raw data: channel page HTML and the pages requested for it.
    self.channel_page = ''
    self.channel_pages = {}

    # Collected items and collected channel list (two distinct lists).
    self.channel_items = []
    self.channel_list = []
def __init__(self):
    """Prepare the crawler, the dial client and the retry parameters."""
    # Fixed settings: router tag and base wait time.
    self._tag = 'ikuai'
    self.w_time = 1

    # Collaborators, created in the same order as before:
    # crawler first, then the dial client, then the local ip lookup.
    self.crawler = TCCrawler()
    self.dial_client = DialClient()
    self._ip = Common.local_ip()
def __init__(self):
    """Create an empty item record together with its own crawler.

    Text fields start as '', the position as 0 and the booking status
    as 1 (0: not on sale, 1: on sale).
    """
    # Item-page crawl settings and creation time.
    self.crawler = TCCrawler()
    self.crawling_time = Common.now()

    # Start time / date / hour of this crawl run.
    self.crawling_begintime = self.crawling_beginDate = self.crawling_beginHour = ''

    # Channel the item belongs to, and the item's position in it.
    self.channel_id = self.channel_name = self.channel_url = self.channel_type = ''
    self.item_position = 0

    # Item information: id, link, picture link, name, description.
    self.item_id = self.item_url = self.item_pic_url = ''
    self.item_name = self.item_desc = ''
    self.item_book_status = 1
    # Level, address, services.
    self.item_level = self.item_area = self.item_service = ''
    # Comment count, satisfaction rate, good-comment count.
    self.item_comment = self.item_comment_rate = self.item_comment_good = ''

    # Pricing: original price, discounted price, discount.
    self.item_oriprice = self.item_disprice = self.item_discount = ''

    # Tickets collected for the item.
    self.item_tickets = []

    # Raw data: listing fragment, item page HTML, pages requested for it.
    self.item_pageData = self.item_page = ''
    self.item_pages = {}
def __init__(self):
    """Wire up storage back-ends, the crawler, messaging and start times."""
    # Worker type and queue type come straight from Config.
    self.worker_type, self.tc_type = Config.TC_Spot, Config.TC_TYPE

    # Storage back-ends: mysql, redis queue, mongodb fs.
    self.mysqlAccess = MysqlAccess()
    self.redisQueue = RedisQueue()
    self.mongofsAccess = MongofsAccess()

    # Crawl settings and message helper.
    self.crawler = TCCrawler()
    self.message = Message()

    # Crawl timing: Common.now() is deliberately called twice, once per
    # attribute, exactly as before.
    self.crawling_time, self.begin_time = Common.now(), Common.now()
    self.begin_date, self.begin_hour = Common.today_s(), Common.nowhour_s()
class Item():
    '''One scenic-spot item of a tc (ly.com) channel and its crawled details.'''

    def __init__(self):
        # Item-page crawl settings
        self.crawler = TCCrawler()
        self.crawling_time = Common.now()  # time this object was created
        self.crawling_begintime = ''  # start time of this crawl run
        self.crawling_beginDate = ''  # date of this crawl run
        self.crawling_beginHour = ''  # hour of this crawl run

        # Channel the item belongs to
        self.channel_id = ''
        self.channel_name = ''
        self.channel_url = ''
        self.channel_type = ''
        self.item_position = 0

        # Item information
        self.item_id = ''  # item id
        self.item_url = ''  # item link
        self.item_pic_url = ''  # link of the item picture
        self.item_name = ''  # item name
        self.item_desc = ''  # item description
        self.item_book_status = 1  # 0: not on sale, 1: on sale
        self.item_level = ''  # level
        self.item_area = ''  # address
        self.item_service = ''  # services
        self.item_comment = ''  # number of comments
        self.item_comment_rate = ''  # satisfaction rate
        self.item_comment_good = ''  # number of good comments

        # Pricing
        self.item_oriprice = ''  # original price
        self.item_disprice = ''  # discounted price
        self.item_discount = ''  # discount

        # Tickets (one Ticket.outSql() result per ticket)
        self.item_tickets = []

        # Raw data
        self.item_pageData = ''  # listing fragment the item came from
        self.item_page = ''  # item page html
        self.item_pages = {}  # pages requested for this item, keyed by label

    def spotConfig(self, _val):
        '''Load a spot item from its listing tuple and crawl its details.

        _val is a 10-tuple: (book_status, id, url, pic_url, name, desc,
        level, area, position, begin_time).  begin_time is handed to
        time.localtime() to derive the crawl date and hour.

        When book_status is 1 the item page is fetched (itemPage() may
        raise) and prices, discount and services are parsed from it,
        followed by the comment summary and the tickets.
        '''
        (self.item_book_status, self.item_id, self.item_url,
         self.item_pic_url, self.item_name, self.item_desc,
         self.item_level, self.item_area, self.item_position,
         self.crawling_begintime) = _val

        begin = time.localtime(self.crawling_begintime)
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        self.crawling_beginHour = time.strftime("%H", begin)

        if self.item_book_status == 1:
            self.itemPage()
            page = self.item_page
            self.item_pages['item-home'] = (self.item_url, self.item_page)

            # Discounted price: first run of digits, else the decoded text.
            m = re.search(r'<div class="s-ppp">\s*<div class="s-tp">(.+?)</div>', page, flags=re.S)
            if m:
                i_disprice = re.sub(r'<.+?>', '', m.group(1))
                m = re.search(r'(\d+)', i_disprice)
                if m:
                    self.item_disprice = m.group(1)
                else:
                    self.item_disprice = Common.htmlDecode(i_disprice)

            # Original price and discount sit together in the "s-mp" block.
            m = re.search(r'<div class="s-ppp">.+?<div class="s-mp"><s>(.+?)</s>\s+<span>(.+?)</span>', page, flags=re.S)
            if m:
                i_oriprice, i_discount = m.group(1), m.group(2)
                m = re.search(r'(\d+)', i_oriprice)
                if m:
                    self.item_oriprice = m.group(1)
                else:
                    self.item_oriprice = Common.htmlDecode(i_oriprice)
                # Prefer a decimal discount, then an integer, then raw text.
                m = re.search(r'(\d+\.\d+)', i_discount) or re.search(r'(\d+)', i_discount)
                if m:
                    self.item_discount = m.group(1)
                else:
                    self.item_discount = i_discount

            # Services: whitespace-separated chunks joined by ';', tags removed.
            m = re.search(r'<div class="s-sr">\s+<span.+?>(.+?)</div>', page, flags=re.S)
            if m:
                self.item_service = re.sub(r'<.+?>', '', ';'.join(m.group(1).split()))

            # NOTE(review): comments and tickets are fetched only for
            # bookable items here - confirm this matches the intended flow.
            self.itemComment()
            self.itemTicket()

    def itemComment(self):
        '''Fetch the comment summary and store count, rate and good count.'''
        c_url = 'http://tctj.ly.com/jrec/wlfrec?_dAjax=callback&cid=105&userId=1001&rid=%d&projectId=3&type=1&flag=2&pageSize=10&page=1&callback=tc%s' % (int(self.item_id), Common.rand_n(11))
        c_page = self.crawler.getData(c_url, self.item_url)
        if not c_page:
            return
        m = re.search(r'"count":.+?"all":"(\d+)"', c_page, flags=re.S)
        if m:
            self.item_comment = m.group(1)
        m = re.search(r'"count":.+?"satisfaction":"(.+?)"', c_page, flags=re.S)
        if m:
            self.item_comment_rate = m.group(1)
        m = re.search(r'"count":.+?"good":"(\d+)"', c_page, flags=re.S)
        if m:
            self.item_comment_good = m.group(1)

    def itemTicket(self):
        '''Fetch the price frame and append one row per ticket to item_tickets.

        Only destinations whose DestinationId equals this item's id are
        used.  Any exception raised while decoding or walking the JSON is
        logged and swallowed, so a bad response never aborts the item.
        '''
        t_url = 'http://www.ly.com/scenery/AjaxHelper/SceneryPriceFrame.aspx?action=GETNEWPRICEFRAMEFORLAST&ids=%d&isSimple=1&isShowAppThree=0&widthtype=1&isGrap=1&nobookid=&isyry=1&YpState=1&lon=null&lat=null' % int(self.item_id)
        result = self.crawler.getData(t_url, self.item_url)
        if not result:
            return
        try:
            scenery = json.loads(result)
            # "in" replaces dict.has_key(), which no longer exists on Python 3.
            if 'SceneryPrices' not in scenery:
                return
            for destination in scenery['SceneryPrices']:
                if 'DestinationId' not in destination:
                    continue
                if int(destination['DestinationId']) != int(self.item_id):
                    continue
                if 'ChannelPriceModelEntityList' not in destination:
                    continue
                for pricemodel in destination['ChannelPriceModelEntityList']:
                    if ('ConsumersTypeId' in pricemodel
                            and 'ConsumersTypeName' in pricemodel
                            and 'ChannelPriceEntityList' in pricemodel):
                        self._appendTickets(pricemodel)
        except Exception as e:
            Common.log('# itemTicket,exception err in load json: %s' % e)
            Common.traceback_log()

    def _appendTickets(self, pricemodel):
        '''Turn every price entry of one consumer-type group into a ticket row.'''
        consumer_type = pricemodel['ConsumersTypeId']
        consumer_type_name = pricemodel['ConsumersTypeName']
        # Ticket positions are 1-based within their consumer-type group.
        for t_i, t_data in enumerate(pricemodel['ChannelPriceEntityList'], 1):
            val = (self.item_id, self.item_name, self.channel_type,
                   consumer_type, consumer_type_name, t_i, t_data,
                   self.crawling_begintime)
            t = Ticket()
            t.antPage(val)
            self.item_tickets.append(t.outSql())

    def itemPage(self):
        '''Fetch the item detail page into item_page.

        Raises:
            Common.NoPageException: item_url is empty, or the request was
                redirected (302) to a page rejected by itempage_judge().
            Common.InvalidPageException: the fetched page is empty.
        '''
        if self.item_url == '':
            raise Common.NoPageException("# itemPage: not find item page, url is null,id:%s,item_url:%s"%(str(self.item_id), self.item_url))

        page = self.crawler.getData(self.item_url, self.channel_url)
        history = self.crawler.history
        if isinstance(history, list) and history and re.search(r'302', str(history[0])):
            # NOTE(review): itempage_judge() is not defined in this class;
            # unless it is supplied elsewhere this line raises
            # AttributeError on every 302 redirect - confirm.
            if not self.itempage_judge(page):
                Common.log('#crawler history:')
                Common.log(self.crawler.history)
                raise Common.NoPageException("# itemPage: not find item page, redirecting to other page,id:%s,item_url:%s"%(str(self.item_id), self.item_url))
        if not page:
            Common.log('#crawler history:')
            Common.log(self.crawler.history)
            raise Common.InvalidPageException("# itemPage: find item page empty,id:%s,item_url:%s"%(str(self.item_id), self.item_url))
        self.item_page = page

    def antPage(self, val):
        '''Entry point: store the channel fields and crawl type-1 (spot) items.

        val is (channel_id, channel_name, channel_url, channel_type, i_val);
        i_val is forwarded to spotConfig() only when channel_type == 1.
        '''
        self.channel_id, self.channel_name, self.channel_url, self.channel_type, i_val = val
        if self.channel_type == 1:
            self.spotConfig(i_val)

    def outTuple(self):
        '''Return the channel and item fields as a flat 22-tuple.'''
        return (self.channel_id, self.channel_name, self.channel_url,
                self.channel_type, self.item_position, self.item_book_status,
                self.item_id, self.item_url, self.item_pic_url,
                self.item_name, self.item_desc, self.item_level,
                self.item_area, self.item_service, self.item_comment,
                self.item_comment_good, self.item_comment_rate,
                self.item_oriprice, self.item_disprice, self.item_discount,
                self.crawling_beginDate, self.crawling_beginHour)

    def outSql(self):
        '''Return the item as a tuple of values, numeric fields stringified.'''
        return (Common.time_s(float(self.crawling_time)), str(self.item_id),
                self.item_name, self.item_desc, self.item_url,
                self.item_pic_url, str(self.item_book_status),
                self.item_level, self.item_area, self.item_service,
                str(self.item_comment), str(self.item_comment_good),
                self.item_comment_rate, str(self.item_oriprice),
                str(self.item_disprice), str(self.item_discount),
                str(self.channel_id), str(self.item_position),
                self.crawling_beginDate, self.crawling_beginHour)
class Item():
    '''One scenic-spot item of a tc (ly.com) channel and its crawled details.'''

    def __init__(self):
        # Item-page crawl settings
        self.crawler = TCCrawler()
        self.crawling_time = Common.now()  # time this object was created
        self.crawling_begintime = ''  # start time of this crawl run
        self.crawling_beginDate = ''  # date of this crawl run
        self.crawling_beginHour = ''  # hour of this crawl run

        # Channel the item belongs to
        self.channel_id = ''
        self.channel_name = ''
        self.channel_url = ''
        self.channel_type = ''
        self.item_position = 0

        # Item information
        self.item_id = ''  # item id
        self.item_url = ''  # item link
        self.item_pic_url = ''  # link of the item picture
        self.item_name = ''  # item name
        self.item_desc = ''  # item description
        self.item_book_status = 1  # 0: not on sale, 1: on sale
        self.item_level = ''  # level
        self.item_area = ''  # address
        self.item_service = ''  # services
        self.item_comment = ''  # number of comments
        self.item_comment_rate = ''  # satisfaction rate
        self.item_comment_good = ''  # number of good comments

        # Pricing
        self.item_oriprice = ''  # original price
        self.item_disprice = ''  # discounted price
        self.item_discount = ''  # discount

        # Tickets (one Ticket.outSql() result per ticket)
        self.item_tickets = []

        # Raw data
        self.item_pageData = ''  # listing fragment the item came from
        self.item_page = ''  # item page html
        self.item_pages = {}  # pages requested for this item, keyed by label

    def spotConfig(self, _val):
        '''Load a spot item from its listing tuple and crawl its details.

        _val is a 10-tuple: (book_status, id, url, pic_url, name, desc,
        level, area, position, begin_time).  begin_time is handed to
        time.localtime() to derive the crawl date and hour.

        When book_status is 1 the item page is fetched (itemPage() may
        raise) and prices, discount and services are parsed from it,
        followed by the comment summary and the tickets.
        '''
        (self.item_book_status, self.item_id, self.item_url,
         self.item_pic_url, self.item_name, self.item_desc,
         self.item_level, self.item_area, self.item_position,
         self.crawling_begintime) = _val

        begin = time.localtime(self.crawling_begintime)
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        self.crawling_beginHour = time.strftime("%H", begin)

        if self.item_book_status == 1:
            self.itemPage()
            page = self.item_page
            self.item_pages['item-home'] = (self.item_url, self.item_page)

            # Discounted price: first run of digits, else the decoded text.
            m = re.search(r'<div class="s-ppp">\s*<div class="s-tp">(.+?)</div>', page, flags=re.S)
            if m:
                i_disprice = re.sub(r'<.+?>', '', m.group(1))
                m = re.search(r'(\d+)', i_disprice)
                if m:
                    self.item_disprice = m.group(1)
                else:
                    self.item_disprice = Common.htmlDecode(i_disprice)

            # Original price and discount sit together in the "s-mp" block.
            m = re.search(r'<div class="s-ppp">.+?<div class="s-mp"><s>(.+?)</s>\s+<span>(.+?)</span>', page, flags=re.S)
            if m:
                i_oriprice, i_discount = m.group(1), m.group(2)
                m = re.search(r'(\d+)', i_oriprice)
                if m:
                    self.item_oriprice = m.group(1)
                else:
                    self.item_oriprice = Common.htmlDecode(i_oriprice)
                # Prefer a decimal discount, then an integer, then raw text.
                m = re.search(r'(\d+\.\d+)', i_discount) or re.search(r'(\d+)', i_discount)
                if m:
                    self.item_discount = m.group(1)
                else:
                    self.item_discount = i_discount

            # Services: whitespace-separated chunks joined by ';', tags removed.
            m = re.search(r'<div class="s-sr">\s+<span.+?>(.+?)</div>', page, flags=re.S)
            if m:
                self.item_service = re.sub(r'<.+?>', '', ';'.join(m.group(1).split()))

            # NOTE(review): comments and tickets are fetched only for
            # bookable items here - confirm this matches the intended flow.
            self.itemComment()
            self.itemTicket()

    def itemComment(self):
        '''Fetch the comment summary and store count, rate and good count.'''
        c_url = 'http://tctj.ly.com/jrec/wlfrec?_dAjax=callback&cid=105&userId=1001&rid=%d&projectId=3&type=1&flag=2&pageSize=10&page=1&callback=tc%s' % (int(self.item_id), Common.rand_n(11))
        c_page = self.crawler.getData(c_url, self.item_url)
        if not c_page:
            return
        m = re.search(r'"count":.+?"all":"(\d+)"', c_page, flags=re.S)
        if m:
            self.item_comment = m.group(1)
        m = re.search(r'"count":.+?"satisfaction":"(.+?)"', c_page, flags=re.S)
        if m:
            self.item_comment_rate = m.group(1)
        m = re.search(r'"count":.+?"good":"(\d+)"', c_page, flags=re.S)
        if m:
            self.item_comment_good = m.group(1)

    def itemTicket(self):
        '''Fetch the price frame and append one row per ticket to item_tickets.

        Only destinations whose DestinationId equals this item's id are
        used.  Any exception raised while decoding or walking the JSON is
        logged and swallowed, so a bad response never aborts the item.
        '''
        t_url = 'http://www.ly.com/scenery/AjaxHelper/SceneryPriceFrame.aspx?action=GETNEWPRICEFRAMEFORLAST&ids=%d&isSimple=1&isShowAppThree=0&widthtype=1&isGrap=1&nobookid=&isyry=1&YpState=1&lon=null&lat=null' % int(self.item_id)
        result = self.crawler.getData(t_url, self.item_url)
        if not result:
            return
        try:
            scenery = json.loads(result)
            # "in" replaces dict.has_key(), which no longer exists on Python 3.
            if 'SceneryPrices' not in scenery:
                return
            for destination in scenery['SceneryPrices']:
                if 'DestinationId' not in destination:
                    continue
                if int(destination['DestinationId']) != int(self.item_id):
                    continue
                if 'ChannelPriceModelEntityList' not in destination:
                    continue
                for pricemodel in destination['ChannelPriceModelEntityList']:
                    if ('ConsumersTypeId' in pricemodel
                            and 'ConsumersTypeName' in pricemodel
                            and 'ChannelPriceEntityList' in pricemodel):
                        self._appendTickets(pricemodel)
        except Exception as e:
            Common.log('# itemTicket,exception err in load json: %s' % e)
            Common.traceback_log()

    def _appendTickets(self, pricemodel):
        '''Turn every price entry of one consumer-type group into a ticket row.'''
        consumer_type = pricemodel['ConsumersTypeId']
        consumer_type_name = pricemodel['ConsumersTypeName']
        # Ticket positions are 1-based within their consumer-type group.
        for t_i, t_data in enumerate(pricemodel['ChannelPriceEntityList'], 1):
            val = (self.item_id, self.item_name, self.channel_type,
                   consumer_type, consumer_type_name, t_i, t_data,
                   self.crawling_begintime)
            t = Ticket()
            t.antPage(val)
            self.item_tickets.append(t.outSql())

    def itemPage(self):
        '''Fetch the item detail page into item_page.

        Raises:
            Common.NoPageException: item_url is empty, or the request was
                redirected (302) to a page rejected by itempage_judge().
            Common.InvalidPageException: the fetched page is empty.
        '''
        if self.item_url == '':
            raise Common.NoPageException("# itemPage: not find item page, url is null,id:%s,item_url:%s"%(str(self.item_id), self.item_url))

        page = self.crawler.getData(self.item_url, self.channel_url)
        history = self.crawler.history
        if isinstance(history, list) and history and re.search(r'302', str(history[0])):
            # NOTE(review): itempage_judge() is not defined in this class;
            # unless it is supplied elsewhere this line raises
            # AttributeError on every 302 redirect - confirm.
            if not self.itempage_judge(page):
                Common.log('#crawler history:')
                Common.log(self.crawler.history)
                raise Common.NoPageException("# itemPage: not find item page, redirecting to other page,id:%s,item_url:%s"%(str(self.item_id), self.item_url))
        if not page:
            Common.log('#crawler history:')
            Common.log(self.crawler.history)
            raise Common.InvalidPageException("# itemPage: find item page empty,id:%s,item_url:%s"%(str(self.item_id), self.item_url))
        self.item_page = page

    def antPage(self, val):
        '''Entry point: store the channel fields and crawl type-1 (spot) items.

        val is (channel_id, channel_name, channel_url, channel_type, i_val);
        i_val is forwarded to spotConfig() only when channel_type == 1.
        '''
        self.channel_id, self.channel_name, self.channel_url, self.channel_type, i_val = val
        if self.channel_type == 1:
            self.spotConfig(i_val)

    def outTuple(self):
        '''Return the channel and item fields as a flat 22-tuple.'''
        return (self.channel_id, self.channel_name, self.channel_url,
                self.channel_type, self.item_position, self.item_book_status,
                self.item_id, self.item_url, self.item_pic_url,
                self.item_name, self.item_desc, self.item_level,
                self.item_area, self.item_service, self.item_comment,
                self.item_comment_good, self.item_comment_rate,
                self.item_oriprice, self.item_disprice, self.item_discount,
                self.crawling_beginDate, self.crawling_beginHour)

    def outSql(self):
        '''Return the item as a tuple of values, numeric fields stringified.'''
        return (Common.time_s(float(self.crawling_time)), str(self.item_id),
                self.item_name, self.item_desc, self.item_url,
                self.item_pic_url, str(self.item_book_status),
                self.item_level, self.item_area, self.item_service,
                str(self.item_comment), str(self.item_comment_good),
                self.item_comment_rate, str(self.item_oriprice),
                str(self.item_disprice), str(self.item_discount),
                str(self.channel_id), str(self.item_position),
                self.crawling_beginDate, self.crawling_beginHour)
class RetryCrawler():
    '''Fetch pages through TCCrawler with retries, back-off and re-dialling.'''

    def __init__(self):
        # Crawl settings
        self.crawler = TCCrawler()
        # Client used to ask the router for a new connection
        self.dial_client = DialClient()
        # Local ip sent along with every dial request
        self._ip = Common.local_ip()
        # Router tag sent along with every dial request
        self._tag = 'ikuai'
        # Base wait time in seconds; multiplied by the retry counter
        self.w_time = 1

    def dialRouter(self, _type, _obj):
        '''Send a "<_type>_<_obj>" dial request; failures are only logged.'''
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def _redial(self):
        '''Ask the router to re-dial, then wait 10-30 seconds.'''
        try:
            self.dialRouter(4, 'chn')
        except Exception as e:
            Common.log('# DailClient Exception err: %s' % e)
        time.sleep(random.uniform(10, 30))

    def getData(self, url, refers='', max_retry=20):
        '''Return the page at url, retrying on recoverable failures.

        Args:
            url: address to fetch.
            refers: referer passed through to the crawler.
            max_retry: upper bound on the number of attempts.

        Returns:
            The page returned by the crawler, or '' when every attempt
            failed or an unrecoverable exception was raised.

        Recovery per failure type:
            * invalid page / system busy: wait w_time * retry seconds;
            * denied page, DNS failure: re-dial the router, wait 10-30 s;
            * read timeout: wait 1-3 s;
            * anything else: give up immediately.
        '''
        page = ''
        retry = 1
        while True:
            try:
                page = self.crawler.getData(url, refers)
                break
            except Common.InvalidPageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Invalid page exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Common.DenypageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Deny page exception: %s' % e)
                self._redial()
            except Common.SystemBusyException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# System busy exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Exception as e:
                Common.log('# exception err in retry crawler: %s' % e)
                msg = str(e)
                if 'Read timed out' in msg:
                    if retry >= max_retry:
                        break
                    retry += 1
                    time.sleep(random.uniform(1, 3))
                # Substring tests use "in": a bare str.find() result is -1
                # (truthy) when the text is absent, which made this branch
                # fire for every unknown error instead of giving up.
                elif ('Name or service not known' in msg
                        or 'Temporary failure in name resolution' in msg):
                    if retry >= max_retry:
                        break
                    retry += 1
                    self._redial()
                else:
                    break
        return page
class Channel():
    '''A tc (ly.com) channel: its listing page, its items and sub-channels.'''

    def __init__(self):
        # Crawl settings
        self.crawler = TCCrawler()
        self.retrycrawler = RetryCrawler()
        self.crawling_time = Common.now()  # time this object was created
        self.crawling_time_s = Common.time_s(self.crawling_time)
        self.crawling_begintime = ''  # start time of this crawl run
        self.crawling_beginDate = ''  # date of this crawl run
        self.crawling_beginHour = ''  # hour of this crawl run

        # Channel information
        self.platform = '同程-pc'  # platform label
        self.channel_id = ''  # channel id
        self.channel_url = ''  # channel link
        self.channel_name = ''  # channel name
        self.channel_type = ''  # channel type

        # Raw data
        self.channel_page = ''  # channel page html
        self.channel_pages = {}  # pages requested for this channel, by label

        # Item tuples collected by get_items()
        self.channel_items = []

        # Sub-channel tuples collected by channelList()
        self.channel_list = []

    def init(self, channel_id, channel_url, channel_type, begin_time):
        '''Store the channel identity and derive the crawl date and hour.

        begin_time is handed to time.localtime().
        '''
        self.channel_id = channel_id
        self.channel_url = channel_url
        self.channel_type = channel_type
        self.crawling_begintime = begin_time
        begin = time.localtime(self.crawling_begintime)
        self.crawling_beginDate = time.strftime("%Y-%m-%d", begin)
        self.crawling_beginHour = time.strftime("%H", begin)

    def config(self):
        '''Fetch the channel page and parse it according to channel_type.'''
        self.channelPage()
        if self.channel_type == 1:
            self.spot()
        else:
            Common.log('# not find this channel type...')

    def _hiddenInt(self, page, span_id, default=0):
        '''Return the integer inside <span id="span_id">, or default.

        default is returned when the span is missing or holds only
        whitespace; other non-numeric text still raises ValueError.
        '''
        m = re.search(r'<span id="%s">(.*?)</span>' % span_id, page, flags=re.S)
        if not m:
            return default
        text = m.group(1).strip()
        if not text:
            return default
        return int(text)

    def spot(self):
        '''Collect every item of a spot channel, following its pagination.

        The first page comes from channel_page; the remaining pages are
        requested through the retrying crawler.  The list URL is sent
        with an empty "kw" parameter.
        '''
        if not self.channel_page:
            return

        m = re.search(r'<title>(.+?)</title>', self.channel_page, flags=re.S)
        if m:
            self.channel_name = m.group(1)

        pid = self._hiddenInt(self.channel_page, 'hdPid')
        cid = self._hiddenInt(self.channel_page, 'hdCid')
        cyid = self._hiddenInt(self.channel_page, 'hdCyid')

        i_p = 1
        i_page = 1
        m_page = 1
        # Prefer the listing container; fall back to the whole page.
        m = re.search(r'<div class="scenery_main" id="sceneryListInfo">(.+?)<div id="pageNum_box" class="s_pager none">', self.channel_page, flags=re.S)
        page_main = m.group(1) if m else self.channel_page

        Common.log(i_page)
        i_p = self.get_items(page_main, i_p)

        m = re.search(r'<input type="hidden" id="txt_AllpageNumber" value="(.+?)">', page_main, flags=re.S)
        if m:
            m_page = int(m.group(1))

        page_url = 'http://www.ly.com/scenery/SearchList.aspx?&action=getlist&page=%d&kw=&pid=%d&cid=%d&cyid=%d&theme=0&grade=0&money=0&sort=0&paytype=0&ismem=0&istuan=0&isnow=0&spType=&isyiyuan=0&lbtypes=&IsNJL=0&classify=0'
        while i_page < m_page:
            i_page += 1
            p_url = page_url % (i_page, pid, cid, cyid)
            Common.log(i_page)
            page = self.retrycrawler.getData(p_url, self.channel_url)
            i_p = self.get_items(page, i_p)

    def get_items(self, page_main, i_p):
        '''Append one tuple per listed item to channel_items.

        Args:
            page_main: listing html (may be empty).
            i_p: position to give to the first item found.

        Returns:
            The position following the last item found.
        '''
        if not page_main:
            return i_p

        p = re.compile(r'<div class="scenery_list(.+?)">\s*<div class="s_info"><div class="img_con"><a class="a_img".+?href="(.+?)"><img.+?src="(.+?)".*?></a></div><div class="info_con"><dl class="info_top"><dt><a class="fir_name".+?>(.+?)</a>.+?<span class="s_level">(.*?)</span>.+?<dd class="scenery_area"><span>(.+?)</span>.+?</dl></div></div>', flags=re.S)
        for info in p.finditer(page_main):
            all_info = info.group()
            i_info = info.group(1)
            i_url = Config.tc_home + info.group(2)
            i_img = info.group(3)
            i_name = info.group(4)
            i_level = re.sub(r'<.+?>', '', info.group(5))
            i_area = ' '.join(info.group(6).split())

            i_book = 1
            i_desc = ''
            m = re.search(r'<dd class="scenery_desc"><p>(.+?)</p>', all_info, flags=re.S)
            if m:
                i_desc = m.group(1)
            # "nobook" in the list-entry class marks an item not on sale.
            if i_info.find('nobook') != -1:
                i_book = 0
            # Fall back to the state text when there is no description.
            if i_desc == '':
                m = re.search(r'<dd class="scenery_state">(.+?)<a', all_info, flags=re.S)
                if m:
                    i_desc = m.group(1)

            i_id = 0
            if i_url != '':
                m = re.search(r'BookSceneryTicket_(\d+).html', i_url)
                if m:
                    i_id = m.group(1)

            val = (self.channel_id, self.channel_name, self.channel_url, self.channel_type,
                   (i_book, i_id, i_url, i_img, i_name, i_desc, i_level, i_area, i_p, self.crawling_begintime))
            self.channel_items.append(val)
            i_p += 1
        return i_p

    def channelList(self):
        '''Walk every city of the channel page and collect its districts.

        Each district found is appended to channel_list as
        (district id, name, url, channel type, city id, city name,
        province id, province name).
        '''
        self.channelPage()
        if not self.channel_page:
            return

        for city in self.moreCity(self.channel_page, 'city'):
            city_url, city_name, province_id, city_id = city[:4]
            if not city_url:
                continue
            # Names are re-read from the city page's "current" links.
            province_name, city_name = '', ''
            city_page = self.crawler.getData(city_url, self.channel_url)
            if not city_page:
                continue

            m = re.search(r'<div class="search_screen_dl"><dl action="province">.+?<div class="right"><a href=".+?" class="current" tvalue="(\d+)" title="(.+?)">', city_page, flags=re.S)
            if m:
                province_id, province_name = m.group(1), m.group(2)
            m = re.search(r'<div class="search_screen_dl">.+?<dl action="city">.+?<div class="right"><a href=".+?" class="current" tvalue="(\d+)" title="(.+?)">', city_page, flags=re.S)
            if m:
                city_id, city_name = m.group(1), m.group(2)

            for channel in self.moreCity(city_page, 'district'):
                channel_url, channel_name = channel[0], channel[1]
                cy_id = channel[4]
                self.channel_list.append((str(cy_id), channel_name, channel_url, str(self.channel_type), city_id, city_name, province_id, province_name))

    def moreCity(self, page, action_key):
        '''List the links of one filter row ('city' or 'district') of page.

        Returns:
            A list of (url, name, province id, city id, district id)
            tuples, empty when page is empty or the row is not found.
            The province/city/district ids default to the page's hidden
            hdPid/hdCid/hdCyid values (0 when missing or empty); the id
            matching action_key is replaced by each link's own value.
        '''
        city_list = []
        if not page:
            return city_list

        p_id = self._hiddenInt(page, 'hdPid')
        c_id = self._hiddenInt(page, 'hdCid')
        cy_id = self._hiddenInt(page, 'hdCyid')

        p_url = 'http://www.ly.com/scenery/scenerysearchlist_%d_%d__0_0_0_%d_0_0_0.html'
        m = re.search(r'<div class="search_screen_box" id="searchScreenBox">.+?<dl action="%s".+?>.+?<dd>.+?<div class="right">(.+?)</div></dd>' % action_key, page, flags=re.S)
        if m:
            city_infos = m.group(1)
            p = re.compile(r'<a.+?tvalue="(\d+)" title="(.+?)">', flags=re.S)
            for city in p.finditer(city_infos):
                city_id, city_name = int(city.group(1)), city.group(2)
                if action_key == 'city':
                    c_id = city_id
                elif action_key == 'district':
                    cy_id = city_id
                city_url = p_url % (p_id, c_id, cy_id)
                city_list.append((city_url, city_name, p_id, c_id, cy_id))
        return city_list

    def channelPage(self):
        '''Fetch the channel page into channel_page and channel_pages.

        Does nothing when channel_url is empty.

        Raises:
            Common.InvalidPageException: the crawler returned no data
                (empty string or None).
        '''
        if not self.channel_url:
            return
        data = self.crawler.getData(self.channel_url, Config.tc_home)
        # Any falsy result counts as a missing page, None included.
        if not data:
            raise Common.InvalidPageException("# channelPage:not find channel page,channel_id:%s,channel_url:%s"%(str(self.channel_id), self.channel_url))
        self.channel_page = data
        self.channel_pages['channel-home'] = (self.channel_url, data)

    def antPage(self, val):
        '''Entry point: val is (channel_id, channel_url, channel_type, begin_time).'''
        channel_id, channel_url, channel_type, begin_time = val
        self.init(channel_id, channel_url, channel_type, begin_time)
        self.config()

    def antChannelList(self, val):
        '''Entry point for sub-channel discovery: val is (channel_url, channel_type).'''
        self.channel_url, self.channel_type = val
        self.channelList()
class RetryCrawler():
    '''Fetch pages through TCCrawler with retries, back-off and re-dialling.'''

    def __init__(self):
        # Crawl settings
        self.crawler = TCCrawler()
        # Client used to ask the router for a new connection
        self.dial_client = DialClient()
        # Local ip sent along with every dial request
        self._ip = Common.local_ip()
        # Router tag sent along with every dial request
        self._tag = 'ikuai'
        # Base wait time in seconds; multiplied by the retry counter
        self.w_time = 1

    def dialRouter(self, _type, _obj):
        '''Send a "<_type>_<_obj>" dial request; failures are only logged.'''
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def _redial(self):
        '''Ask the router to re-dial, then wait 10-30 seconds.'''
        try:
            self.dialRouter(4, 'chn')
        except Exception as e:
            Common.log('# DailClient Exception err: %s' % e)
        time.sleep(random.uniform(10, 30))

    def getData(self, url, refers='', max_retry=20):
        '''Return the page at url, retrying on recoverable failures.

        Args:
            url: address to fetch.
            refers: referer passed through to the crawler.
            max_retry: upper bound on the number of attempts.

        Returns:
            The page returned by the crawler, or '' when every attempt
            failed or an unrecoverable exception was raised.

        Recovery per failure type:
            * invalid page / system busy: wait w_time * retry seconds;
            * denied page, DNS failure: re-dial the router, wait 10-30 s;
            * read timeout: wait 1-3 s;
            * anything else: give up immediately.
        '''
        page = ''
        retry = 1
        while True:
            try:
                page = self.crawler.getData(url, refers)
                break
            except Common.InvalidPageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Invalid page exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Common.DenypageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Deny page exception: %s' % e)
                self._redial()
            except Common.SystemBusyException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# System busy exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Exception as e:
                Common.log('# exception err in retry crawler: %s' % e)
                msg = str(e)
                if 'Read timed out' in msg:
                    if retry >= max_retry:
                        break
                    retry += 1
                    time.sleep(random.uniform(1, 3))
                # Substring tests use "in": a bare str.find() result is -1
                # (truthy) when the text is absent, which made this branch
                # fire for every unknown error instead of giving up.
                elif ('Name or service not known' in msg
                        or 'Temporary failure in name resolution' in msg):
                    if retry >= max_retry:
                        break
                    retry += 1
                    self._redial()
                else:
                    break
        return page