def __init__(self, *args, **kwargs):
    """Set up the spider's paging state and city list.

    ``self.curPage`` is restored from the ``jqw_cur_page`` key of ``rdb``
    (0 when the key is missing or empty), ``self.maxPage`` is the constant
    500, and ``self.cityCode`` is a hard-coded list of numeric codes
    (presumably the site's city ids -- confirm against the caller).

    Args:
        *args: Positional arguments forwarded to the parent initialiser.
        **kwargs: Keyword arguments forwarded to the parent initialiser.
    """
    # Look the key up once: a second lookup costs another round trip and
    # may return None if the key vanishes between the check and the use.
    stored_page = rdb.get('jqw_cur_page')
    self.curPage = int(stored_page.decode()) if stored_page else 0

    # NOTE(review): the page limit is a constant; the 'jqw_pages' key is not
    # consulted here -- confirm that this is intended.
    self.maxPage = 500

    self.cityCode = [
        1, 129, 350, 371, 453, 573, 686, 840, 973, 1095, 1138, 1170,
        1223, 1245, 1409, 1551, 1726, 1866, 1887, 1991, 2104, 2242,
        2346, 2393, 2588, 2782, 2941, 3091, 3169, 3302, 3431, 3554,
        3973, 4002,
    ]
    super(JQSpider, self).__init__(*args, **kwargs)
def __init__(self, *args, **kwargs):
    """Restore ``self.count`` from ``rdb`` and initialise the parent class.

    ``self.count`` becomes the integer stored under the ``yhby_page`` key,
    or 1 when that key is missing or empty.

    Args:
        *args: Positional arguments forwarded to the parent initialiser.
        **kwargs: Keyword arguments forwarded to the parent initialiser.
    """
    stored_page = rdb.get('yhby_page')
    self.count = int(stored_page.decode()) if stored_page else 1
    super(YhbySpider, self).__init__(*args, **kwargs)
# Return a proxy address for the spider.
#
# The 'proxy' key is read from `rdb` first; when that value is empty, the
# body of an HTTP GET to the proxy-pool endpoint is used instead. A bytes
# value is decoded to str, and one return path prefixes 'http://'.
#
# NOTE(review): the block nesting is lost in this collapsed form. It is not
# possible to tell from here whether the `if proxy:` branch (decode and
# prefix with 'http://') applies to both sources or only to the freshly
# fetched value, nor which inputs reach the final `return proxy.decode()`.
# Confirm against the properly indented source before refactoring.
# NOTE(review): `requests.get` is called without a timeout and the endpoint
# address is hard-coded -- confirm both are intended.
def get_new_proxy(self): proxy = rdb.get('proxy') if not proxy: proxy = requests.get("http://182.92.190.100:5010/get/").content if proxy: if isinstance(proxy, bytes): proxy = proxy.decode() return 'http://' + proxy return proxy.decode()
def __init__(self, *args, **kwargs):
    """Initialise the spider and restore its crawl position from ``rdb``.

    Sets ``self.domain``, ``self.cur_code`` (integer from the ``cslm_city``
    key, 0 when missing or empty), ``self.cur_url`` (string from the
    ``cslm_url`` key, ``''`` when missing or empty) and ``self.city_codes``.
    When ``self.cur_code`` is one of the known codes, ``self.city_codes``
    starts at that code; otherwise it is the full list.

    Args:
        *args: Positional arguments forwarded to the parent initialiser.
        **kwargs: Keyword arguments forwarded to the parent initialiser.
    """
    super(CslmSpider, self).__init__(*args, **kwargs)
    self.domain = 'http://www.ccoo.cn/'

    # Look each key up once: a second lookup costs another round trip and
    # may return None if the key vanishes between the check and the use.
    stored_code = rdb.get('cslm_city')
    self.cur_code = int(stored_code.decode()) if stored_code else 0
    stored_url = rdb.get('cslm_url')
    self.cur_url = stored_url.decode() if stored_url else ''

    city_codes = [
        150, 153, 156, 182, 184, 196, 206, 212, 227, 228, 250, 276, 291,
        301, 312, 337, 366, 379, 396, 399, 419, 430, 441, 443, 453, 466,
        476, 776, 778, 777, 3251,
    ]
    # Skip the codes that precede the stored one; the stored code itself
    # stays in the list.
    if self.cur_code in city_codes:
        city_codes = city_codes[city_codes.index(self.cur_code):]
    self.city_codes = city_codes

    self.log('init cur_code:{} cur_url: {}'.format(self.cur_code,
                                                   self.cur_url))
def parseCityChannel(self, response):
    """Collect the current city's channels (food, movies, leisure, etc.).

    The city is read from the ``dzdp_cur_city`` key of ``rdb``. For every
    anchor matched by the channel selector, a pickled ``{'name', 'href'}``
    dict is appended to the list ``dzdp_<city>_channels`` and the pair is
    also written to the hash ``dzdp_<city>_channels_hash``. Afterwards
    ``getChannelClassify`` is invoked for the city.

    Args:
        response: Response object exposing ``css()``; presumably a Scrapy
            response -- confirm against the caller.
    """
    current_city = rdb.get('dzdp_cur_city')
    if isinstance(current_city, bytes):
        current_city = current_city.decode()

    # Both key names depend only on the city, so build them once.
    list_key = 'dzdp_{}_channels'.format(current_city)
    hash_key = '{}_hash'.format(list_key)

    anchors = response.css('div.J_filter_channel div.nc-contain div div a')
    for anchor in anchors:
        channel_name = anchor.xpath('./text()').extract_first()
        channel_href = anchor.xpath('./@href').extract_first()
        payload = pickle.dumps({'name': channel_name, 'href': channel_href})
        rdb.rpush(list_key, payload)
        rdb.hset(hash_key, channel_name, channel_href)

    self.getChannelClassify(current_city, None, '生活服务')