def start_requests(self):
    """Seed the crawl from the cached city list, or fetch that list first.

    Reads the ``dzdp_citys`` Redis list. When it has entries, the requests
    produced by ``getCityChannel`` are yielded; otherwise a single request
    for the Dianping city-list page is yielded with ``parseCitys`` as its
    callback.

    Yields:
        Request: the requests to schedule.
    """
    citys = rdb.lrange('dzdp_citys', 0, -1)
    if citys:
        # getCityChannel is a generator: merely calling it executes none of
        # its body, so its requests have to be re-yielded from here.
        for request in self.getCityChannel(citys):
            yield request
    else:
        url = 'http://www.dianping.com/citylist'
        yield Request(url, callback=self.parseCitys, dont_filter=True)
def getCityChannel(self, citys=None):
    """Yield one listing request per cached city.

    Args:
        citys: iterable of pickled city records. When empty or omitted, the
            ``dzdp_citys`` Redis list is read instead.

    Yields:
        Request: a request for ``<city href>/g<classify id>`` with
        ``parseItems`` as its callback.

    Side effects:
        Logs each raw record and overwrites the ``dzdp_cur_city`` Redis key
        with the record's ``en_name`` before its requests are produced.
    """
    city_blobs = citys or rdb.lrange('dzdp_citys', 0, -1)

    for blob in city_blobs:
        # The log line intentionally shows the raw (still pickled) record.
        self.log('请求获取城市频道: {}'.format(blob))

        # NOTE(review): pickle.loads executes whatever was stored in Redis;
        # this is only safe while that Redis instance is trusted.
        city = pickle.loads(blob)
        rdb.set('dzdp_cur_city', city.get('en_name'))

        channel_url = city.get('href')
        # Only the first classify id is used (slice of length <= 1).
        for classify_id in self.classify[:1]:
            channel_url = '{}/g{}'.format(channel_url, classify_id)
            yield Request(
                channel_url,
                callback=self.parseItems,
                dont_filter=True,
            )
def parse(self, response):
    """Collect company contact-page URLs from one search-result page.

    For every supplier entry on the page, the link found under
    ``gysItemsL`` gets ``contact.html`` appended and is pushed onto the
    ``yhby_urls`` Redis list. ``self.count`` is then stored under
    ``yhby_page``. Once ``self.count`` equals 99, every URL stored in
    ``yhby_urls`` is requested with ``parseCompany`` as its callback.

    Args:
        response: the search-result page response.

    Yields:
        Request: company contact-page requests (only when
        ``self.count == 99``).
    """
    entries = response.css(
        'div.searchPdListCon div.searchPdListConL div.gysItemsWrap')
    for entry in entries:
        href = entry.xpath(
            './div[@class="gysItems"]/div[@class="gysItemsL"]/a/@href'
        ).extract_first()
        if not href:
            # extract_first() returns None when the entry has no link;
            # concatenating onto None would raise TypeError and abort the
            # rest of the page.
            continue
        rdb.rpush('yhby_urls', href + 'contact.html')

    # NOTE(review): self.count is the value most recently set by
    # start_requests, which may not be the page this response belongs to.
    rdb.set('yhby_page', self.count)

    if self.count == 99:
        # Values are decoded before use, as in the original code.
        for raw_url in rdb.lrange('yhby_urls', 0, -1):
            yield Request(raw_url.decode(), self.parseCompany,
                          dont_filter=True)
def start_requests(self):
    """Resume the crawl from ``self.count``.

    When ``self.count`` is 99, the stored company URLs from the
    ``yhby_urls`` Redis list are requested with ``parseCompany`` as the
    callback. Otherwise search pages ``self.count`` .. 99 are requested
    with ``parse`` as the callback, and ``self.count`` is updated to each
    page number as its request is generated.

    Yields:
        Request: the requests to schedule.
    """
    # The counter is formatted into the message. Passed as a second
    # positional argument it would be taken as the log level by Scrapy's
    # Spider.log(message, level=...) (assuming this class is a Scrapy spider).
    self.log('start request start:********************** {}'.format(
        self.count))

    start = self.count
    if start == 99:
        for raw_url in rdb.lrange('yhby_urls', 0, -1):
            yield Request(raw_url.decode(), self.parseCompany,
                          dont_filter=True)
    else:
        for page in range(start, 100):
            # Updated lazily, i.e. only as the engine pulls each request.
            self.count = page
            url = 'http://www.youboy.com/scom?kw=&p={}'.format(page)
            yield Request(url, self.parse, dont_filter=True)
def getChannelClassify(self, city, channels=None, specialName=''):
    """Yield requests for the classification pages of a city's channels.

    Args:
        city: city identifier used to build the Redis key names.
        channels: iterable of pickled channel records. When empty or
            omitted, the ``dzdp_<city>_channels`` Redis list is read.
            Ignored when ``specialName`` is given.
        specialName: when non-empty, only this channel is requested; its
            href is looked up in the ``dzdp_<city>_channels_hash`` hash.

    Yields:
        Request: requests with ``parseClassify`` as their callback.
    """
    if specialName:
        hash_key = 'dzdp_{}_channels_hash'.format(city)
        channel_href = rdb.hget(hash_key, specialName)
        self.log('获取{}频道分类信息:{}'.format(specialName, channel_href))
        if not channel_href:
            # hget returns None for an unknown field; a Request cannot be
            # built without a URL, so report it and stop.
            self.log('no channel href stored for {} in {}'.format(
                specialName, hash_key))
            return
        if isinstance(channel_href, bytes):
            # The client hands back bytes elsewhere in this file (values
            # are .decode()d before use); Request needs a text URL.
            channel_href = channel_href.decode()
        yield Request(channel_href, self.parseClassify, dont_filter=True)
        return

    if not channels:
        channels = rdb.lrange('dzdp_{}_channels'.format(city), 0, -1)

    cur_channel_key = 'dzdp_{}_cur_channel'.format(city)
    for raw_channel in channels:
        # The log line intentionally shows the raw (still pickled) record.
        self.log('获取频道的分类信息:{}'.format(raw_channel))
        # NOTE(review): pickle.loads executes whatever was stored in Redis;
        # this is only safe while that Redis instance is trusted.
        channel = pickle.loads(raw_channel)
        # Was rdb.rset(...): Redis clients expose no such command, and the
        # rest of the file records "current" markers with rdb.set.
        rdb.set(cur_channel_key, channel.get('name'))
        yield Request(channel.get('href'), self.parseClassify,
                      dont_filter=True)