def filter_link(link):
    """Decide whether ``link`` passes the Redis-backed duplicate filter.

    ``link.url`` is added to the Redis set named by the
    ``DUPEFILTER_LINK_KEY`` setting, falling back to
    ``self.DUPEFILTER_LINK_KEY`` when the setting is missing or empty.

    ``self``, ``settings``, ``r`` and ``log`` are taken from the enclosing
    scope, which is not visible from this function (presumably a closure
    defined inside a method -- TODO confirm).

    :param link: object exposing a ``url`` attribute.
    :returns: the truthy ``SADD`` result when the URL was not in the set
        before; ``True`` when the URL was already in the set but is also a
        member of the set named by the ``FORBIDDEN_KEY`` setting; otherwise
        the falsy ``SADD`` result.
    """
    # A ``None`` default keeps a missing setting from raising AttributeError,
    # so the ``or`` fallback below can actually take effect.
    key = getattr(settings, 'DUPEFILTER_LINK_KEY', None) or self.DUPEFILTER_LINK_KEY

    added = r.sadd(key, link.url)
    if added:
        # First time this URL is seen.
        return added

    # Already seen: still let it through when it is recorded in the
    # "forbidden" set. The forbidden check is optional, hence the default.
    forbidden_key = getattr(settings, 'FORBIDDEN_KEY', None)
    # SISMEMBER tests membership on the server instead of fetching the whole
    # set with SMEMBERS just to look up a single URL.
    if forbidden_key and self.server.sismember(forbidden_key, link.url):
        return True

    # NOTE(review): the log call is assumed to belong to the duplicate path
    # only -- confirm against the original layout.
    log.msg(message='(spider:dupefilter:link) - %s' % link.url, level=log.DEBUG)
    return added
def parse(self, response):
    """Yield group and membership items scraped from ``response``.

    For every ``<li class="">`` element in the page, the group's link,
    title and counter text are extracted. A ``GroupItem`` is yielded only
    when the link is newly added to the Redis set ``<self.name>_group``;
    a ``JoinsItem`` tying the "people" URL to the group link is yielded
    for every entry.

    :param response: object exposing ``url`` and ``xpath()``.
    :raises IndexError: when one of the XPath queries matches nothing
        for an entry.
    """
    def first(node, query):
        # First extracted match of ``query`` under ``node``.
        return node.xpath(query).extract()[0]

    # Drop as many trailing characters as 'joins' has
    # (presumably the URL ends with that path segment -- TODO confirm).
    suffix_len = len('joins')
    member = response.url[:-suffix_len]

    for entry in response.xpath('//li[@class=""]'):
        group_url = first(entry, './/div[@class="title"]/a/@href')
        title = first(entry, './/div[@class="title"]/a/@title')
        # Counter text looks like "(xxx)"; strip one character on each side.
        count_text = first(entry, './/span[@class="num"]/text()')[1:-1]

        is_new = r.sadd(self.name + '_group', group_url)
        if is_new:
            group_fields = {
                'page': group_url,
                'name': title,
                'num': int(count_text),
            }
            yield GroupItem(group_fields)

        membership = {
            'people': member,
            'group': group_url
        }
        yield JoinsItem(membership)