def setup_redis(self):
    """Connect to redis and wire up the spider's scheduling signals.

    Must be called only after the spider has been given its scrapy
    crawler object, since it reads ``self.crawler.settings``.
    """
    # Fall back to the conventional per-spider queue name.
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    cfg = self.crawler.settings
    self.server = ConnectionFactory().create_redis_connection(cfg)
    # The idle signal fires when the spider has no requests left —
    # that is the moment we pull new requests out of the redis queue.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from redis list '%s'" % self.redis_key)
    self.paused = False
def setup_redis(self):
    """Set up the redis connection and the idle/item-scraped signals.

    This should be called after the spider has set its scrapy
    (crawler) object, because it reads ``self.crawler.settings``.
    """
    # Default queue name is '<spider name>:start_urls'.
    if not self.redis_key:
        self.redis_key = "%s:start_urls" % self.name
    settings = self.crawler.settings
    self.server = ConnectionFactory().create_redis_connection(settings)
    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from redis queue
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    # self.scrapy.signals.connect(self.spider_closed, signal=signals.spider_closed)
    self.log("Reading URLs from redis list '%s'" % self.redis_key)
    # Pause flag consumed by spider_pause/spider_resume elsewhere in the file.
    self.paused = False
class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue.

    The spider is fed by popping pickled entries off a redis list (or
    set, when the ``REDIS_SET`` setting is true) and turning each into
    a scrapy ``Request``.  The idle and item-scraped signals keep the
    queue drained without letting the spider close.
    """

    # Redis key to read from; None means use '<spider name>:start_urls'.
    redis_key = None

    def setup_redis(self):
        """Set up the redis connection and the scheduling signals.

        This should be called after the spider has set its scrapy
        (crawler) object, because it reads ``self.crawler.settings``.
        """
        if not self.redis_key:
            self.redis_key = '%s:start_urls' % self.name
        settings = self.crawler.settings
        self.server = ConnectionFactory().create_redis_connection(settings)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from redis list '%s'" % self.redis_key)
        self.paused = False

    def next_request(self):
        """Return a Request built from the next queue entry, or None.

        Queue entries are pickled dicts with keys 'url', 'cookies',
        'product_code' and 'link_hash' (as used below).
        """
        # NOTE(review): this reads self.settings while setup_redis reads
        # self.crawler.settings — presumably the same object; confirm.
        if self.settings.getbool('REDIS_SET'):
            payload = self.server.spop(self.redis_key)
        else:
            payload = self.server.lpop(self.redis_key)
        if not payload:
            return None
        # SECURITY: pickle.loads executes arbitrary code if an attacker
        # can write to this redis queue — treat the queue as trusted
        # input only, or switch to a safe serializer (e.g. json).
        entry = pickle.loads(payload)
        # Debug prints replaced with the spider's own logger so output
        # is consistent with setup_redis and works on Python 3.
        self.log("next_request link_hash=%s product_code=%s"
                 % (entry['link_hash'], entry['product_code']))
        cookie = ''
        if entry['cookies'] is not None and entry['cookies'] != '':
            # SECURITY: eval() on queue-supplied text runs arbitrary
            # code; ast.literal_eval is the safe drop-in if the value is
            # always a literal dict — TODO confirm before switching.
            cookie = eval(entry['cookies'])
        return Request(entry['url'],
                       cookies=cookie,
                       meta={'product_code': entry['product_code'],
                             'link_hash': entry['link_hash']},
                       dont_filter=True)

    def schedule_next_request(self):
        """Schedule one request from redis if one is available."""
        req = self.next_request()
        if req:
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedule a request if available; never let the spider close."""
        # NOTE(review): pausing is checked via the engine's paused flag,
        # not the self.paused flag set by spider_pause/spider_resume.
        if not self.crawler.engine.paused:
            self.schedule_next_request()
        raise DontCloseSpider

    def item_scraped(self, *args, **kwargs):
        """Schedule the next request without waiting for the spider to idle."""
        self.schedule_next_request()

    def spider_pause(self):
        """Set the pause flag (flag only — see note in spider_idle)."""
        self.paused = True

    def spider_resume(self):
        """Clear the pause flag."""
        self.paused = False
class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""

    # Redis key to read from; None means use default '<spider>:start_urls'.
    redis_key = None

    def setup_redis(self):
        """Set up the redis connection and the idle/item-scraped signals.

        This should be called after the spider has set its scrapy
        (crawler) object, because it reads ``self.crawler.settings``.
        """
        if not self.redis_key:
            self.redis_key = "%s:start_urls" % self.name
        settings = self.crawler.settings
        self.server = ConnectionFactory().create_redis_connection(settings)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        # self.scrapy.signals.connect(self.spider_closed, signal=signals.spider_closed)
        self.log("Reading URLs from redis list '%s'" % self.redis_key)
        self.paused = False

    def next_request(self):
        """Return a request to be scheduled, or None if the queue is empty.

        Entries are pickled dicts with keys 'url', 'cookies',
        'product_code' and 'link_hash' (as read below).
        """
        # REDIS_SET selects set (spop) vs list (lpop) queue semantics.
        use_set = self.settings.getbool("REDIS_SET")
        if use_set:
            url = self.server.spop(self.redis_key)
        else:
            url = self.server.lpop(self.redis_key)
        if url:
            # NOTE(review): pickle.loads on queue data executes arbitrary
            # code if the queue can be written by an attacker — confirm
            # the redis instance is trusted.
            t = pickle.loads(url)
            # print t['cookies']
            print t["link_hash"]
            print t["product_code"]
            cookie = ""
            if t["cookies"] is not None:
                print t["cookies"]
                if t["cookies"] != "":
                    # NOTE(review): eval of queue-supplied text runs
                    # arbitrary code; ast.literal_eval would be safer.
                    cookie = eval((t["cookies"]))
            return Request(
                t["url"],
                cookies=cookie,
                meta={"product_code": t["product_code"], "link_hash": t["link_hash"]},
                dont_filter=True,
            )
            # return self.make_requests_from_url(t['url'])

    def schedule_next_request(self):
        """Schedule a request if one is available from the queue."""
        req = self.next_request()
        if req:
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedule a request if available; always keep the spider open."""
        # if self.paused==False:
        # Pausing is checked via the engine flag, not self.paused.
        if not self.crawler.engine.paused:
            self.schedule_next_request()
        raise DontCloseSpider

    def item_scraped(self, *args, **kwargs):
        """Avoid waiting for the spider to idle before scheduling the next request."""
        self.schedule_next_request()

    def spider_pause(self):
        # Flag only; spider_idle consults the engine's paused flag instead.
        self.paused = True

    def spider_resume(self):
        # Clear the pause flag.
        self.paused = False
def __init__(self):
    """Create the kafka client for this object from its settings.

    Assumes ``self.settings`` is already populated — TODO confirm
    against the enclosing class (not visible here).
    """
    factory = ConnectionFactory()
    self.client = factory.create_kafka_connection(self.settings)