def gen_spider(spider, domain):
    # Generate a new spider module from the template, refusing to
    # overwrite an existing spider of the same name.
    pm = SpiderManager()
    if spider in pm.get_list():
        raise Exception(u"Spider %s already exists." % spider)
    default_filename = "spiders/%s.py" % spider
    with open("utils/spider.tpl") as t:
        content = t.read()
    content %= {'Spider': spider.title(), 'spider': spider, 'domain': domain}
    with open(default_filename, "w") as f:
        f.write(content)
    print u"Spider %s has been created in %s." % (spider, default_filename)
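
# For illustration only: gen_spider assumes utils/spider.tpl is a %-style
# template keyed by 'Spider', 'spider' and 'domain'. A minimal sketch of
# such a template (the real file may differ) could look like:
#
#   class %(Spider)sSpider(object):
#       name = "%(spider)s"
#       domain = "%(domain)s"
#       start_urls = ["http://%(domain)s/"]
#
#       def parse(self, response):
#           pass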
def install(self, options):
    self.event = Event()
    sm = SpiderManager()
    self.spider_class = sm.create(options.spider)

    # Redis keys for the per-spider crawl and schedule queues.
    spider_qk = "spider:q:%s" % self.spider_class.name
    schedule_qk = "schedule:q:%s" % self.spider_class.name

    self.log = Logger().getlog(level=logging.getLevelName(options.level))
    self.spider_queue = Queue(spider_qk, self.redis)
    self.schedule_queue = Queue(schedule_qk, self.redis)
    self.pipeline_queue = queue.Queue()

    # Duplicate-request filter, opened for this spider.
    self.fp = Dupefilter.from_crawler(self)
    self.fp.open(self.spider_class.name)

    self.forever = options.forever
    self.running = False

    # Additional start URLs supplied via options, if any.
    if options.urls:
        self.addition_urls = options.urls.split(',')
    else:
        self.addition_urls = []

    self.schedule = Schedule(self)
def list_spiders():
    sm = SpiderManager()
    print ', '.join(sm.get_list())
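
# Example usage (a sketch; assumes these helpers are invoked from a CLI
# entry point elsewhere in the project, and the spider name/domain below
# are placeholders):
#
#   gen_spider('books', 'books.example.com')  # writes spiders/books.py
#   list_spiders()                            # e.g. prints: books, news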