def open_spider(self, spider):
    self.session = DBSession()
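# The methods below are a minimal sketch (not the project's actual code) of how
# such a pipeline usually uses the session opened above: stage each item during
# the crawl, then commit and close when the spider finishes. RawData is assumed
# to be the ORM model behind the `RawData` table that appears later.
def process_item(self, item, spider):
    # assumed: map the scraped item onto the RawData model and stage it
    self.session.add(RawData(**dict(item)))
    return item

def close_spider(self, spider):
    # assumed: flush everything in one transaction, then release the session
    self.session.commit()
    self.session.close()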
settings.set("USER_AGENT",
             "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES", {
    # 'pipelines.DuplicatesPipeline': 200,
    # 'pipelines.CountDropPipline': 100,
    'pipelines.SeprojectPipeline': 300,
})
# settings.set("LOG_LEVEL", "INFO")

# process = CrawlerProcess(settings)
db = DBSession()
rules = db.query(Rule).filter(Rule.enable == 1)
for rule in rules:
    crawler = Crawler(settings)
    spider = BeijingSpider(rule)  # instantiate every spider using rule
    RUNNING_CRAWLERS.append(spider)
    # stop reactor when spider closes
    crawler.signals.connect(spider_closing, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

# blocks process so always keep as the last statement
reactor.run()
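# All of these snippets lean on a shared SQLAlchemy layer (DBSession, engine,
# and models such as Rule) whose definitions are not shown in this section.
# The following is a rough sketch of what that layer presumably looks like; the
# connection URL, table name, and extra columns are assumptions, and only the
# Rule.enable column is taken from the query above.
from sqlalchemy import create_engine, Column, Integer
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine('mysql://user:password@localhost/seproject')  # placeholder URL
DBSession = sessionmaker(bind=engine)
Base = declarative_base()

class Rule(Base):
    __tablename__ = 'rule'      # assumed table name
    id = Column(Integer, primary_key=True)
    enable = Column(Integer)    # filtered with Rule.enable == 1 above
    # ...plus whatever per-spider configuration columns BeijingSpider reads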
def start_requests(self):
    # seed one request per section stored in the database, tagging each
    # request with the originating section's id
    db = DBSession()
    sections = db.query(ChangzhiServerSection).all()
    for section in sections:
        yield Request(section.url, self.parse, meta={'section_id': section.id})
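# The section_id stored in meta above survives the request/response cycle, so
# the parse callback can read it back from response.meta. A sketch of that
# pattern follows; the XPath selector and the parse_item callback are
# illustrative assumptions, not the project's actual parsing logic.
def parse(self, response):
    section_id = response.meta['section_id']  # id attached in start_requests
    for url in response.xpath('//a/@href').extract():  # illustrative selector
        yield Request(url, self.parse_item, meta={'section_id': section_id})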
def open_spider(self, spider):
    self.session = DBSession()
    # clear out rows left over from this spider's previous run
    sql2 = "DELETE FROM `RawData` WHERE `spider` = '" + spider.name + "'"
    engine.execute(sql2)
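# Building the DELETE statement by string concatenation is tolerable here
# because spider.name is not user input, but a bound parameter is the safer
# habit. A sketch of the equivalent cleanup with sqlalchemy.text, assuming the
# same engine object and the SQLAlchemy 1.x engine.execute() style used above:
from sqlalchemy import text

engine.execute(
    text("DELETE FROM `RawData` WHERE `spider` = :name"),
    name=spider.name,
)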