from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

# NOTE: module paths below are assumed from the project layout shown elsewhere
from model.config import DBSession
from model.rule import Rule
from spiders.baidu_spider import BaiduSpider
from spiders.iqiyi_spider import IqiyiSpider
from spiders.youku_spider import YoukuSpider
from spiders.douban_spider import DoubanSpider
from spiders.mgtv_spider import MgtvSpider

settings = Settings()

# crawl settings
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES", {
    # 'pipelines.DuplicatesPipeline': 200,
    # 'pipelines.CountDropPipline': 100,
    'pipelines.MysqlPipeline': 300,
})

process = CrawlerProcess(settings)

# load every enabled rule and dispatch it to the matching site-specific spider
db = DBSession()
rules = db.query(Rule).filter(Rule.status == 1)
for rule in rules:
    if rule.allowed_domains == 'v.baidu.com':
        process.crawl(BaiduSpider, rule)
    elif rule.allowed_domains == 'iqiyi.com':
        process.crawl(IqiyiSpider, rule)
    elif rule.allowed_domains == 'youku.com':
        process.crawl(YoukuSpider, rule)
    elif rule.allowed_domains == 'douban.com':
        process.crawl(DoubanSpider, rule)
    else:
        process.crawl(MgtvSpider, rule)

process.start()
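The Rule records driving the dispatch loop above come from model.rule, which is not shown here. A minimal SQLAlchemy sketch of what that table might look like, assuming only the columns the loop actually touches (status, allowed_domains) plus a start-URL field; the real model likely carries more fields (XPath rules, pagination, etc.):

# Hypothetical sketch of the Rule model assumed by the dispatch loop above.
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Rule(Base):
    __tablename__ = 'rules'

    id = Column(Integer, primary_key=True)
    name = Column(String(64))              # human-readable name of the crawl task
    allowed_domains = Column(String(128))  # e.g. 'v.baidu.com', used to pick the spider
    start_urls = Column(String(1024))      # entry URL(s) for the spider
    status = Column(Integer, default=0)    # 1 = enabled, matched by Rule.status == 1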
def start_requests(self):
    # pull every section from the database and schedule a request for it,
    # passing the section id to the parse callback through request meta
    db = DBSession()
    sections = db.query(ChangzhiServerSection).all()
    for section in sections:
        yield Request(section.url, self.parse, meta={'section_id': section.id})
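The section_id stashed in meta above is read back inside the callback. A minimal sketch of a matching parse method, assuming Scrapy's response.meta API; the extracted fields are illustrative only, not the original spider's:

def parse(self, response):
    # recover the section id that start_requests attached to this request
    section_id = response.meta['section_id']
    for href in response.css('a::attr(href)').extract():
        # illustrative only: associate every extracted link with its section
        yield {
            'section_id': section_id,
            'url': response.urljoin(href),
        }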
# -*- coding: utf-8 -*-
from spiders.deep_spider import DeepSpider
from model.config import DBSession
from model.rule import Rule
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

settings = Settings()

# crawl settings
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES", {
    'pipelines.DuplicatesPipeline': 200,
    # 'pipelines.CountDropPipline': 100,
    'pipelines.DataBasePipeline': 300,
})

process = CrawlerProcess(settings)

db = DBSession()
rules = db.query(Rule).filter(Rule.enable == 1)
for rule in rules:
    process.crawl(DeepSpider, rule)

process.start()
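pipelines.DuplicatesPipeline is enabled above but its code is not shown. A plausible implementation, following the duplicates-filter pattern from the Scrapy documentation and assuming each item carries a url field (an assumption, the project may key on another field):

# Sketch of a duplicates filter keyed on the item's url field;
# items whose url has already been seen in this run are dropped.
from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):

    def __init__(self):
        self.seen_urls = set()

    def process_item(self, item, spider):
        if item['url'] in self.seen_urls:
            raise DropItem("Duplicate item found: %s" % item['url'])
        self.seen_urls.add(item['url'])
        return item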
"USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36" ) settings.set( "ITEM_PIPELINES", { # 'pipelines.DuplicatesPipeline': 200, # 'pipelines.CountDropPipline': 100, 'pipelines.SeprojectPipeline': 300 }) # settings.set("LOG_LEVEL","INFO") # process = CrawlerProcess(settings) db = DBSession() rules = db.query(Rule).filter(Rule.enable == 1) for rule in rules: crawler = Crawler(settings) spider = BeijingSpider(rule) # instantiate every spider using rule RUNNING_CRAWLERS.append(spider) # stop reactor when spider closes crawler.signals.connect(spider_closing, signal=signals.spider_closed) crawler.configure() crawler.crawl(spider) crawler.start() # blocks process so always keep as the last statement reactor.run() # print(rule.starturl)
time_now = time.strftime("%Y-%m-%d %H:%M:%S")
print 'Current time:', time_now

# logging setup
# log.start(loglevel=log.DEBUG)

settings = Settings()
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES", {
    'wuchong.pipelines.DuplicatesPipeline': 200,
    # 'pipelines.CountDropPipline': 100,
    'wuchong.pipelines.DataBasePipeline': 300,
})

# bail out early if there is nothing to crawl
dbp = DBSession()
nump = dbp.query(Project).filter(Project.status == 1).distinct().count()
if nump == 0:
    print 'No runnable projects at the moment'
    dbp.commit()
    dbp.close()
    exit()

dbr = DBSession()
numr = dbr.query(Rules).filter(Rules.enable == 1).distinct().count()
if numr == 0:
    print 'No runnable rule sites at the moment'
    dbr.commit()
    dbr.close()
    exit()
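wuchong.pipelines.DataBasePipeline is enabled above but not listed here. A rough sketch of how such a pipeline could persist items through the same DBSession factory; the Article model, its import path, and the field names are placeholders, not the project's actual schema:

# Hypothetical persistence pipeline built on the project's DBSession factory.
from model.config import DBSession
from model.article import Article   # placeholder model; the real table differs

class DataBasePipeline(object):

    def open_spider(self, spider):
        self.db = DBSession()

    def process_item(self, item, spider):
        # map the scraped fields onto the ORM model and persist them
        self.db.add(Article(title=item.get('title'), url=item.get('url')))
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.db.close()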