# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES

# Bot identity and the package that holds the spiders.
BOT_NAME = 'topical-spiders'

SPIDER_MODULES = ['topical-spiders.spiders']
NEWSPIDER_MODULE = 'topical-spiders.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'topic (+http://www.yourdomain.com)'

# Crawl Frontier integration: register its scheduler middlewares on both the
# spider side and the downloader side, and use its scheduler.
SPIDER_MIDDLEWARES[
    'crawlfrontier.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware'
] = 1000
DOWNLOADER_MIDDLEWARES[
    'crawlfrontier.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware'
] = 1000

FRONTIER_SETTINGS = 'frontier.settings'
SCHEDULER = 'crawlfrontier.contrib.scrapy.schedulers.frontier.CrawlFrontierScheduler'

# Seed loader runs with order value 1.
SPIDER_MIDDLEWARES[
    'crawlfrontier.contrib.scrapy.middlewares.seeds.file.FileSeedLoader'
] = 1

# Built-in spider middlewares mapped to None (Scrapy treats a None value as
# "disabled").
# NOTE(review): the original text breaks off right after the UrlLengthMiddleware
# entry; any further entries of this update are not visible here.
SPIDER_MIDDLEWARES.update(dict.fromkeys(
    (
        'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
        'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware',
        'scrapy.contrib.spidermiddleware.referer.RefererMiddleware',
        'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware',
    ),
    None,
))
# Default headers for outgoing requests.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Item pipelines and their order values.
ITEM_PIPELINES = {
    'scrapy_spider.pipelines.MySQLStorePipeline': 300,
}

# Frontera's spider-side scheduler middleware.
# NOTE(review): SPIDER_MIDDLEWARES is not bound in this fragment; presumably it
# is imported or defined earlier in the file -- confirm.
SPIDER_MIDDLEWARES[
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware'
] = 1000

# Built-in spider middlewares mapped to None (Scrapy treats a None value as
# "disabled").
SPIDER_MIDDLEWARES.update(dict.fromkeys(
    (
        'scrapy.spidermiddleware.depth.DepthMiddleware',
        'scrapy.spidermiddleware.offsite.OffsiteMiddleware',
        'scrapy.spidermiddleware.referer.RefererMiddleware',
        'scrapy.spidermiddleware.urllength.UrlLengthMiddleware',
    ),
    None,
))

DOWNLOADER_MIDDLEWARES = {
    # Engine side
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # Downloader side
}
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Default headers for outgoing requests.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Item pipelines and their order values.
ITEM_PIPELINES = {
    'scrapy_spider.pipelines.MySQLStorePipeline': 300,
}

# Frontera's spider-side scheduler middleware.
# NOTE(review): SPIDER_MIDDLEWARES is not bound in this fragment; presumably it
# is imported or defined earlier in the file -- confirm.
SPIDER_MIDDLEWARES[
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware'
] = 1000

# Built-in spider middlewares mapped to None (Scrapy treats a None value as
# "disabled").
SPIDER_MIDDLEWARES.update(dict.fromkeys(
    (
        'scrapy.spidermiddleware.depth.DepthMiddleware',
        'scrapy.spidermiddleware.offsite.OffsiteMiddleware',
        'scrapy.spidermiddleware.referer.RefererMiddleware',
        'scrapy.spidermiddleware.urllength.UrlLengthMiddleware',
    ),
    None,
))

DOWNLOADER_MIDDLEWARES = {
    # Engine side
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # Downloader side
}

# Frontera
# NOTE(review): the original text breaks off right after this entry; any
# further entries of this update are not visible here.
DOWNLOADER_MIDDLEWARES[
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware'
] = 1000
# -*- coding: utf-8 -*-
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES

# Frontera integration: settings module, scheduler and scheduler middlewares.
FRONTERA_SETTINGS = 'bc.config.spider'
SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'

SPIDER_MIDDLEWARES[
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware'
] = 999
SPIDER_MIDDLEWARES[
    'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader'
] = 1
DOWNLOADER_MIDDLEWARES[
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware'
] = 999

# Bot identity and the package that holds the spiders.
BOT_NAME = 'bc'

SPIDER_MODULES = ['bc.spiders']
NEWSPIDER_MODULE = 'bc.spiders'

# Concurrency and download pacing: many requests overall, one per domain,
# no delay between requests and no randomisation of that delay.
CONCURRENT_REQUESTS = 256
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 0.0
DOWNLOAD_TIMEOUT = 180
RANDOMIZE_DOWNLOAD_DELAY = False

# Thread pool size and DNS timeout.
REACTOR_THREADPOOL_MAXSIZE = 30
DNS_TIMEOUT = 120

# Cookies and retries are both switched off.
COOKIES_ENABLED = False
RETRY_ENABLED = False
# -*- coding: utf-8 -*-
from scrapy.settings.default_settings import SPIDER_MIDDLEWARES, DOWNLOADER_MIDDLEWARES

# Frontera integration: settings module, scheduler and scheduler middlewares.
FRONTERA_SETTINGS = 'bc.config.spider'
SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'

SPIDER_MIDDLEWARES[
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware'
] = 999
SPIDER_MIDDLEWARES[
    'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader'
] = 1
DOWNLOADER_MIDDLEWARES[
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware'
] = 999

# Bot identity and the package that holds the spiders.
BOT_NAME = 'bc'

SPIDER_MODULES = ['bc.spiders']
NEWSPIDER_MODULE = 'bc.spiders'

# Concurrency and download pacing: many requests overall, one per domain,
# no delay between requests and no randomisation of that delay.
CONCURRENT_REQUESTS = 256
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 0.0
DOWNLOAD_TIMEOUT = 180
RANDOMIZE_DOWNLOAD_DELAY = False

# Thread pool size and DNS timeout.
REACTOR_THREADPOOL_MAXSIZE = 30
DNS_TIMEOUT = 120