def login(self, response):
    print("logging in...")
    return FormRequest.from_response(
        response,
        formdata={
            'username': Settings().get('MUUSERNAME'),
            'userpass': Settings().get('MUPASSWORD'),
        },
        callback=self.after_login,
        # Don't forget dont_filter; without it, after_login will not be called.
        dont_filter=True,
        meta=response.meta,
    )
def instancialize_database(self):
    if self.dbase is None:
        dbname = Settings().get('DBNAME')
        dbuser = Settings().get('DBUSERNAME')
        dbpass = Settings().get('DBPASSWORD')
        dbhost = Settings().get('DBHOST')
        dbport = Settings().get('DBPORT')
        self.dbase = database.Database(dbname, dbuser, dbpass, dbhost, dbport)
        if self.dbase.connect() is False:
            self.dbase = None
            raise SystemExit
def shub_conn():
    # Don't pass Settings().get('SH_API_KEY') as the default of os.environ.get():
    # the default would be evaluated eagerly, and `settings.SH_API_KEY` might not
    # be set up locally. The `or` below only falls back when the env var is unset.
    api_key = os.environ.get('SH_API_KEY') or Settings().get('SH_API_KEY')
    # NOTE: not really safe when `name` doesn't exist
    return ScrapinghubClient(api_key)
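# A minimal usage sketch of shub_conn(), assuming the python-scrapinghub client
# API; the project id 12345 below is a hypothetical placeholder.
client = shub_conn()
project = client.get_project(12345)  # hypothetical project id
print(project.key)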
def global_settings(namespace='project_settings'):
    """Unify Scrapy and Scrapinghub settings into one entrypoint.

    See the following for the namespaces of Scrapinghub settings:
    http://shub.readthedocs.io/en/stable/custom-images-contract.html#shub-settings

    As a side effect, it also tries to work locally, in order to have a smooth
    experience going back and forth between development and playground/production.

    NOTE 1: It turns out that shub doubly escapes settings before putting them
    into env['SHUB_SETTINGS']. This means that a '\n' in a setting is stored as
    '\\\\n' in 'SHUB_SETTINGS'. json.loads does a single unescape, so we need to
    unescape a second time (hence the 'unicode_escape' decode below).

    NOTE 2: A good way of not having to update every file each time we change
    how we import settings is to use it this way:

        from kp_scrapers.lib.services.shub import global_settings as Settings

    """
    raw_shub_settings = (
        os.environ.get('SHUB_SETTINGS', '{}').encode('utf-8').decode('unicode_escape')
    )
    # default to an empty mapping so update() is safe when the namespace is missing
    shub_settings = json.loads(raw_shub_settings).get(namespace, {})
    scrapy_settings = Settings().copy()

    # merge them, with Scrapinghub settings taking precedence
    scrapy_settings.update(shub_settings)

    return scrapy_settings
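# A minimal usage sketch of the import alias suggested in NOTE 2 above;
# 'SOME_SETTING' is a hypothetical key used purely for illustration.
from kp_scrapers.lib.services.shub import global_settings as Settings

some_value = Settings().get('SOME_SETTING')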
def run_single_spider(spider_name, cin):
    settings = Settings()
    process = CrawlerProcess(settings)
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    try:
        spider_object = spider_loader.load(spider_name, cin=cin)
        process.crawl(spider_object)
        process.start()
        # post_execution = SpiderPostExecution()
        # post_execution.engine_stopped([spider_object])
    except Exception as e:
        raise
def run_spiders():
    settings = Settings()
    process = CrawlerProcess(settings)
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)

    spiders = spider_loader.list()
    spider_objects = [spider_loader.load(name) for name in spiders]
    exec_report.number_of_scrapers = len(spider_objects)

    for spider in spider_objects:
        process.crawl(spider)
    process.start()

    post_execution = SpiderPostExecution()
    post_execution.engine_stopped(spider_objects)
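# A minimal sketch of how these helpers might be invoked. Only one of them can
# run per process, because process.start() blocks and the Twisted reactor cannot
# be restarted; the spider name and cin value are hypothetical placeholders.
if __name__ == '__main__':
    run_spiders()
    # run_single_spider('some_spider', cin='some-input')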
def __init__(self, useCache=True):
    # Initialize the required resources.
    # Scrapy needs to run inside the Twisted reactor -- start the process.
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    self.settings = {
        'DOWNLOAD_DELAY': 3,
        'CONCURRENT_REQUESTS': 20,
        'ROBOTSTXT_OBEY': False,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
        'AUTOTHROTTLE_ENABLED': True,
        'HTTPCACHE_ENABLED': False,  # enable to cache responses while testing
        'HTTPCACHE_EXPIRATION_SECS': 0,
        'TELNETCONSOLE_PORT': None,
        'RETRY_ENABLED': False,
        'REDIRECT_ENABLED': False,
        'COOKIES_ENABLED': False,
        'REACTOR_THREADPOOL_MAXSIZE': 20,
        'DOWNLOAD_TIMEOUT': 30,  # to avoid loss of entries?
        # Retry many times since proxies often fail
        'RETRY_TIMES': 10,
        # Retry on most error codes since proxies fail for different reasons
        'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 610,
            'random_useragent.RandomUserAgentMiddleware': 400,
            'rotating_proxies.middlewares.RotatingProxyMiddleware': 110,
            'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
        },
        'PROXY_LIST': PROXY_PATH,
        'PROXY_MODE': 0,
        'USER_AGENT_LIST': USER_PATH,
    }
    self.crawlRunner = CrawlerRunner(Settings(self.settings))
def runSpider(symbol):
    # Scrapy needs to run inside the Twisted reactor -- start the process.
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = Settings({
        'ITEM_PIPELINES': {
            'src.server.Crawler.JSONPipeline.JSONPipeline': 100,
            'src.server.Crawler.RedisPipeline.RedisPipeline': 200,
        },
        'DOWNLOAD_DELAY': 2,
        'ROBOTSTXT_OBEY': True,
        'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'AUTOTHROTTLE_ENABLED': True,
        'HTTPCACHE_ENABLED': False,
    })

    runner = CrawlerRunner(settings=settings)
    d = runner.crawl(NASDAQSpider, symbol=symbol)
    d.addBoth(lambda _: reactor.stop())  # callback to stop the reactor
    reactor.run()  # the script will block here until the crawling is finished
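# A hedged sketch of the same CrawlerRunner pattern extended to several crawls
# sharing one reactor run (runner.join() fires once all crawls finish); the
# ticker symbols and the reuse of `settings` from above are assumptions.
runner = CrawlerRunner(settings=settings)
for symbol in ('AAPL', 'MSFT'):
    runner.crawl(NASDAQSpider, symbol=symbol)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()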
# -*- coding: utf-8 -*-
import scrapy
import os
import json

from ..items import get_item_and_loader
from scrapy.utils.project import get_project_settings as Settings

settings = Settings()


class BibleSpider(scrapy.Spider):
    name = "bible"
    allowed_domains = ["www.audiobible.com"]
    start_urls = []

    def __init__(self,
                 data_store=settings.get('DATA_STORE'),
                 content_file=settings.get('CONTENT_FILE')):
        if data_store and content_file:
            p = os.path.join(data_store, content_file)
            if os.path.exists(p) and os.stat(p).st_size != 0:
                with open(p) as f:
                    for line in f:
                        data = json.loads(line)
                        if data['urls']:
                            for url in data['urls']:
                                self.start_urls.append(url)
        else:
            self.start_urls.append('http://www.audiobible.com/bible/bible.html')
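# A sketch of the JSON-lines record BibleSpider expects in CONTENT_FILE, based
# on the 'urls' key read above; 'content.jl' is a hypothetical file name and
# the URL reuses the spider's own default start page.
import json

record = {'urls': ['http://www.audiobible.com/bible/bible.html']}
with open('content.jl', 'a') as f:
    f.write(json.dumps(record) + '\n')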
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request
from scrapy.selector import Selector
from decouple import config
import re
import json
import requests
from urlparse import urljoin

from scrapy.utils.project import get_project_settings as Settings

STGS = Settings()

CITY_FROM = 'REC'
CITY_TO = 'RIO'
NUMBER_ADULTS = 2
DATE_GO = '2014-09-20'
DATE_BACK = '2014-09-25'


class JSCallError(Exception):

    def __init__(self, message):
        super(JSCallError, self).__init__(message)
        self.message = u'Error in the function call.'


def response_to_file(name, response):
    with open(name, 'wb') as f:
        f.write(response.body)
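# A minimal sketch of using response_to_file() from inside a spider callback to
# dump a page for offline inspection; the callback name and the output file
# name are hypothetical.
def parse_search(self, response):
    response_to_file('search_results.html', response)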
runner = CrawlerRunner(settings=Settings({
    'DOWNLOAD_DELAY': 3,
    'CONCURRENT_REQUESTS': 20,
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': False,  # enable to cache responses while testing
    'HTTPCACHE_EXPIRATION_SECS': 0,
    'TELNETCONSOLE_PORT': None,
    'RETRY_ENABLED': False,
    'REDIRECT_ENABLED': True,
    'COOKIES_ENABLED': False,
    'REACTOR_THREADPOOL_MAXSIZE': 20,
    'DOWNLOAD_TIMEOUT': 30,  # to avoid loss of entries?
    # Retry many times since proxies often fail
    'RETRY_TIMES': 10,
    # Retry on most error codes since proxies fail for different reasons
    'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 610,
        'random_useragent.RandomUserAgentMiddleware': 400,
        'rotating_proxies.middlewares.RotatingProxyMiddleware': 110,
        'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
    },
    'PROXY_LIST': PROXY_PATH,
    'PROXY_MODE': 0,
    'USER_AGENT_LIST': USER_PATH,
}))