def _print_setting(self, opts):
    if opts.get:
        print settings_.get(opts.get)
    elif opts.getbool:
        print settings_.getbool(opts.getbool)
    elif opts.getint:
        print settings_.getint(opts.getint)
    elif opts.getfloat:
        print settings_.getfloat(opts.getfloat)
    elif opts.getlist:
        print settings_.getlist(opts.getlist)
def __init__(self):
    if not settings.getbool('MEMUSAGE_ENABLED'):
        raise NotConfigured
    if not procfs_supported():
        raise NotConfigured
    self.warned = False
    self.notify_mails = settings.getlist('MEMUSAGE_NOTIFY')
    self.limit = settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
    self.warning = settings.getint('MEMUSAGE_WARNING_MB') * 1024 * 1024
    self.report = settings.getbool('MEMUSAGE_REPORT')
    self.mail = MailSender()
    dispatcher.connect(self.engine_started, signal=signals.engine_started)
    dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
def __init__(self):
    if not settings.getbool('MEMUSAGE_ENABLED'):
        raise NotConfigured
    if not os.path.exists('/proc'):
        raise NotConfigured
    self.warned = False
    self.notify_mails = settings.getlist('MEMUSAGE_NOTIFY')
    self.limit = settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
    self.warning = settings.getint('MEMUSAGE_WARNING_MB') * 1024 * 1024
    self.report = settings.getbool('MEMUSAGE_REPORT')
    self.mail = MailSender()
    dispatcher.connect(self.engine_started, signal=signals.engine_started)
    dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
def __init__(self): if not settings.getbool("MEMUSAGE_ENABLED"): raise NotConfigured if not procfs_supported(): raise NotConfigured self.warned = False self.notify_mails = settings.getlist("MEMUSAGE_NOTIFY") self.limit = settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024 self.warning = settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024 self.report = settings.getbool("MEMUSAGE_REPORT") self.mail = MailSender() dispatcher.connect(self.engine_started, signal=signals.engine_started) dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
def __init__(self):
    if not settings.getbool('REDIRECT_ENABLED'):
        raise NotConfigured
    self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
    self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
    self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
def __init__(self):
    if not settings.getbool('TELNETCONSOLE_ENABLED'):
        raise NotConfigured
    self.protocol = makeProtocol
    self.noisy = False
    port = settings.getint('TELNETCONSOLE_PORT')
    reactor.callWhenRunning(reactor.listenTCP, port, self)
def get_exporter_and_file(self):
    format = settings['EXPORT_FORMAT']
    filename = settings['EXPORT_FILE']
    if not format or not filename:
        raise NotConfigured
    exp_kwargs = {
        'fields_to_export': settings.getlist('EXPORT_FIELDS') or None,
        'export_empty_fields': settings.getbool('EXPORT_EMPTY', False),
        'encoding': settings.get('EXPORT_ENCODING', 'utf-8'),
    }
    file = open(filename, 'wb')
    if format == 'xml':
        exp = exporter.XmlItemExporter(file, **exp_kwargs)
    elif format == 'csv':
        exp = exporter.CsvItemExporter(file, **exp_kwargs)
    elif format == 'csv_headers':
        exp = exporter.CsvItemExporter(file, include_headers_line=True, **exp_kwargs)
    elif format == 'pprint':
        exp = exporter.PprintItemExporter(file, **exp_kwargs)
    elif format == 'pickle':
        exp = exporter.PickleItemExporter(file, **exp_kwargs)
    elif format == 'json':
        exp = exporter.JsonItemExporter(file, **exp_kwargs)
    elif format == 'jsonlines':
        exp = exporter.JsonLinesItemExporter(file, **exp_kwargs)
    else:
        raise NotConfigured("Unsupported export format: %s" % format)
    return exp, file
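# Hypothetical sketch of driving the exporter returned by get_exporter_and_file();
# the surrounding pipeline context and the `items` iterable are assumptions, not
# part of the snippet above. Scrapy item exporters expose the
# start_exporting/export_item/finish_exporting methods used here.
exp, file = self.get_exporter_and_file()
exp.start_exporting()
for item in items:
    exp.export_item(item)
exp.finish_exporting()
file.close()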
def __init__(self, settings=settings):
    self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING', False)
    self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES', ['file'])
    self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES', []))
def __init__(self):
    if not settings.getbool('TELNETCONSOLE_ENABLED'):
        raise NotConfigured
    self.noisy = False
    self.portrange = map(int, settings.getlist('TELNETCONSOLE_PORT'))
    self.host = settings['TELNETCONSOLE_HOST']
    dispatcher.connect(self.start_listening, signals.engine_started)
    dispatcher.connect(self.stop_listening, signals.engine_stopped)
def start(logfile=None, loglevel=None, logstdout=None): """Initialize and start logging facility""" global log_level, started if started or not settings.getbool('LOG_ENABLED'): return log_level = _get_log_level(loglevel) started = True # set log observer if log.defaultObserver: # check twisted log not already started logfile = logfile or settings['LOG_FILE'] or settings['LOGFILE'] if logstdout is None: logstdout = settings.getbool('LOG_STDOUT') file = open(logfile, 'a') if logfile else sys.stderr log.startLogging(file, setStdout=logstdout)
def __init__(self):
    if not settings.getbool('AUTOTHROTTLE_ENABLED'):
        raise NotConfigured
    dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
    dispatcher.connect(self.response_received, signal=signals.response_received)
    self.last_latencies = {}
    self.last_lat = {}
def __init__(self):
    if not settings.getbool('WEBCONSOLE_ENABLED'):
        raise NotConfigured
    logfile = settings['WEBCONSOLE_LOGFILE']
    server.Site.__init__(self, WebConsoleResource(), logPath=logfile)
    self.noisy = False
    port = settings.getint('WEBCONSOLE_PORT')
    reactor.callWhenRunning(reactor.listenTCP, port, self)
def __init__(self):
    if not settings.getbool('ROBOTSTXT_OBEY'):
        raise NotConfigured
    self._parsers = {}
    self._spider_netlocs = {}
    self._useragents = {}
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def create_report(self, figures):
    s = ""
    s += "SCRAPY MEMORY DEBUGGER RESULTS\n\n"
    for f in figures:
        s += "%-30s : %d %s\n" % f
    if settings.getbool('TRACK_REFS'):
        s += os.linesep
        s += format_live_refs()
    return s
def __init__(self):
    if not settings.getbool('MEMDEBUG_ENABLED'):
        raise NotConfigured
    self.mail = MailSender()
    self.rcpts = settings.getlist('MEMDEBUG_NOTIFY')
    dispatcher.connect(self.engine_started, signals.engine_started)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)
class CookiesMiddleware(object):
    """This middleware enables working with sites that need cookies"""

    debug = settings.getbool('COOKIES_DEBUG')

    def __init__(self):
        if not settings.getbool('COOKIES_ENABLED'):
            raise NotConfigured
        self.jars = defaultdict(CookieJar)

    def process_request(self, request, spider):
        if 'dont_merge_cookies' in request.meta:
            return

        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        cookies = self._get_request_cookies(jar, request)
        for cookie in cookies:
            jar.set_cookie_if_ok(cookie, request)

        # set Cookie header
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
        self._debug_cookie(request, spider)

    def process_response(self, request, response, spider):
        if 'dont_merge_cookies' in request.meta:
            return response

        # extract cookies from Set-Cookie and drop invalid/expired cookies
        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        jar.extract_cookies(response, request)
        self._debug_set_cookie(response, spider)

        return response

    def _debug_cookie(self, request, spider):
        if self.debug:
            cl = request.headers.getlist('Cookie')
            if cl:
                msg = "Sending cookies to: %s" % request + os.linesep
                msg += os.linesep.join("Cookie: %s" % c for c in cl)
                log.msg(msg, spider=spider, level=log.DEBUG)

    def _debug_set_cookie(self, response, spider):
        if self.debug:
            cl = response.headers.getlist('Set-Cookie')
            if cl:
                msg = "Received cookies from: %s" % response + os.linesep
                msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
                log.msg(msg, spider=spider, level=log.DEBUG)

    def _get_request_cookies(self, jar, request):
        headers = {'Set-Cookie': ['%s=%s;' % (k, v)
                                  for k, v in request.cookies.iteritems()]}
        response = Response(request.url, headers=headers)
        cookies = jar.make_cookies(response, request)
        return cookies
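# Sketch of how a spider might exercise the middleware above; the URLs are
# placeholders and this usage is an assumption, not part of the snippet. Each
# distinct "cookiejar" meta value gets its own CookieJar in self.jars, and
# "dont_merge_cookies" makes the middleware skip the request entirely.
def start_requests(self):
    for i, url in enumerate(['http://example.com/a', 'http://example.com/b']):
        yield Request(url, meta={'cookiejar': i})
    yield Request('http://example.com/raw', meta={'dont_merge_cookies': True})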
def __init__(self):
    if not settings.getbool('SPIDERPROFILER_ENABLED'):
        raise NotConfigured
    try:
        get_vmvalue_from_procfs('VmSize')
    except RuntimeError:
        self._mem_tracking = False
    else:
        self._mem_tracking = True
    dispatcher.connect(self._request_received, signals.request_received)
def __init__(self, stats, settings=settings):
    # Required settings
    self.S3_ACCESS_KEY = settings.get('AWS_ACCESS_KEY_ID')
    self.S3_SECRET_KEY = settings.get('AWS_SECRET_ACCESS_KEY')
    self.S3_CACHE_BUCKET = settings.get('HISTORY_S3_BUCKET')

    # Optional settings
    self.use_proxy = settings.getbool('HISTORY_USE_PROXY', True)
    self.SAVE_SOURCE = settings.get('HISTORY_SAVE_SOURCE')

    self.stats = stats
def _response_downloaded(self, response, callback, cb_kwargs, follow):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item
    if follow and settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True):
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
def start(logfile=None, loglevel=None, logstdout=None):
    global started
    if started or not settings.getbool('LOG_ENABLED'):
        return
    started = True

    if log.defaultObserver:  # check twisted log not already started
        loglevel = _get_log_level(loglevel)
        logfile = logfile or settings['LOG_FILE']
        file = open(logfile, 'a') if logfile else sys.stderr
        if logstdout is None:
            logstdout = settings.getbool('LOG_STDOUT')
        sflo = ScrapyFileLogObserver(file, loglevel, settings['LOG_ENCODING'])
        _oldshowwarning = warnings.showwarning
        log.startLoggingWithObserver(sflo.emit, setStdout=logstdout)
        # restore warnings, wrongly silenced by Twisted
        warnings.showwarning = _oldshowwarning
        msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, settings['BOT_NAME']))
def __init__(self):
    super(SimpledbStatsCollector, self).__init__()
    self._sdbdomain = settings['STATS_SDB_DOMAIN']
    self._access_key = settings['AWS_ACCESS_KEY_ID']
    self._secret_key = settings['AWS_SECRET_ACCESS_KEY']
    self._async = settings.getbool('STATS_SDB_ASYNC')

    import boto
    self.connect_sdb = boto.connect_sdb
    self.connect_sdb(aws_access_key_id=self._access_key,
                     aws_secret_access_key=self._secret_key).create_domain(self._sdbdomain)
def __init__(self):
    try:
        import libxml2
        self.libxml2 = libxml2
    except ImportError:
        self.libxml2 = None
    if not settings.getbool('MEMDEBUG_ENABLED'):
        raise NotConfigured
    dispatcher.connect(self.engine_started, signals.engine_started)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)
def engine_stopped(self):
    if self.libxml2:
        self.libxml2.cleanupParser()
        stats.set_value('memdebug/libxml2_leaked_bytes', self.libxml2.debugMemory(1))
    gc.collect()
    stats.set_value('memdebug/gc_garbage_count', len(gc.garbage))
    if settings.getbool('TRACK_REFS'):
        for cls, wdict in live_refs.iteritems():
            if not wdict:
                continue
            stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict))
def __init__(self): if not settings.getbool("GROUPSETTINGS_ENABLED"): raise NotConfigured if command_executed and command_executed["name"] == "crawl": mod = __import__(settings["GROUPSETTINGS_MODULE"], {}, {}, [""]) args = command_executed["args"] if len(args) == 1 and not args[0].startswith("http://"): domain = args[0] settings.overrides.update(mod.default_settings) for group, domains in mod.group_spiders.iteritems(): if domain in domains: settings.overrides.update(mod.group_settings.get(group, {}))
def __init__(self):
    try:
        import libxml2
        self.libxml2 = libxml2
    except ImportError:
        self.libxml2 = None
    if not settings.getbool('MEMDEBUG_ENABLED'):
        raise NotConfigured
    self.mail = MailSender()
    self.rcpts = settings.getlist('MEMDEBUG_NOTIFY')
    dispatcher.connect(self.engine_started, signals.engine_started)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)
def __init__(self):
    if not settings.getbool('WEBSERVICE_ENABLED'):
        raise NotConfigured
    logfile = settings['WEBSERVICE_LOGFILE']
    port = settings.getint('WEBSERVICE_PORT')
    root = RootResource()
    reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'],
                                   settings['WEBSERVICE_RESOURCES'])
    for res_cls in map(load_object, reslist):
        res = res_cls()
        root.putChild(res.ws_name, res)
    server.Site.__init__(self, root, logPath=logfile)
    self.noisy = False
    reactor.callWhenRunning(reactor.listenTCP, port, self)
def _response_downloaded(self, response, callback, cb_kwargs, follow): """ This is were any response arrives, and were it's decided whether to extract links or not from it, and if it will be parsed or not. It returns a list of requests/items. """ if callback: cb_res = callback(response, **cb_kwargs) or () cb_res = self.process_results(response, cb_res) for requests_or_item in iterate_spider_output(cb_res): yield requests_or_item if follow and settings.getbool("CRAWLSPIDER_FOLLOW_LINKS", True): for request_or_item in self._requests_to_follow(response): yield request_or_item
def __init__(self, settings):
    self.settings = settings
    self.urifmt = settings['FEED_URI']
    if not self.urifmt:
        raise NotConfigured
    self.format = settings['FEED_FORMAT'].lower()
    self.storages = self._load_components('FEED_STORAGES')
    self.exporters = self._load_components('FEED_EXPORTERS')
    if not self._storage_supported(self.urifmt):
        raise NotConfigured
    if not self._exporter_supported(self.format):
        raise NotConfigured
    self.store_empty = settings.getbool('FEED_STORE_EMPTY')
    uripar = settings['FEED_URI_PARAMS']
    self._uripar = load_object(uripar) if uripar else lambda x, y: None
def __init__(self):
    if not settings.getbool('WEBSERVICE_ENABLED'):
        raise NotConfigured
    logfile = settings['WEBSERVICE_LOGFILE']
    self.portrange = map(int, settings.getlist('WEBSERVICE_PORT'))
    self.host = settings['WEBSERVICE_HOST']
    root = RootResource()
    reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'],
                                   settings['WEBSERVICE_RESOURCES'])
    for res_cls in map(load_object, reslist):
        res = res_cls()
        root.putChild(res.ws_name, res)
    server.Site.__init__(self, root, logPath=logfile)
    self.noisy = False
    dispatcher.connect(self.start_listening, signals.engine_started)
    dispatcher.connect(self.stop_listening, signals.engine_stopped)
def __init__(self):
    if settings.getbool('GIT_CACHE_ENABLED'):
        cachedir = settings['HTTPCACHE_DIR']
        if os.path.exists(cachedir):
            self.work_tree = cachedir
        else:
            self.work_tree = os.path.join(
                os.path.dirname(settings['PROJECT_ROOT']), '.scrapy', cachedir)
        self.basecmd = [
            'git',
            '--git-dir=%s' % os.path.join(self.work_tree, '.git'),
            '--work-tree=%s' % self.work_tree,
        ]
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
def __init__(self):
    self.urifmt = settings['FEED_URI']
    if not self.urifmt:
        raise NotConfigured
    self.format = settings['FEED_FORMAT'].lower()
    self.storages = self._load_components('FEED_STORAGES')
    self.exporters = self._load_components('FEED_EXPORTERS')
    if not self._storage_supported(self.urifmt):
        raise NotConfigured
    if not self._exporter_supported(self.format):
        raise NotConfigured
    self.store_empty = settings.getbool('FEED_STORE_EMPTY')
    uripar = settings['FEED_URI_PARAMS']
    self._uripar = load_object(uripar) if uripar else lambda x, y: None
    self.slots = {}
    dispatcher.connect(self.open_spider, signals.spider_opened)
    dispatcher.connect(self.close_spider, signals.spider_closed)
    dispatcher.connect(self.item_passed, signals.item_passed)
def __init__(self):
    self.urifmt = settings['FEED_URI']
    if not self.urifmt:
        raise NotConfigured
    self.format = settings['FEED_FORMAT'].lower()
    self.storages = self._load_components('FEED_STORAGES')
    self.exporters = self._load_components('FEED_EXPORTERS')
    if not self._storage_supported(self.urifmt):
        raise NotConfigured
    if not self._exporter_supported(self.format):
        raise NotConfigured
    self.store_empty = settings.getbool('FEED_STORE_EMPTY')
    uripar = settings['FEED_URI_PARAMS']
    self._uripar = load_object(uripar) if uripar else lambda x, y: None
    self.slots = {}
    dispatcher.connect(self.open_spider, signals.spider_opened)
    dispatcher.connect(self.close_spider, signals.spider_closed)
    dispatcher.connect(self.item_scraped, signals.item_scraped)
def __init__(self, settings=settings): history = settings.get("HISTORY", None) if not history: raise NotConfigured() # EPOCH: # == False: don't retrieve historical data # == True : retrieve most recent version # == datetime(): retrieve next version after datetime() self.epoch = self.parse_epoch(settings.get("EPOCH", False)) self.retrieve_if = load_object(history.get("RETRIEVE_IF", "history.logic.RetrieveNever"))(settings) self.store_if = load_object(history.get("STORE_IF", "history.logic.StoreAlways"))(settings) self.storage = load_object(history.get("BACKEND", "history.storage.S3CacheStorage"))(settings) self.ignore_missing = settings.getbool("HTTPCACHE_IGNORE_MISSING") dispatcher.connect(self.spider_opened, signal=signals.spider_opened) dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def __init__(self, settings):
    self.settings = settings
    self.urifmt = settings['FEED_URI']
    if not self.urifmt:
        raise NotConfigured
    self.format = settings['FEED_FORMAT'].lower()
    self.export_encoding = settings['FEED_EXPORT_ENCODING']
    self.storages = self._load_components('FEED_STORAGES')
    self.exporters = self._load_components('FEED_EXPORTERS')
    if not self._storage_supported(self.urifmt):
        raise NotConfigured
    if not self._exporter_supported(self.format):
        raise NotConfigured
    self.store_empty = settings.getbool('FEED_STORE_EMPTY')
    self._exporting = False
    self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
    self.indent = None
    if settings.get('FEED_EXPORT_INDENT') is not None:
        self.indent = settings.getint('FEED_EXPORT_INDENT')
    uripar = settings['FEED_URI_PARAMS']
    self._uripar = load_object(uripar) if uripar else lambda x, y: None
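# Illustrative settings for the FEED_* values read by the feed-export __init__
# methods above; the URI, format and field names are assumptions. The %(name)s
# and %(time)s placeholders follow the usual Scrapy FEED_URI substitution
# convention and are filled via the FEED_URI_PARAMS callable.
FEED_URI = 'file:///tmp/exports/%(name)s/%(time)s.json'
FEED_FORMAT = 'json'
FEED_STORE_EMPTY = False
FEED_EXPORT_FIELDS = ['title', 'price', 'url']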
def send(self, to, subject, body, cc=None, attachs=()):
    if attachs:
        msg = MIMEMultipart()
    else:
        msg = MIMENonMultipart('text', 'plain')
    msg['From'] = self.mailfrom
    msg['To'] = COMMASPACE.join(to)
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = subject
    rcpts = to[:]
    if cc:
        rcpts.extend(cc)
        msg['Cc'] = COMMASPACE.join(cc)

    if attachs:
        msg.attach(MIMEText(body))
        for attach_name, mimetype, f in attachs:
            part = MIMEBase(*mimetype.split('/'))
            part.set_payload(f.read())
            Encoders.encode_base64(part)
            part.add_header('Content-Disposition',
                            'attachment; filename="%s"' % attach_name)
            msg.attach(part)
    else:
        msg.set_payload(body)

    send_catch_log(signal=mail_sent, to=to, subject=subject, body=body,
                   cc=cc, attach=attachs, msg=msg)

    if settings.getbool('MAIL_DEBUG'):
        log.msg('Debug mail sent OK: To=%s Cc=%s Subject="%s" Attachs=%d' %
                (to, cc, subject, len(attachs)), level=log.DEBUG)
        return

    dfd = self._sendmail(rcpts, msg.as_string())
    dfd.addCallbacks(self._sent_ok, self._sent_failed,
                     callbackArgs=[to, cc, subject, len(attachs)],
                     errbackArgs=[to, cc, subject, len(attachs)])
    reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
    return dfd
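# Assumed usage of the send() method above; the addresses and file path are
# placeholders. Attachments are (name, mimetype, file object) tuples, matching
# how the loop above unpacks them.
mailer = MailSender()
mailer.send(to=['someone@example.com'],
            subject='Crawl finished',
            body='See attached report.',
            cc=['other@example.com'],
            attachs=[('report.txt', 'text/plain', open('report.txt', 'rb'))])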
def __init__(self, download_delay=None, max_concurrent_requests=None):
    if download_delay is None:
        self._download_delay = settings.getfloat("DOWNLOAD_DELAY")
    else:
        self._download_delay = float(download_delay)
    if self._download_delay:
        self.max_concurrent_requests = 1
    elif max_concurrent_requests is None:
        self.max_concurrent_requests = settings.getint("CONCURRENT_REQUESTS_PER_SPIDER")
    else:
        self.max_concurrent_requests = max_concurrent_requests
    if self._download_delay and settings.getbool("RANDOMIZE_DOWNLOAD_DELAY"):
        # same policy as wget --random-wait
        self.random_delay_interval = (0.5 * self._download_delay,
                                      1.5 * self._download_delay)
    else:
        self.random_delay_interval = None

    self.active = set()
    self.queue = []
    self.transferring = set()
    self.closing = False
    self.lastseen = 0
    self.next_request_calls = set()
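# Sketch (an assumption, not shown above) of how the next request's delay would
# typically be drawn from random_delay_interval, mirroring wget --random-wait:
# uniform between 0.5x and 1.5x the configured delay.
import random

def download_delay(self):
    if self.random_delay_interval:
        return random.uniform(*self.random_delay_interval)
    return self._download_delay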
def __init__(self):
    if not settings.getbool('COOKIES_ENABLED'):
        raise NotConfigured
    self.jars = defaultdict(CookieJar)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self):
    self._dump = settings.getbool('STATS_DUMP')
    self._stats = {None: {}}  # None is for global stats
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
#from scrapy.http import FormRequest
from scrapy.conf import settings
#from scrapy.shell import inspect_response
from scrapy.utils.response import get_base_url

from tase.HistorySpider import HistorySpider
from tase.items import TaseItem
import tase.common

PROCESS_HISTORY = settings.getbool('PROCESS_HISTORY', False)
HISTORY_PERIOD = settings.getint('HISTORY_PERIOD', 2)
category_fund = settings.get('CATEGORY_FUND')


class FundSpider(HistorySpider):
    name = 'funds'
    allowed_domains = ['tase.co.il']
    start_urls = ['http://www.tase.co.il/TASEEng/MarketData/MutualFunds/']

    rules = (
        Rule(SgmlLinkExtractor(allow=[r'BuildCmb_6_1.js']), callback='parse_fund_list'),
        Rule(SgmlLinkExtractor(allow=('FundMainData\.htm', )), callback='parse_fund'),
        Rule(SgmlLinkExtractor(allow=(r'MutualFunds', )),
def __init__(self):
    if not settings.getbool('COOKIES_ENABLED'):
        raise NotConfigured
    self.jars = defaultdict(CookieJar)
from tase.items import TaseItem
from tase.items import FinancialStatement
import urllib
from urlparse import urlparse
from urlparse import parse_qs
from urlparse import urljoin
import tase.common
from html2text import html2text
#import string
#import random

PROCESS_HISTORY = settings.getbool('PROCESS_HISTORY', False)
HISTORY_PERIOD = settings.getint('HISTORY_PERIOD', 2)  # 1 month
category_comp = settings.get('CATEGORY_COMP')
PROCESS_FINANCIAL_STATEMENTS = settings.getbool('PROCESS_FINANCIAL_STATEMENTS', False)


class StockSpider(HistorySpider):
    name = 'stocks'
    allowed_domains = ['tase.co.il']
    start_urls = [
        'http://www.tase.co.il/eng/marketdata/stocks/marketdata/Pages/MarketData.aspx'
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=('MarketData\.aspx', )),
class CookiesMiddleware(object):
    """This middleware enables working with sites that need cookies"""

    debug = settings.getbool('COOKIES_DEBUG')

    def __init__(self):
        self.jars = defaultdict(CookieJar)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def process_request(self, request, spider):
        if 'dont_merge_cookies' in request.meta:
            return

        jar = self.jars[spider]
        cookies = self._get_request_cookies(jar, request)
        for cookie in cookies:
            jar.set_cookie_if_ok(cookie, request)

        # set Cookie header
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
        self._debug_cookie(request)

    def process_response(self, request, response, spider):
        if 'dont_merge_cookies' in request.meta:
            return response

        # extract cookies from Set-Cookie and drop invalid/expired cookies
        jar = self.jars[spider]
        jar.extract_cookies(response, request)
        self._debug_set_cookie(response)

        return response

    def spider_closed(self, spider):
        self.jars.pop(spider, None)

    def _debug_cookie(self, request):
        """log Cookie header for request"""
        if self.debug:
            c = request.headers.get('Cookie')
            c = c and [p.split('=')[0] for p in c.split(';')]
            log.msg('Cookie: %s for %s' % (c, request.url), level=log.DEBUG)

    def _debug_set_cookie(self, response):
        """log Set-Cookie headers but exclude cookie values"""
        if self.debug:
            cl = response.headers.getlist('Set-Cookie')
            res = []
            for c in cl:
                kv, tail = c.split(';', 1)
                k = kv.split('=', 1)[0]
                res.append('%s %s' % (k, tail))
            log.msg('Set-Cookie: %s from %s' % (res, response.url))

    def _get_request_cookies(self, jar, request):
        headers = {'Set-Cookie': ['%s=%s;' % (k, v)
                                  for k, v in request.cookies.iteritems()]}
        response = Response(request.url, headers=headers)
        cookies = jar.make_cookies(response, request)
        return cookies
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.conf import settings
#from scrapy.shell import inspect_response
from scrapy import log

from tase.items import NewsArticle
import urllib
from urlparse import urlparse
from urlparse import parse_qs
#from urlparse import urljoin
import tase.common

PROCESS_NEWS = settings.getbool('PROCESS_NEWS', False)
PROCESS_NEWS_HISTORY = settings.getbool('PROCESS_NEWS_HISTORY', False)
PROCESS_NEWS_CONTENT = settings.getbool('PROCESS_NEWS_CONTENT', True)


class NewsSpider(CrawlSpider):
    name = 'news'
    allowed_domains = ['globes.co.il']
    start_urls = []

    rules = (
        #Rule(SgmlLinkExtractor(allow=(r'searchajax\.aspx\?',)), callback='parse_article_list'),
        Rule(SgmlLinkExtractor(allow=('searchajax\.aspx', )), callback='parse_article_list'),
        #Rule(SgmlLinkExtractor(allow=('\/en\/article-',)), callback='parse_article'),
    )
def __init__(self):
    self._dump = settings.getbool('STATS_DUMP')
    self._stats = {None: {}}  # None is for global stats
    dispatcher.connect(self._engine_stopped, signal=signals.engine_stopped)
from scrapy.statscol import DummyStatsCollector
from scrapy.conf import settings
from scrapy.utils.misc import load_object

# if stats are disabled use a DummyStatsCollector to improve performance
if settings.getbool('STATS_ENABLED'):
    stats = load_object(settings['STATS_CLASS'])()
else:
    stats = DummyStatsCollector()
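# Why the dummy collector is cheap: its mutating methods are no-ops, so callers
# can keep calling the stats API unconditionally. The method names follow the
# set_value() calls seen elsewhere in these snippets; the keys below are
# illustrative assumptions only.
stats.set_value('custom/start_time', 0)
stats.inc_value('custom/pages_seen')            # no-op when STATS_ENABLED is False
pages_seen = stats.get_value('custom/pages_seen')  # None under DummyStatsCollector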
def __init__(self, *args, **kwargs):
    kwargs['delimiter'] = settings.get('EXPORT_CSV_DELIMITER', '\001')
    kwargs['fields_to_export'] = settings.getlist('EXPORT_FIELDS') or None
    kwargs['encoding'] = settings.get('EXPORT_ENCODING', 'utf-8')
    super(AppinfoCsvExporter, self).__init__(*args, **kwargs)
    self.include_headers_line = settings.getbool('export_csv_headers', True)