def __init__(self):
    if not settings.getbool('REDIRECT_ENABLED'):
        raise NotConfigured
    self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
    self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
    self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
def from_settings(cls, settings):
    cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
    cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
    cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
    cls.THUMBS = settings.get('IMAGES_THUMBS', {})
    cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD)
    cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD)
    store_uri = settings['IMAGES_STORE']
    return cls(store_uri)
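To make the hook above concrete, here is a minimal settings sketch for wiring such an images pipeline into a project; only the setting names come from the code above, the pipeline path and all values are illustrative:

# settings.py -- illustrative values; the pipeline path is hypothetical
ITEM_PIPELINES = ['myproject.pipelines.MyImagesPipeline']  # newer Scrapy uses a dict with an order value
IMAGES_STORE = '/tmp/images'    # local directory (or an s3:// URI, as in the variant below)
IMAGES_MIN_WIDTH = 110          # drop images narrower than this; 0 disables the check
IMAGES_MIN_HEIGHT = 110         # drop images shorter than this; 0 disables the check
IMAGES_EXPIRES = 90             # days before a downloaded image is considered stale
IMAGES_THUMBS = {'small': (50, 50), 'big': (270, 270)}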
def from_settings(cls, settings):
    cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
    cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
    cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
    cls.THUMBS = {}
    s3store = cls.STORE_SCHEMES['s3']
    s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
    s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
    store_uri = settings['IMAGES_STORE']
    return cls(store_uri)
def _print_setting(self, opts):
    if opts.get:
        print settings_.get(opts.get)
    elif opts.getbool:
        print settings_.getbool(opts.getbool)
    elif opts.getint:
        print settings_.getint(opts.getint)
    elif opts.getfloat:
        print settings_.getfloat(opts.getfloat)
    elif opts.getlist:
        print settings_.getlist(opts.getlist)
def __init__(self):
    self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
    self.itempassed = settings.getint('CLOSESPIDER_ITEMPASSED')
    self.counts = defaultdict(int)
    self.tasks = {}
    if self.timeout:
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
    if self.itempassed:
        dispatcher.connect(self.item_passed, signal=signals.item_passed)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
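A hedged configuration sketch for this close-spider extension; the two setting names are the ones the constructor reads, the values are examples only:

# settings.py -- example values
CLOSESPIDER_TIMEOUT = 3600      # close the spider after an hour of wall-clock time
CLOSESPIDER_ITEMPASSED = 10000  # ...or once this many items have passed through the pipeline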
def __init__(self): if not settings.getbool("MEMUSAGE_ENABLED"): raise NotConfigured if not procfs_supported(): raise NotConfigured self.warned = False self.notify_mails = settings.getlist("MEMUSAGE_NOTIFY") self.limit = settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024 self.warning = settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024 self.report = settings.getbool("MEMUSAGE_REPORT") self.mail = MailSender() dispatcher.connect(self.engine_started, signal=signals.engine_started) dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
def __init__(self):
    if not settings.getbool('MEMUSAGE_ENABLED'):
        raise NotConfigured
    if not os.path.exists('/proc'):
        raise NotConfigured
    self.warned = False
    self.notify_mails = settings.getlist('MEMUSAGE_NOTIFY')
    self.limit = settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
    self.warning = settings.getint('MEMUSAGE_WARNING_MB') * 1024 * 1024
    self.report = settings.getbool('MEMUSAGE_REPORT')
    self.mail = MailSender()
    dispatcher.connect(self.engine_started, signal=signals.engine_started)
    dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
def __init__(self):
    if not settings.getbool('MEMUSAGE_ENABLED'):
        raise NotConfigured
    if not procfs_supported():
        raise NotConfigured
    self.warned = False
    self.notify_mails = settings.getlist('MEMUSAGE_NOTIFY')
    self.limit = settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
    self.warning = settings.getint('MEMUSAGE_WARNING_MB') * 1024 * 1024
    self.report = settings.getbool('MEMUSAGE_REPORT')
    self.mail = MailSender()
    dispatcher.connect(self.engine_started, signal=signals.engine_started)
    dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
def __init__(self, settings):
    # Timestamp of the last direct (no-proxy) connection
    self.last_no_proxy_time = datetime.now()
    # Switch back to a direct connection after this many minutes, since going
    # through a proxy hurts speed
    self.recover_interval = 20
    # If a proxy keeps timing out before its usage count reaches this number,
    # remove it permanently. Set to 0 to never modify the proxy file.
    self.dump_count_threshold = 20
    # Whether to invalidate a proxy when it times out
    self.invalid_proxy_flag = True
    # When the number of valid proxies (including the direct connection) drops
    # below this threshold, fetch new proxies from the web. Size it so that each
    # IP gets enough rest after being asked for a captcha: if the crawler rotates
    # among, say, ten usable proxies, several minutes pass before each IP comes up
    # again, so it can serve a few requests without triggering a captcha. If the
    # pool is too small (say two proxies), IP A gets banned after a few requests,
    # then IP B likewise, and the whole crawler busy-waits, hurting throughput.
    self.extend_proxy_threshold = 5
    # Initialize the proxy list
    self.proxyes = [{"proxy": None, "valid": True, "count": 0}]
    # Start with proxy 0 (i.e. no proxy)
    self.proxy_index = 0
    # Number of trusted proxies (e.g. self-hosted HTTP proxies) + 1 (the direct,
    # proxy-less connection)
    self.fixed_proxy = len(self.proxyes)
    # Time new proxies were last fetched
    self.last_fetch_proxy_time = datetime.now()
    # Force-fetch new proxies at a fixed interval (minutes)
    self.fetch_proxy_interval = 120
    # A proxy about to be marked invalid is spared if it has already fetched
    # more pages than this
    self.invalid_proxy_threshold = 200
    # When the crawl starts, fetch proxy IPs on a separate thread first
    self.threadLock = threading.Lock()
    self.proxysStatus = 0  # 0: proxies not fetched, 1: fetching, 2: done fetching proxy IPs
    self.max_retry_times = settings.getint('RETRY_TIMES')
    # Remove a proxy after fail_count_threadhold failed fetches
    self.fail_count_threadhold = 3
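The constructor above only sets up state; as a sketch of how that state is typically consumed in a Scrapy downloader middleware (the body is assumed, not taken from this project), a process_request hook would attach the currently selected proxy to each outgoing request:

def process_request(self, request, spider):
    # Sketch only: pick the currently selected proxy; a None entry
    # (index 0 above) means connect directly without a proxy.
    proxy = self.proxyes[self.proxy_index]
    if proxy["proxy"]:
        request.meta["proxy"] = proxy["proxy"]
    else:
        request.meta.pop("proxy", None)
    proxy["count"] += 1  # counted against dump_count_threshold / invalid_proxy_threshold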
def __init__(self):
    if not settings.getbool('TELNETCONSOLE_ENABLED'):
        raise NotConfigured
    self.protocol = makeProtocol
    self.noisy = False
    port = settings.getint('TELNETCONSOLE_PORT')
    reactor.callWhenRunning(reactor.listenTCP, port, self)
def __init__(self):
    self.delay = settings.getint('SPIDER_CLOSE_DELAY')
    if not self.delay:
        raise NotConfigured
    self.opened_at = defaultdict(time)
    dispatcher.connect(self.spider_idle, signal=signals.spider_idle)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def __init__(self):
    if not settings.getbool('WEBCONSOLE_ENABLED'):
        raise NotConfigured
    logfile = settings['WEBCONSOLE_LOGFILE']
    server.Site.__init__(self, WebConsoleResource(), logPath=logfile)
    self.noisy = False
    port = settings.getint('WEBCONSOLE_PORT')
    reactor.callWhenRunning(reactor.listenTCP, port, self)
def __init__(self, smtphost=None, mailfrom=None, smtpuser=None, smtppass=None,
             smtpport=None):
    self.smtphost = smtphost or settings['MAIL_HOST']
    self.smtpport = smtpport or settings.getint('MAIL_PORT')
    self.smtpuser = smtpuser or settings['MAIL_USER']
    self.smtppass = smtppass or settings['MAIL_PASS']
    self.mailfrom = mailfrom or settings['MAIL_FROM']
    if not self.smtphost or not self.mailfrom:
        raise NotConfigured("MAIL_HOST and MAIL_FROM settings are required")
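Grounded in the call pattern visible in the PyMongo reconnect wrapper later in this section, a typical use of this sender looks like the following; the address and text are placeholders:

mailer = MailSender()  # falls back to the MAIL_* settings for host, port, user and sender
mailer.send(to=['ops@example.com'],
            subject='crawl finished',
            body='All spiders closed cleanly.')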
def __init__(self):
    self.max_queue_size = settings.getint("REQUESTS_QUEUE_SIZE")
    if not self.max_queue_size:
        raise NotConfigured
    self.max_pending = {}
    self.dropped_count = {}
    dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def __init__(self):
    self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
    self.itempassed = settings.getint('CLOSESPIDER_ITEMPASSED')
    self.pagecount = settings.getint('CLOSESPIDER_PAGECOUNT')
    self.errorcount = settings.getint('CLOSESPIDER_ERRORCOUNT')
    self.errorcounts = defaultdict(int)
    self.pagecounts = defaultdict(int)
    self.counts = defaultdict(int)
    self.tasks = {}
    if self.errorcount:
        txlog.addObserver(self.catch_log)
    if self.pagecount:
        dispatcher.connect(self.page_count, signal=signals.response_received)
    if self.timeout:
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
    if self.itempassed:
        dispatcher.connect(self.item_passed, signal=signals.item_passed)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def __init__(self):
    try:
        from redis import Redis
    except ImportError:
        raise NotConfigured
    # get settings
    queue = settings.get('REDIS_QUEUE')
    if queue is None:
        raise NotConfigured
    host = settings.get('REDIS_HOST', 'localhost')
    port = settings.getint('REDIS_PORT', 6379)
    db = settings.getint('REDIS_DB', 0)
    password = settings.get('REDIS_PASSWORD')
    self.redis = Redis(host=host, port=port, db=db, password=password)
    self.queue = queue
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
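A configuration sketch for the Redis hookup above; REDIS_QUEUE is the only required setting (the constructor raises NotConfigured without it), and the other values shown are just the defaults the code already falls back to:

# settings.py -- example values
REDIS_QUEUE = 'scrapy:items'  # required; name of the Redis key to use
REDIS_HOST = 'localhost'      # default
REDIS_PORT = 6379             # default
REDIS_DB = 0                  # default
REDIS_PASSWORD = None         # optional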
def wrapper(*args, **kwargs):
    max_attempts = settings.getint("MAX_MONGO_RECONNECT_ATTEMPTS",
                                   MAX_AUTO_RECONNECT_ATTEMPTS)
    mail = MailSender()
    for attempt in xrange(max_attempts):
        try:
            return mongo_op_func(*args, **kwargs)
        except AutoReconnect as e:
            wait_t = 1 + attempt  # linear back-off: wait 1s, 2s, 3s, ...
            log.msg("PyMongo auto-reconnecting... %s. Waiting %.1f seconds." % (str(e), wait_t),
                    log.INFO)
            mail.send(to=[settings.get('MAIL_TO')],
                      subject='PyMongo auto-reconnecting....',
                      body="%s\n%s" % (e, traceback.format_exc()))
            time.sleep(wait_t)
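The wrapper above closes over mongo_op_func, which implies an enclosing decorator. A minimal sketch of that outer function, under the assumption that the snippet's body goes where the comment sits (the decorator and example names are hypothetical):

import functools

def graceful_auto_reconnect(mongo_op_func):  # hypothetical name for the implied decorator
    @functools.wraps(mongo_op_func)
    def wrapper(*args, **kwargs):
        # ... retry/mail/sleep body exactly as in the snippet above ...
        return mongo_op_func(*args, **kwargs)
    return wrapper

@graceful_auto_reconnect
def save_item(collection, item):  # illustrative MongoDB write
    collection.insert(item)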
def __init__(self, smtphost=None, mailfrom=None, smtpuser=None, smtppass=None,
             smtpport=None, debug=False):
    self.smtphost = smtphost or settings['MAIL_HOST']
    self.smtpport = smtpport or settings.getint('MAIL_PORT')
    self.smtpuser = smtpuser or settings['MAIL_USER']
    self.smtppass = smtppass or settings['MAIL_PASS']
    self.mailfrom = mailfrom or settings['MAIL_FROM']
    self.debug = debug
    if not self.smtphost or not self.mailfrom:
        raise NotConfigured("MAIL_HOST and MAIL_FROM settings are required")
def __init__(self, crawler):
    self.crawler = crawler
    self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
    self.itemcount = settings.getint('CLOSESPIDER_ITEMCOUNT')
    # XXX: legacy support - remove for future releases
    if settings.getint('CLOSESPIDER_ITEMPASSED'):
        warnings.warn("CLOSESPIDER_ITEMPASSED setting is deprecated, "
                      "use CLOSESPIDER_ITEMCOUNT instead", ScrapyDeprecationWarning)
        self.itemcount = settings.getint('CLOSESPIDER_ITEMPASSED')
    self.pagecount = settings.getint('CLOSESPIDER_PAGECOUNT')
    self.errorcount = settings.getint('CLOSESPIDER_ERRORCOUNT')
    self.errorcounts = defaultdict(int)
    self.pagecounts = defaultdict(int)
    self.counts = defaultdict(int)
    self.tasks = {}
    if self.errorcount:
        txlog.addObserver(self.catch_log)
    if self.pagecount:
        dispatcher.connect(self.page_count, signal=signals.response_received)
    if self.timeout:
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
    if self.itemcount:
        dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def __init__(self):
    if not settings.getbool('WEBSERVICE_ENABLED'):
        raise NotConfigured
    logfile = settings['WEBSERVICE_LOGFILE']
    port = settings.getint('WEBSERVICE_PORT')
    root = RootResource()
    reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'],
                                   settings['WEBSERVICE_RESOURCES'])
    for res_cls in map(load_object, reslist):
        res = res_cls()
        root.putChild(res.ws_name, res)
    server.Site.__init__(self, root, logPath=logfile)
    self.noisy = False
    reactor.callWhenRunning(reactor.listenTCP, port, self)
def __init__(self, settings):
    self.settings = settings
    self.urifmt = settings['FEED_URI']
    if not self.urifmt:
        raise NotConfigured
    self.format = settings['FEED_FORMAT'].lower()
    self.export_encoding = settings['FEED_EXPORT_ENCODING']
    self.storages = self._load_components('FEED_STORAGES')
    self.exporters = self._load_components('FEED_EXPORTERS')
    if not self._storage_supported(self.urifmt):
        raise NotConfigured
    if not self._exporter_supported(self.format):
        raise NotConfigured
    self.store_empty = settings.getbool('FEED_STORE_EMPTY')
    self._exporting = False
    self.export_fields = settings.getlist('FEED_EXPORT_FIELDS') or None
    self.indent = None
    if settings.get('FEED_EXPORT_INDENT') is not None:
        self.indent = settings.getint('FEED_EXPORT_INDENT')
    uripar = settings['FEED_URI_PARAMS']
    self._uripar = load_object(uripar) if uripar else lambda x, y: None
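A hedged configuration sketch for the feed exporter above; the names are exactly the settings the constructor reads, the values are illustrative:

# settings.py -- example values
FEED_URI = 'file:///tmp/export/%(name)s-%(time)s.json'  # placeholders filled via FEED_URI_PARAMS
FEED_FORMAT = 'json'
FEED_EXPORT_ENCODING = 'utf-8'
FEED_EXPORT_INDENT = 2    # only read when set; otherwise output stays unindented
FEED_STORE_EMPTY = False  # skip writing a feed when nothing was scraped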
def __init__(self, download_delay=None, max_concurrent_requests=None):
    if download_delay is None:
        self._download_delay = settings.getfloat("DOWNLOAD_DELAY")
    else:
        self._download_delay = float(download_delay)
    if self._download_delay:
        self.max_concurrent_requests = 1
    elif max_concurrent_requests is None:
        self.max_concurrent_requests = settings.getint("CONCURRENT_REQUESTS_PER_SPIDER")
    else:
        self.max_concurrent_requests = max_concurrent_requests
    if self._download_delay and settings.getbool("RANDOMIZE_DOWNLOAD_DELAY"):
        # same policy as wget --random-wait
        self.random_delay_interval = (0.5 * self._download_delay,
                                      1.5 * self._download_delay)
    else:
        self.random_delay_interval = None
    self.active = set()
    self.queue = []
    self.transferring = set()
    self.closing = False
    self.lastseen = 0
    self.next_request_calls = set()
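The constructor above only stores the interval; as a hedged sketch (not from the snippet), the downloader would typically draw a uniform delay from it before each request, mirroring the wget --random-wait policy the comment cites:

import random

def _next_delay(self):
    # Sketch: uniform draw over (0.5*d, 1.5*d) when randomization is on,
    # otherwise the fixed DOWNLOAD_DELAY.
    if self.random_delay_interval:
        return random.uniform(*self.random_delay_interval)
    return self._download_delay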
def __init__(self):
    self.maxdepth = settings.getint('DEPTH_LIMIT')
    self.stats = settings.getbool('DEPTH_STATS')
    if self.stats and self.maxdepth:
        stats.set_value('envinfo/request_depth_limit', self.maxdepth)
def __init__(self):
    self.maxlength = settings.getint('URLLENGTH_LIMIT')
    if not self.maxlength:
        raise NotConfigured
def __init__(self, engine):
    self.sites = {}
    self.spidermw = SpiderMiddlewareManager()
    self.itemproc = load_object(settings['ITEM_PROCESSOR'])()
    self.concurrent_items = settings.getint('CONCURRENT_ITEMS')
    self.engine = engine
def __init__(self):
    self.max_retry_times = settings.getint('RETRY_TIMES')
    self.retry_http_codes = map(int, settings.getlist('RETRY_HTTP_CODES'))
    self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
def parse_weibo(self, response):
    query = response.request.meta['query']
    start = datetime.strptime(response.request.meta['start'], "%Y-%m-%d %H:%M:%S")
    end = datetime.strptime(response.request.meta['end'], "%Y-%m-%d %H:%M:%S")
    range = daterange(start, end).delta()  # note: shadows the builtin range
    last_fetched = datetime.strptime(response.request.meta['last_fetched'], "%Y-%m-%d %H:%M:%S")
    jQuery = pq(response.body)
    scripts = jQuery('script')
    text = "".join(filter(lambda x: x is not None, [x.text for x in scripts]))

    # check if we exceeded the sina limit
    sassfilter_match = re.search(r'{(\"pid\":\"pl_common_sassfilter\".*?)}', text, re.M | re.I)
    if sassfilter_match:
        raise CloseSpider('weibo search exceeded')

    # check the number of search results
    totalshow_match = re.search(r'{(\"pid\":\"pl_common_totalshow\".*?)}', text, re.M | re.I)
    if totalshow_match:
        html = json.loads(totalshow_match.group())['html']
        if len(html) == 0:
            raise CloseSpider('not login? %s' % html)
        totalshow = pq(html)
        if totalshow('div.topcon_l').html() is None:
            log.msg('%s 0 feeds' % query, level=log.INFO)
            return
        topcon_num = int(re.search(r'\s(\d+)\s',
                                   totalshow('div.topcon_l').text().replace(',', ''),
                                   re.I).group(1))
        log.msg('%s %d feeds' % (query, topcon_num), level=log.INFO)
        max_feeds = settings.getint('FEED_LIMIT', 200000)
        if topcon_num > max_feeds:
            log.msg('too many (%d) results for %s.' % (topcon_num, query), level=log.WARNING)
        elif 1000 < topcon_num < max_feeds:
            # weibo search only allows 20 feeds per page and at most 50 pages,
            # so split the time range in half and query each part separately
            days = range.days / float(2)
            middle = start + timedelta(days)
            # first half
            url = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), start, middle)
            request = Request(url=url, callback=self.parse_weibo)
            request.meta['query'] = query
            request.meta['start'] = start.strftime("%Y-%m-%d %H:%M:%S")
            request.meta['end'] = middle.strftime("%Y-%m-%d %H:%M:%S")
            request.meta['priority'] = days / 2
            request.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
            yield request
            # second half
            url2 = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), middle, end)
            request2 = Request(url=url2, callback=self.parse_weibo)
            request2.meta['query'] = query
            request2.meta['start'] = middle.strftime("%Y-%m-%d %H:%M:%S")
            request2.meta['end'] = end.strftime("%Y-%m-%d %H:%M:%S")
            request2.meta['priority'] = days / 2
            request2.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S")
            yield request2
        else:
            # few enough results: parse the feed list directly
            feedlist_match = re.search(r'{(\"pid\":\"pl_weibo_feedlist\".*?)}', text, re.M | re.I)
            if feedlist_match:
                search_results = pq(json.loads(feedlist_match.group())['html'])
                feeds = search_results('dl.feed_list')
                search_pages = search_results('ul.search_page_M')
                pages = SearchPage.wrap(search_pages)
                # send the items to the pipeline
                for feed in feeds:
                    item = ScrapyWeiboItem()
                    item['html'] = tostring(feed)
                    yield item
                # skip the first page and request the remaining pages
                for i in xrange(2, len(pages)):
                    query = pages[i]
                    log.msg('%s' % query)
                    request = Request(url=query, callback=self.parse_page)
                    request.meta['query'] = query
                    yield request
def __init__(self):
    self.sites = {}
    self.handlers = DownloadHandlers()
    self.middleware = DownloaderMiddlewareManager.from_settings(settings)
    self.concurrent_spiders = settings.getint('CONCURRENT_SPIDERS')
def __init__(self):
    self.sites = {}
    self.middleware = DownloaderMiddlewareManager()
    self.concurrent_spiders = settings.getint("CONCURRENT_SPIDERS")
from scrapy.selector import Selector
from scrapy.http import Request
#from scrapy.http import FormRequest
from scrapy.conf import settings
#from scrapy.shell import inspect_response
from scrapy.utils.response import get_base_url
# imports needed by the rules below (missing from the original snippet)
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from tase.HistorySpider import HistorySpider
from tase.items import TaseItem
import tase.common

PROCESS_HISTORY = settings.getbool('PROCESS_HISTORY', False)
HISTORY_PERIOD = settings.getint('HISTORY_PERIOD', 2)
category_fund = settings.get('CATEGORY_FUND')


class FundSpider(HistorySpider):
    name = 'funds'
    allowed_domains = ['tase.co.il']
    start_urls = ['http://www.tase.co.il/TASEEng/MarketData/MutualFunds/']

    rules = (
        Rule(SgmlLinkExtractor(allow=[r'BuildCmb_6_1.js']), callback='parse_fund_list'),
        Rule(SgmlLinkExtractor(allow=(r'FundMainData\.htm', )), callback='parse_fund'),
        Rule(SgmlLinkExtractor(allow=(r'MutualFunds', )), callback='parse_fund_search'),
    )
from __future__ import with_statement

import cPickle as pickle

from scrapy.xlib.pydispatch import dispatcher
from scrapy.core.engine import scrapyengine
from scrapy.core.exceptions import NotConfigured
from scrapy.core import signals
from scrapy.utils.response import response_httprepr
from scrapy.stats import stats
from scrapy.http import Request
from scrapy import log
from scrapy.conf import settings

items_per_spider = settings.getint("ITEMSAMPLER_COUNT", 1)
close_spider = settings.getbool("ITEMSAMPLER_CLOSE_SPIDER", False)
max_response_size = settings.getint("ITEMSAMPLER_MAX_RESPONSE_SIZE")  # a size, so getint, not getbool


class ItemSamplerPipeline(object):

    def __init__(self):
        self.filename = settings["ITEMSAMPLER_FILE"]
        if not self.filename:
            raise NotConfigured
        self.items = {}
        self.spiders_count = 0
        self.empty_domains = set()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
def __init__(self):
    if not settings.getbool('RETRY_ENABLED'):
        raise NotConfigured
    self.max_retry_times = settings.getint('RETRY_TIMES')
    self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
    self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
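A hedged configuration sketch matching this retry middleware; the names are the settings it reads, and the values are illustrative rather than the project's actual defaults:

# settings.py -- example values
RETRY_ENABLED = True
RETRY_TIMES = 2                               # retries per request, on top of the first attempt
RETRY_HTTP_CODES = [500, 502, 503, 504, 408]  # response codes worth retrying
RETRY_PRIORITY_ADJUST = -1                    # schedule retries at lower priority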
def __init__(self):
    self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
    self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
    self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
from __future__ import with_statement

import cPickle as pickle

from scrapy.xlib.pydispatch import dispatcher
from scrapy.core.manager import scrapymanager
from scrapy.core.exceptions import NotConfigured
from scrapy.core import signals
from scrapy.utils.response import response_httprepr
from scrapy.stats import stats
from scrapy.http import Request
from scrapy import log
from scrapy.conf import settings

items_per_spider = settings.getint('ITEMSAMPLER_COUNT', 1)
close_spider = settings.getbool('ITEMSAMPLER_CLOSE_SPIDER', False)
max_response_size = settings.getint('ITEMSAMPLER_MAX_RESPONSE_SIZE')  # a size, so getint, not getbool


class ItemSamplerPipeline(object):

    def __init__(self):
        self.filename = settings['ITEMSAMPLER_FILE']
        if not self.filename:
            raise NotConfigured
        self.items = {}
        self.spiders_count = 0
        self.empty_spiders = set()
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)