def __init__(self, spidercls, settings):
    if isinstance(settings, dict):
        settings = Settings(settings)
    self.spidercls = spidercls
    self.settings = settings.copy()
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    self.signals.connect(lambda: logging.root.removeHandler(handler),
                         signals.engine_stopped)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)
    self.spidercls.update_settings(self.settings)
    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def create_root(config):
    from scrapy import log
    from scrapy.settings import Settings
    from slyd.crawlerspec import (CrawlerSpecManager,
                                  create_crawler_spec_resource)
    from slyd.bot import create_bot_resource
    import slyd.settings
    from slyd.projects import ProjectsResource

    root = Resource()
    root.putChild("static", File(config['docroot']))

    crawler_settings = Settings()
    crawler_settings.setmodule(slyd.settings)
    spec_manager = CrawlerSpecManager(crawler_settings)

    # add project management at /projects
    projects = ProjectsResource(crawler_settings)
    root.putChild('projects', projects)

    # add crawler at /projects/PROJECT_ID/bot
    log.msg("Slybot specs loading from %s/[PROJECT]" % spec_manager.basedir,
            level=log.DEBUG)
    projects.putChild("bot", create_bot_resource(spec_manager))

    # add spec at /projects/PROJECT_ID/spec
    spec = create_crawler_spec_resource(spec_manager)
    projects.putChild("spec", spec)
    return root
def get_fetch(log=False):
    settings = Settings()
    settings.set('LOG_ENABLED', log)
    crawler_process = CrawlerProcess(settings)
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()
    t = Thread(target=crawler_process.start_reactor)
    t.daemon = True
    t.start()
    shell = Shell(crawler)
    shell.code = 'adsf'
    import threading
    lock = threading.Lock()

    def fetch(url_or_request):
        lock.acquire()
        try:
            shell.fetch(url_or_request)
            response = shell.vars.get('response')
            return response
        finally:
            lock.release()

    return fetch
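# A minimal usage sketch for the get_fetch() helper above. It assumes the same
# (older) Scrapy API the helper itself relies on; the URL is only an
# illustrative example.
fetch = get_fetch(log=True)
response = fetch('http://example.com')
if response is not None:
    print(response.status, response.url)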
def test_from_settings_constructs_middleware_with_the_specified_settings():
    settings = Settings()
    settings.set('HTML_STORAGE', {'test': 'settings'})

    downloader = HtmlStorageMiddleware.from_settings(settings)

    assert_that(downloader.settings, is_({'test': 'settings'}))
def make_downloader(save_html_on_codes=[]):
    settings = Settings()
    settings.set('HTML_STORAGE', {
        'gzip_output': True,
        'save_html_on_codes': save_html_on_codes
    })
    return HtmlStorageMiddleware(settings)
def get_project_settings():
    scrapy_module = "uris.urispider.settings"
    settings = Settings()
    settings.setmodule(scrapy_module)
    return settings
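# A minimal usage sketch for get_project_settings() above; UriSpider is an
# assumed spider class used purely for illustration.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(get_project_settings())
process.crawl(UriSpider)
process.start()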
def create_spec_manager(projects_dir=None):
    """Create a CrawlerSpecManager configured to use test settings"""
    crawler_settings = ScrapySettings()
    crawler_settings.setmodule(test_settings)
    projects_dir = projects_dir or test_settings.SPEC_DATA_DIR
    test_settings.SPEC_FACTORY['PARAMS']['location'] = projects_dir
    return SpecManager(crawler_settings)
def __init__(self, spidercls, settings=None):
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    self.spidercls = spidercls
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)

    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)

    handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)

    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.extensions = ExtensionManager.from_crawler(self)

    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
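# A minimal sketch of driving this Crawler constructor directly with a plain
# dict of settings; MySpider is an assumed spider class used only for
# illustration.
from scrapy.crawler import Crawler

crawler = Crawler(MySpider, settings={'LOG_LEVEL': 'INFO'})
print(crawler.settings['LOG_LEVEL'])  # the dict was wrapped in Settings, copied and frozen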
def test_autopromote_dicts(self):
    settings = Settings()
    mydict = settings.get('TEST_DICT')
    self.assertIsInstance(mydict, BaseSettings)
    self.assertIn('key', mydict)
    self.assertEqual(mydict['key'], 'val')
    self.assertEqual(mydict.getpriority('key'), 0)
def test_getdict_autodegrade_basesettings(self):
    settings = Settings()
    mydict = settings.getdict('TEST_DICT')
    self.assertIsInstance(mydict, dict)
    self.assertEqual(len(mydict), 1)
    self.assertIn('key', mydict)
    self.assertEqual(mydict['key'], 'val')
class FilesPipelineTestCaseCustomSettings(unittest.TestCase):

    def setUp(self):
        self.tempdir = mkdtemp()
        self.pipeline = FilesPipeline(self.tempdir)
        self.default_settings = Settings()

    def tearDown(self):
        rmtree(self.tempdir)

    def test_expires(self):
        another_pipeline = FilesPipeline.from_settings(
            Settings({'FILES_STORE': self.tempdir,
                      'FILES_EXPIRES': 42}))
        self.assertEqual(self.pipeline.expires,
                         self.default_settings.getint('FILES_EXPIRES'))
        self.assertEqual(another_pipeline.expires, 42)

    def test_files_urls_field(self):
        another_pipeline = FilesPipeline.from_settings(
            Settings({'FILES_STORE': self.tempdir,
                      'FILES_URLS_FIELD': 'funny_field'}))
        self.assertEqual(self.pipeline.files_urls_field,
                         self.default_settings.get('FILES_URLS_FIELD'))
        self.assertEqual(another_pipeline.files_urls_field, 'funny_field')

    def test_files_result_field(self):
        another_pipeline = FilesPipeline.from_settings(
            Settings({'FILES_STORE': self.tempdir,
                      'FILES_RESULT_FIELD': 'funny_field'}))
        self.assertEqual(self.pipeline.files_result_field,
                         self.default_settings.get('FILES_RESULT_FIELD'))
        self.assertEqual(another_pipeline.files_result_field, 'funny_field')
def __init__(self, store_uri, download_func=None, settings=None):
    if not store_uri:
        raise NotConfigured

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    cls_name = "FilesPipeline"
    self.store = self._get_store(store_uri)
    resolve = functools.partial(self._key_for_pipe,
                                base_class_name=cls_name,
                                settings=settings)
    self.expires = settings.getint(
        resolve('FILES_EXPIRES'), self.EXPIRES
    )
    if not hasattr(self, "FILES_URLS_FIELD"):
        self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
    if not hasattr(self, "FILES_RESULT_FIELD"):
        self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
    self.files_urls_field = settings.get(
        resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
    )
    self.files_result_field = settings.get(
        resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
    )

    super(FilesPipeline, self).__init__(download_func=download_func)
def _crawl(self):
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        'app.pipelines.JsonWriterPipeline': 300
    })
    self.process = CrawlerProcess(settings)
    self.process.crawl(self, self.name, self.start_urls)
    self.process.start()
def test_contructor_sets_default_settings_values_when_no_settings_are_specified(
        setting_name, expected):
    settings = Settings()
    settings.set('HTML_STORAGE', {})

    downloader = HtmlStorageMiddleware(settings)

    assert_that(downloader.__dict__[setting_name], is_(expected))
def __init__(self, spider):
    Process.__init__(self)
    setting = Settings()
    setting.setmodule(s)
    self.crawler = Crawler(setting)
    self.crawler.configure()
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.spider = spider
def __init__(self, settings):
    if isinstance(settings, dict):
        settings = Settings(settings)
    self.settings = settings
    smcls = load_object(settings['SPIDER_MANAGER_CLASS'])
    verifyClass(ISpiderManager, smcls)
    self.spiders = smcls.from_settings(settings.frozencopy())
    self.crawlers = set()
    self._active = set()
def runSpider(self, spider):
    configure_logging(
        {'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})
    settings = Settings()
    settings.set('FEED_URI', 'output.json')
    settings.set('FEED_FORMAT', 'json')

    runner = CrawlerRunner(settings)
    dfd = runner.crawl(spider)
    dfd.addBoth(lambda _: reactor.stop())
def __init__(self):
    sets = Settings()
    sets.setmodule(settings, priority='project')
    connection = pymongo.MongoClient(
        sets['MONGODB_SERVER'],
        sets['MONGODB_PORT']
    )
    db = connection[sets['MONGODB_DB']]
    self.collection = db[sets['MONGODB_COLLECTION']]
def create_root(config, settings_module):
    from scrapy.settings import Settings
    from .specmanager import SpecManager
    from .authmanager import AuthManager
    from .projectspec import create_project_resource

    from slyd.bot import create_bot_resource
    from slyd.projects import create_projects_manager_resource
    from slyd.splash.ferry import (FerryServerProtocol, FerryServerFactory,
                                   create_ferry_resource)
    from slyd.splash.proxy import ProxyResource

    root = Resource()
    static = Resource()
    for file_name in listdir(config['docroot']):
        file_path = join(config['docroot'], file_name)
        if isfile(file_path):
            static.putChild(file_name, File(file_path))
    static.putChild('main.html', File(join(config['docroot'], 'index.html')))
    root.putChild('static', static)
    root.putChild('assets', File(join(config['docroot'], 'assets')))
    root.putChild('fonts', File(join(config['docroot'], 'assets', 'fonts')))
    root.putChild('', File(join(config['docroot'], 'index.html')))

    settings = Settings()
    settings.setmodule(settings_module)
    spec_manager = SpecManager(settings)

    # add server capabilities at /server_capabilities
    capabilities = Capabilities(spec_manager)
    root.putChild('server_capabilities', capabilities)

    # add projects manager at /projects
    projects = create_projects_manager_resource(spec_manager)
    root.putChild('projects', projects)

    # add crawler at /projects/PROJECT_ID/bot
    projects.putChild('bot', create_bot_resource(spec_manager))

    # add project spec at /projects/PROJECT_ID/spec
    spec = create_project_resource(spec_manager)
    projects.putChild('spec', spec)

    # add websockets for communicating with splash
    factory = FerryServerFactory("ws://127.0.0.1:%s" % config['port'],
                                 debug=False, assets=config['docroot'])
    factory.protocol = FerryServerProtocol
    factory.setProtocolOptions(allowHixie76=True)
    websocket = create_ferry_resource(spec_manager, factory)
    root.putChild("ws", websocket)

    root.putChild('proxy', ProxyResource())

    auth_manager = AuthManager(settings)
    return auth_manager.protectResource(root)
def get_project_settings(module=None, custom_settings=None):
    crawler_settings = Settings()
    if module is None:
        module = settings.PROJECT_SETTINGS
    crawler_settings.setmodule(module, priority='project')
    if custom_settings:
        assert isinstance(custom_settings, dict)
        crawler_settings.setdict(custom_settings, priority='cmdline')
    return crawler_settings
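# A minimal usage sketch for get_project_settings() above; the module path and
# the overridden value are illustrative assumptions, not part of the source.
crawler_settings = get_project_settings(
    module='myproject.settings',
    custom_settings={'DOWNLOAD_DELAY': 1.5},
)
# the 'cmdline' priority of custom_settings wins over the module's 'project' priority
assert crawler_settings.getfloat('DOWNLOAD_DELAY') == 1.5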
def import_from_old(request):
    from scrapy.settings import Settings
    from scrapy.crawler import CrawlerProcess
    from yurasic_spider import SongSpider

    settings = Settings()
    settings.setmodule('yurasic_spider.settings', priority='project')
    crawler = CrawlerProcess(settings)
    crawler.crawl(SongSpider)
    crawler.start()
def test_scrapy_spider():
    settings = Settings()
    settings.setmodule("tests.scrapy_spider.settings")
    crawler = Crawler(MySpider, settings=settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
    stats = crawler.stats.spider_stats["example"]
    assert stats["frontera/crawled_pages_count"] == 5
    assert crawler.spider.callback_calls > 0
def __init__(self, spider):
    Process.__init__(self)
    setting = Settings()
    setting.setmodule(settings, 1)
    self.crawler = Crawler(setting)
    if not hasattr(project, 'crawler'):
        self.crawler.configure()
        self.crawler.signals.connect(reactor.stop,
                                     signal=signals.spider_closed)
    self.spider = spider
def test_update_settings(self):
    spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
    project_settings = {'TEST1': 'project', 'TEST3': 'project'}
    self.spider_class.custom_settings = spider_settings
    settings = Settings(project_settings, priority='project')

    self.spider_class.update_settings(settings)
    self.assertEqual(settings.get('TEST1'), 'spider')
    self.assertEqual(settings.get('TEST2'), 'spider')
    self.assertEqual(settings.get('TEST3'), 'project')
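# A short sketch of the Settings priority behaviour the test above relies on;
# the setting name and values are made up for illustration.
from scrapy.settings import Settings

s = Settings({'TEST1': 'project'}, priority='project')
s.set('TEST1', 'spider', priority='spider')    # higher priority overrides
s.set('TEST1', 'ignored', priority='default')  # lower priority is ignored
assert s.get('TEST1') == 'spider'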
def _construct_scraper_settings():
    """Construct settings for scraper.

    Method constructs settings from default scrapy settings and augments
    them from loaded `CONFIGS`.
    """
    if "scraper" in CONFIGS.keys():
        global SCRAPER_SETTINGS
        SCRAPER_SETTINGS = Settings()
        SCRAPER_SETTINGS.setdict(CONFIGS["scraper"])
def __init__(self):
    settings = Settings()
    settings.setdict({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'FEED_FORMAT': 'json',
        'DOWNLOAD_DELAY': 2,
        'REACTOR_THREADPOOL_MAXSIZE': 2,
        'LOG_LEVEL': 'WARNING'
    }, priority='project')

    self.process = CrawlerProcess(settings=settings)
def create_spider():
    custom_settings = Settings()
    custom_settings.setmodule(settings)
    crawler = Crawler(
        spidercls=desy_spider.DesySpider,
        settings=custom_settings,
    )
    return desy_spider.DesySpider.from_crawler(
        crawler,
        source_folder='idontexist_but_it_does_not_matter',
    )
def start(self):
    settings = Settings()

    # crawl responsibly
    settings.set("USER_AGENT", "test")
    crawler_obj = Spider()
    crawler = Crawler(crawler_obj, settings)

    # stop reactor when spider closes
    crawler.signals.connect(self.stop, signal=signals.spider_closed)
    crawler.crawl()
def __init__(self, download_func=None, settings=None):
    self.download_func = download_func

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    resolve = functools.partial(self._key_for_pipe,
                                base_class_name="MediaPipeline",
                                settings=settings)
    self.allow_redirects = settings.getbool(
        resolve('MEDIA_ALLOW_REDIRECTS'), False
    )
    self._handle_statuses(self.allow_redirects)
def get_summaries_collection():
    import pymongo
    from scraper.guardianukscraper import settings
    from scrapy.settings import Settings

    sets = Settings()
    sets.setmodule(settings, priority='project')
    connection = pymongo.MongoClient(
        sets['MONGODB_SERVER'],
        sets['MONGODB_PORT']
    )
    db = connection[sets['MONGODB_DB']]
    return db[sets['MONGODB_COLLECTION']]
def setUp(self):
    s3reqh = S3DownloadHandler(Settings(), self.AWS_ACCESS_KEY_ID,
                               self.AWS_SECRET_ACCESS_KEY,
                               httpdownloadhandler=HttpDownloadHandlerMock)
    self.download_request = s3reqh.download_request
    self.spider = Spider('foo')
from datetime import datetime
from os.path import dirname, join

import pytest
from city_scrapers_core.constants import COMMISSION, PASSED
from city_scrapers_core.utils import file_response
from freezegun import freeze_time
from scrapy.settings import Settings

from city_scrapers.spiders.chi_board_elections import ChiBoardElectionsSpider

test_response = file_response(
    join(dirname(__file__), "files", "chi_board_elections.html"),
    url='https://app.chicagoelections.com/pages/en/board-meetings.aspx')
spider = ChiBoardElectionsSpider()
spider.settings = Settings(values={"CITY_SCRAPERS_ARCHIVE": False})

freezer = freeze_time('2018-11-30')
freezer.start()
parsed_items = [item for item in spider._next_meeting(test_response)]
freezer.stop()


def test_title():
    assert parsed_items[0]['title'] == 'Electoral Board'


def test_description():
    assert parsed_items[0]['description'] == ''
class FerryServerProtocol(WebSocketServerProtocol):

    _handlers = {
        'load': 'load_page',
        'interact': 'interact_page'
    }
    assets = './'
    settings = Settings()

    @property
    def tab(self):
        return self.factory[self].tab

    @property
    def spider(self):
        return self.factory[self].spider

    @property
    def spiderspec(self):
        return self.factory[self].spiderspec

    @property
    def user(self):
        return self.factory[self]

    def onConnect(self, request):
        try:
            auth_info = json.loads(request.headers['x-auth-info'])
        except (KeyError, TypeError):
            return
        self.session_id = ''
        self.auth_info = auth_info
        self.factory[self] = User(auth_info)

    def onOpen(self):
        if self not in self.factory:
            self.sendClose(1000, 'Invalid Connection missing required '
                                 'parameters')

    def onMessage(self, payload, isbinary):
        close_old_connections()
        payload = payload.decode('utf-8')
        data = json.loads(payload)
        project = data.get('project', data.get('_meta', {}).get('project'))
        storage = create_project_storage(project, author=self.user)
        projects = storage.__class__.get_projects(self.user)
        if project and str(project) not in projects:
            self.sendMessage({'status': 4004, 'message': 'Project not found'})
            return
        deferred = defer.maybeDeferred(
            wrap_callback, None, self._on_message, storage=storage, data=data)
        deferred.addCallbacks(self.sendMessage,
                              partial(self.send_error, data))

    def _on_message(self, storage, data):
        if '_meta' in data and 'session_id' in data['_meta']:
            self.session_id = data['_meta']['session_id']
        if is_blacklisted(data.get('url', '')):
            blacklist_error(data, self)
            return
        command = data['_command']
        command = self._handlers.get(command, command)
        with data_store_context():
            commands = Commands(data, self, storage)
            result = getattr(commands, command, lambda: None)()
        if result:
            result.setdefault('_command', data.get('_callback', command))
            if '_meta' in data and 'id' in data['_meta']:
                result['id'] = data['_meta']['id']
        return result

    def onClose(self, was_clean, code, reason):
        if self in self.factory:
            if self.tab is not None:
                self.tab.close()
                self.tab.network_manager.closed = True
            msg_data = {'session': self.session_id,
                        'session_time': 0,
                        'user': self.user.name}
            msg = (u'Websocket Closed: id=%(session)s t=%(session_time)s '
                   u'user=%(user)s command=' % (msg_data))
            log.err(msg)

    def sendMessage(self, payload, is_binary=False):
        if isinstance(payload, dict) and '_command' in payload:
            super(FerryServerProtocol, self).sendMessage(
                json.dumps(payload, cls=ScrapyJSONEncoder, sort_keys=True),
                is_binary
            )

    def send_error(self, data, failure):
        e = failure.value
        command = data.get('_callback', data.get('_command'))
        id_ = data.get('_meta', {}).get('id')
        if isinstance(e, BaseHTTPError):
            code, reason, message = e.status, e.title, e.body
        elif isinstance(e, KeyError):
            requested_command = data.get('_command')
            code = 4000
            reason = "Unknown command"
            if requested_command:
                message = 'No command named "%s" found.' % requested_command
            else:
                message = "No command received"
        else:
            code = 500
            reason = "Internal Server Error"
            message = "An unexpected error has occurred."
            log.err(failure)
            event_id = getattr(failure, 'sentry_event_id', None)
            if event_id:
                message = "%s (Event ID: %s)" % (message, event_id)
        response = {
            'error': code,
            'reason': reason,
            'message': message,
        }
        if command:
            response['_command'] = command
        if id_:
            response['id'] = id_
        self.sendMessage(response)

    def getElementByNodeId(self, nodeid):
        self.tab.web_page.mainFrame().evaluateJavaScript(
            'livePortiaPage.pyGetByNodeId(%s)' % nodeid
        )
        return self.js_api.getReturnedElement()

    def open_tab(self, meta=None):
        if meta is None:
            meta = {}
        manager = SplashQNetworkAccessManager(
            request_middlewares=[],
            response_middlewares=[],
            verbosity=defaults.VERBOSITY
        )
        manager.setCache(None)
        data = {}
        data['uid'] = id(data)
        self.factory[self].tab = PortiaBrowserTab(
            network_manager=manager,
            splash_proxy_factory=None,
            verbosity=defaults.VERBOSITY,
            render_options=RenderOptions(data, defaults.MAX_TIMEOUT),
            visible=True,
        )
        manager.tab = self.tab
        self.tab.register_callback('on_request', self._configure_requests)
        self.tab.register_callback('on_response', self._set_tab_html)
        main_frame = self.tab.web_page.mainFrame()
        cookiejar = PortiaCookieJar(self.tab.web_page, self)
        manager.cookiejar = cookiejar
        manager.setCookieJar(cookiejar)
        if meta.get('cookies'):
            cookiejar.put_client_cookies(meta['cookies'])
        main_frame.loadStarted.connect(self._on_load_started)
        main_frame.loadFinished.connect(self._on_load_finished)
        self.js_api = PortiaJSApi(self)
        main_frame.javaScriptWindowObjectCleared.connect(
            self._on_javascript_window_cleared)
        main_frame.initialLayoutCompleted.connect(self._on_layout_completed)
        self.tab.set_images_enabled(True)
        self.tab.set_viewport(meta.get('viewport') or _DEFAULT_VIEWPORT)
        self.tab.set_user_agent(meta.get('user_agent') or _DEFAULT_USER_AGENT)
        self.tab.loaded = False

    def _on_load_started(self):
        self.load_id = short_guid()
        self.sendMessage({'_command': 'loadStarted', 'id': self.load_id,
                          'url': self.tab.url})
        self.tab.initial_layout_completed = False

    def _on_load_finished(self):
        if getattr(self.tab, '_raw_url', None) != self.tab.url:
            page = self.tab.web_page
            page.triggerAction(page.ReloadAndBypassCache, False)
        self.sendMessage({'_command': 'loadFinished', 'url': self.tab.url,
                          'id': getattr(self, 'load_id', None)})

    def _configure_requests(self, request, operation, data):
        if request.hasRawHeader('Accept'):
            url = six.binary_type(request.url().toEncoded())
            url_path = urlparse(url).path.lower()
            accepts = str(request.rawHeader('Accept')).lower()
            if (accepts.startswith(STORED_TYPES) or _is_xml(accepts) or
                    url_path.endswith(STORED_EXTENSIONS)):
                request.track_response_body = True
            elif (accepts.startswith(IGNORED_TYPES) or
                    url_path.endswith(MEDIA_EXTENSIONS)):
                drop_request(request)

    def _set_tab_html(self, reply, har, content):
        url = decode(reply.url().toString())
        if content is not None and url == self.tab.url:
            self.tab._raw_html = decode(content)
            self.tab._raw_url = url

    def _on_layout_completed(self):
        if not getattr(self.tab, 'initial_layout_completed', False):
            self.populate_window_object()
            self.tab.initial_layout_completed = True

    def _on_javascript_window_cleared(self):
        if getattr(self.tab, 'initial_layout_completed', False):
            self.populate_window_object()

    def populate_window_object(self):
        main_frame = self.tab.web_page.mainFrame()
        main_frame.addToJavaScriptWindowObject('__portiaApi', self.js_api)
        self.tab.run_js_files(
            os.path.join(self.assets, 'splash_content_scripts'),
            handle_errors=False)

    def open_spider(self, meta, storage=None, project=None):
        if not (meta.get('project') and meta.get('spider')):
            return {'error': 4005, 'reason': 'No project specified'}
        if (self.user.authorized_projects is not None and
                meta['project'] not in self.user.authorized_projects and
                not self.user.staff):
            return {'error': 4004,
                    'reason': 'Project "%s" not found' % meta['project']}
        spider_name = meta['spider']
        if project is None:
            project = Project(storage, id=meta.get('project'))
        try:
            spider_model = project.spiders[spider_name]
        except (IOError, KeyError):
            return {'error': 4004,
                    'reason': 'Spider "%s" not found' % spider_name}
        spider_name, spider, items, extractors = load_spider_data(spider_model)
        if not self.settings.get('SPLASH_URL'):
            self.settings.set('SPLASH_URL', 'portia')
        self.factory[self].spider = IblSpider(spider_name, spider, items,
                                              extractors, self.settings)
        self.factory[self].spiderspec = SpiderSpec(
            project, spider_name, spider, items, extractors)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, str(self))

    def __str__(self):
        tab, spider, spec = '', '', ''
        if self.tab:
            try:
                tab = '{}({})'.format(
                    self.tab.__class__.__name__, self.tab.url)
            except RuntimeError:
                tab = 'MISSING'
        if self.spider:
            spider = '{}({})'.format(
                self.spider.__class__.__name__, self.spider.name)
        if self.spiderspec:
            spec = str(self.spiderspec)
        return ', '.join(filter(bool, (tab, spider, spec)))
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from les4.les.gb_parse.spiders.autoyoula import AutoyoulaSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule('gb_parse.settings')
    crawler_process = CrawlerProcess(settings=crawler_settings)
    crawler_process.crawl(AutoyoulaSpider)
    crawler_process.start()
def setUp(self):
    self.crawler = Crawler(DefaultSpider, Settings())
class FerryServerProtocol(WebSocketServerProtocol):

    _handlers = {
        'load': load_page,
        'interact': interact_page,
        'close_tab': close_tab,
        'heartbeat': lambda d, s: None,
        'resize': resize,
        'resolve': resolve,
        'extract_items': extract_items,
        'save_html': save_html,
        'update_spider': update_spider
    }
    assets = './'
    settings = Settings()

    @property
    def tab(self):
        return self.factory[self].tab

    @property
    def spider(self):
        return self.factory[self].spider

    @property
    def spiderspec(self):
        return self.factory[self].spiderspec

    @property
    def user(self):
        return self.factory[self]

    def onConnect(self, request):
        try:
            auth_info = json.loads(request.headers['x-auth-info'])
        except (KeyError, TypeError):
            return
        self.session_id = ''
        self.auth_info = auth_info
        self.factory[self] = User(auth_info)

    def onOpen(self):
        if self not in self.factory:
            self.sendClose(1000, 'Invalid Connection missing required '
                                 'parameters')

    def onMessage(self, payload, isbinary):
        close_old_connections()
        pool = getattr(Repoman, 'pool', None)
        payload = payload.decode('utf-8')
        data = json.loads(payload)
        project = data.get('project', data.get('_meta', {}).get('project'))
        self.storage = create_project_storage(project, author=self.user)
        projects = self.storage.__class__.get_projects(self.user)
        if project and str(project) not in projects:
            self.sendMessage({'status': 4004, 'message': 'Project not found'})
            return
        if pool is not None:
            deferred = defer.maybeDeferred(
                pool.run_deferred_with_connection, wrap_callback,
                self._on_message, self.storage, data=data)
        else:
            deferred = defer.maybeDeferred(
                wrap_callback, None, self._on_message, self.storage,
                data=data)
        deferred.addCallbacks(self.sendMessage,
                              partial(self.send_error, data))

    def _on_message(self, data):
        if '_meta' in data and 'session_id' in data['_meta']:
            self.session_id = data['_meta']['session_id']
        if is_blacklisted(data.get('url', ''), self.settings):
            blacklist_error(data, self)
            return
        command = data['_command']
        with data_store_context():
            result = self._handlers[command](data, self)
        if result:
            result.setdefault('_command', data.get('_callback', command))
            if '_meta' in data and 'id' in data['_meta']:
                result['id'] = data['_meta']['id']
        return result

    def onClose(self, was_clean, code, reason):
        if self in self.factory:
            if self.tab is not None:
                self.tab.close()
                self.tab.network_manager.closed = True
            msg_data = {'session': self.session_id,
                        'session_time': 0,
                        'user': self.user.name}
            msg = (u'Websocket Closed: id=%(session)s t=%(session_time)s '
                   u'user=%(user)s command=' % (msg_data))
            log.err(msg)

    def sendMessage(self, payload, is_binary=False):
        if isinstance(payload, dict) and '_command' in payload:
            super(FerryServerProtocol, self).sendMessage(
                json.dumps(payload, cls=ScrapyJSONEncoder, sort_keys=True),
                is_binary
            )
            self.factory[self].spider, self.storage = None, None

    def send_error(self, data, failure):
        e = failure.value
        command = data.get('_callback', data.get('_command'))
        id_ = data.get('_meta', {}).get('id')
        if isinstance(e, BaseHTTPError):
            code, reason, message = e.status, e.title, e.body
        elif isinstance(e, KeyError):
            requested_command = data.get('_command')
            code = 4000
            reason = "Unknown command"
            if requested_command:
                message = 'No command named "%s" found.' % requested_command
            else:
                message = "No command received"
        else:
            code = 500
            reason = "Internal Server Error"
            message = "An unexpected error has occurred."
            log.err(failure)
            event_id = getattr(failure, 'sentry_event_id', None)
            if event_id:
                message = "%s (Event ID: %s)" % (message, event_id)
        response = {
            'error': code,
            'reason': reason,
            'message': message,
        }
        if command:
            response['_command'] = command
        if id_:
            response['id'] = id_
        self.sendMessage(response)

    def getElementByNodeId(self, nodeid):
        self.tab.web_page.mainFrame().evaluateJavaScript(
            'livePortiaPage.pyGetByNodeId(%s)' % nodeid
        )
        return self.js_api.getReturnedElement()

    def open_tab(self, meta=None):
        if meta is None:
            meta = {}
        manager = PortiaNetworkManager(
            request_middlewares=[],
            response_middlewares=[],
            verbosity=defaults.VERBOSITY
        )
        manager.setCache(None)
        data = {}
        data['uid'] = id(data)
        self.factory[self].tab = BrowserTab(
            network_manager=manager,
            splash_proxy_factory=None,
            verbosity=0,
            render_options=RenderOptions(data, defaults.MAX_TIMEOUT),
            visible=True,
        )
        manager.tab = self.tab
        main_frame = self.tab.web_page.mainFrame()
        cookiejar = PortiaCookieJar(self.tab.web_page, self)
        manager.cookiejar = cookiejar
        manager.setCookieJar(cookiejar)
        if meta.get('cookies'):
            cookiejar.put_client_cookies(meta['cookies'])
        main_frame.loadStarted.connect(self._on_load_started)
        self.js_api = PortiaJSApi(self)
        main_frame.javaScriptWindowObjectCleared.connect(
            self.populate_window_object
        )
        self.tab.set_images_enabled(True)
        self.tab.set_viewport(meta.get('viewport') or _DEFAULT_VIEWPORT)
        self.tab.set_user_agent(meta.get('user_agent') or _DEFAULT_USER_AGENT)
        self.tab.loaded = False

    def _on_load_started(self):
        self.sendMessage({'_command': 'loadStarted'})

    def populate_window_object(self):
        main_frame = self.tab.web_page.mainFrame()
        main_frame.addToJavaScriptWindowObject('__portiaApi', self.js_api)
        self.tab.run_js_files(
            os.path.join(self.assets, 'splash_content_scripts'),
            handle_errors=False)

    def open_spider(self, meta, project=None):
        if not (meta.get('project') and meta.get('spider')):
            return {'error': 4005, 'reason': 'No project specified'}
        if (self.user.authorized_projects is not None and
                meta['project'] not in self.user.authorized_projects and
                not self.user.staff):
            return {'error': 4004,
                    'reason': 'Project "%s" not found' % meta['project']}
        spider_name = meta['spider']
        # project_meta = meta.get('project')
        # project_id = (project_meta if isinstance(project_meta, six.string_types)
        #               else project_meta.id)
        # project = Project(self.storage, id=project_id)
        if project is None:
            project = Project(self.storage, id=meta.get('project'))
        try:
            spider_model = project.spiders[spider_name]
        except IOError:
            return {'error': 4003,
                    'reason': 'Spider "%s" not found' % spider_name}
        spider_name, spider, items, extractors = load_spider_data(spider_model)
        if not self.settings.get('SPLASH_URL'):
            self.settings.set('SPLASH_URL', 'portia')
        self.factory[self].spider = IblSpider(spider_name, spider, items,
                                              extractors, self.settings)
        self.factory[self].spiderspec = SpiderSpec(
            project, spider_name, spider, items, extractors)

    def update_spider(self, meta, spider=None, template=None, items=None,
                      extractors=None):
        if not hasattr(self.factory[self], 'spiderspec'):
            return self.open_spider(meta)
        spec = self.factory[self].spiderspec
        if spec is None or spec.name != meta.get('spider'):
            return self.open_spider(meta)
        items = items or spec.items
        extractors = extractors or spec.extractors
        if spider:
            spider['templates'] = spec.spider['templates']
        else:
            spider = spec.spider
        if template:
            for idx, tmpl in enumerate(spider['templates']):
                if template['original_body'] == tmpl['original_body']:
                    spider['templates'][idx] = template
                    break
            else:
                spider['templates'].append(template)
        self.factory[self].spider = IblSpider(meta['spider'], spider, items,
                                              extractors, self.settings)
        self.factory[self].spiderspec = SpiderSpec(
            spec.project, meta['spider'], spider, items, extractors)
def setUp(self):
    self.spider = Mock()
    self.settings = Settings()
def test_feed_export_config_implicit_formats(self):
    settings = Settings()
    self.assertEqual(
        {'items_1.json': {'format': 'json'},
         'items_2.xml': {'format': 'xml'},
         'items_3.csv': {'format': 'csv'}},
        feed_process_params_from_cli(
            settings, ['items_1.json', 'items_2.xml', 'items_3.csv'])
    )
def test_feed_export_config_mismatch(self):
    settings = Settings()
    self.assertRaises(
        UsageError,
        feed_process_params_from_cli,
        settings,
        ['items1.dat', 'items2.dat'],
        'noformat'
    )
def test_feed_export_config_stdout(self):
    settings = Settings()
    self.assertEqual(
        {'stdout:': {'format': 'pickle'}},
        feed_process_params_from_cli(settings, ['-:pickle'])
    )
def test_feed_export_config_overwrite(self):
    settings = Settings()
    self.assertEqual(
        {'output.json': {'format': 'json', 'overwrite': True}},
        feed_process_params_from_cli(settings, [], None, ['output.json'])
    )
def setUp(self):
    self.crawler = mock.MagicMock()
    self.crawler.settings = Settings()
    self.crawler.engine.download = mock.MagicMock()
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from edu_parse.spiders.autoyoula import AutoyoulaSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule("edu_parse.settings")
    crawler_proc = CrawlerProcess(settings=crawler_settings)
    crawler_proc.crawl(AutoyoulaSpider)
    crawler_proc.start()
def setUp(self):
    self.tmpname = self.mktemp()
    with open(self.tmpname + '^', 'w') as f:
        f.write('0123456789')
    self.download_request = FileDownloadHandler(
        Settings()).download_request
def start_crawler(spider, search):
    # Set up spider
    spider = TripAdvisorSpider(search=search)

    # Set up settings
    settings = Settings()
    # settings.overrides['FEED_FORMAT']='csv'
    # settings.overrides['FEED_URI']='tripadvisor_{0}.csv'.format(search)
    settings.set('CLOSESPIDER_ITEMCOUNT', False)
    settings.set('ROBOTSTXT_OBEY', False)
    settings.set('COOKIES_ENABLED', False)
    settings.set(
        'ITEM_PIPELINES',
        {'tripadvisor_scraper.pipelines.TripadvisorScraperPipeline': 300})
    settings.set('DOWNLOAD_DELAY', 3)
    settings.set('LOG_FILENAME', 'log.log')
    # settings.overrides['LOG_FILENAME'] = 'log.log'
    # settings.overrides['ROBOTSTXT_OBEY'] = False  # Ignore robots.txt
    # settings.overrides['CLOSESPIDER_ITEMCOUNT']=1
    # settings.overrides['DOWNLOAD_DELAY'] = 3
    # settings.overrides['COOKIES_ENABLED'] = False
    # settings.overrides['ITEM_PIPELINES'] = {
    #     'tripadvisor_scraper.pipelines.TripadvisorScraperPipeline': 300,
    # }

    # Set up crawler
    crawler = Crawler(spider, settings)
    # crawler.configure()
    crawler.signals.connect(spider_closed, signal=signals.spider_closed)
    crawler.crawl(spider)
class StorageTest(TestCase):

    def setUp(self):
        self.spider = Spider('foo')
        self.settings = Settings()
        self.settings.setmodule(default_settings)

    def tearDown(self):
        pass

    def test_environment(self):
        oldenv = os.environ.copy()
        os.environ['http_proxy'] = http_proxy = 'https://proxy.for.http:3128'
        os.environ['https_proxy'] = https_proxy = 'http://proxy.for.https:8080'
        os.environ.pop('file_proxy', None)
        settings = deepcopy(self.settings)

        storage = EnvironmentStorage(settings)
        storage.open_spider(self.spider)
        self.assertTrue(storage, True)
        self.assertIn('http', storage)
        self.assertIn('https', storage)
        self.assertNotIn('file_proxy', storage)
        self.assertSequenceEqual(
            storage['http'],
            get_proxy(http_proxy, 'http', storage.auth_encoding))
        self.assertSequenceEqual(
            storage['https'],
            get_proxy(https_proxy, 'https', storage.auth_encoding))
        storage.close_spider(self.spider)
        os.environ = oldenv

    def test_settings(self):
        http_proxy_1 = 'https://proxy.for.http.1:3128'
        http_proxy_2 = 'https://proxy.for.http.2:3128'
        https_proxy_1 = 'http://proxy.for.https.1:8080'
        https_proxy_2 = 'http://proxy.for.https.2:8080'
        local_settings = {
            'HTTPPROXY_ENABLED': True,
            'HTTPPROXY_PROXIES': {
                'http': [http_proxy_1, http_proxy_2],
                'https': [https_proxy_1, https_proxy_2]
            }
        }
        settings = deepcopy(self.settings)
        settings.setdict(local_settings)

        storage = SettingsStorage(settings)
        storage.open_spider(self.spider)
        self.assertTrue(storage, True)
        self.assertIn('http', storage)
        self.assertIn('https', storage)
        self.assertSequenceEqual(
            storage['http'],
            get_proxy(http_proxy_1, 'http', storage.auth_encoding))
        storage.close_spider(self.spider)
def main():
    global settings
    from scrapy import cmdline
    from scrapy.settings import Settings

    parser = argparse.ArgumentParser(description=__doc__, add_help=False)
    parser.add_argument('-h', '--help', dest='help',
                        help='show help information',
                        action='store_true', default=False)

    act_group = parser.add_argument_group(title='action options')
    act_group.add_argument('-r', '--run', dest='cmd',
                           help='run the spider to collect data',
                           action='store_const', const='runspider')
    act_group.add_argument('-s', '--shell', dest='cmd',
                           help='debug in the interactive console',
                           action='store_const', const='shell')
    act_group.add_argument('-v', '--view', dest='cmd',
                           help='open a URL page fetched by the spider in the browser',
                           action='store_const', const='view')

    run_group = parser.add_argument_group(title='run options')
    run_group.add_argument('-n', '--limit-num', dest='limit', default=0,
                           help='limit the total number of requests; '
                                'default 0 means no limit',
                           type=int)
    run_group.add_argument('-m', '--max-request-num', dest='max', default=30,
                           help='maximum number of concurrent requests, '
                                'default 30; 0 means no limit',
                           type=int)
    run_group.add_argument("-a", dest="spargs", action="append", default=[],
                           metavar="NAME=VALUE",
                           help="set a spider argument (may be repeated)")
    run_group.add_argument("-o", "--output", metavar="FILE",
                           help="write the scraped items to FILE "
                                "(with -o they can be redirected to stdout)")
    run_group.add_argument("-t", "--output-format", metavar="FORMAT",
                           help="together with -o, output items in the given format")
    run_group.add_argument('-d', '--dist',
                           help='distributed run, lets other processes submit data',
                           action='store_true', default=False)

    gen_group = parser.add_argument_group(title='general options')
    gen_group.add_argument('-u', '--url',
                           help='set the URL: the start URL for a run, the URL to '
                                'debug in the shell, or the URL to open with view')

    args = parser.parse_args()
    if args.help:
        parser.print_help()
    elif args.cmd:
        settings = Settings(settings)
        if args.cmd == 'runspider':
            argv = [sys.argv[0], args.cmd, sys.argv[0]]
            for vo in run_group._group_actions:
                opt = vo.option_strings[0]
                val = args.__dict__.get(vo.dest)
                if val == vo.default:
                    continue
                if isinstance(val, (list, tuple)):
                    val = ' '.join(val)
                if vo.dest == 'limit':
                    settings['CLOSESPIDER_ITEMCOUNT'] = val
                    continue
                elif vo.dest == 'max':
                    settings['CONCURRENT_REQUESTS'] = val
                    continue
                elif vo.dest == 'dest':
                    settings['DESTRIBUT_RUN'] = val
                    continue
                argv.extend([opt, val])
            if args.url:
                argv.extend(['-a', 'START_URL=%s' % args.url])
        elif args.cmd == 'shell':
            argv = [sys.argv[0], args.cmd]
            if args.url:
                argv.append(args.url)
        elif args.cmd == 'view':
            if not args.url:
                print('please set the --url option')
                return None
            argv = [sys.argv[0], args.cmd, args.url]
        cmdline.execute(argv, settings)
    else:
        parser.print_usage()
def test_feed_export_config_invalid_format(self):
    settings = Settings()
    self.assertRaises(UsageError, feed_process_params_from_cli, settings,
                      ['items.dat'], 'noformat')
def setUp(self):
    self.download_handler = DataURIDownloadHandler(Settings())
    self.download_request = self.download_handler.download_request
    self.spider = Spider('foo')
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from avito_parse import settings
from avito_parse.spiders.avito import AvitoSpider

if __name__ == '__main__':
    scr_settings = Settings()
    scr_settings.setmodule(settings)
    process = CrawlerProcess(settings=scr_settings)
    process.crawl(AvitoSpider)
    process.start()
# import dmoz spider class
from DmozSpider import DmozSpider

# scrapy api
from scrapy import signals, log
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings


def spider_closing(spider):
    """Activates on spider closed signal"""
    log.msg("Closing reactor", level=log.INFO)
    reactor.stop()


#log.(loglevel=log.DEBUG)
settings = Settings()

# crawl responsibly
settings.set("USER_AGENT", "Kiran Koduru (+http://kirankoduru.github.io)")
crawler = Crawler(DmozSpider(), settings)

# stop reactor when spider closes
crawler.signals.connect(spider_closing, signal=signals.spider_closed)
#crawler.configure()
crawler.crawl()
#crawler.start()
reactor.run()
def setUp(self):
    self.tempdir = mkdtemp()
    self.pipeline = FilesPipeline.from_settings(
        Settings({'FILES_STORE': self.tempdir}))
    self.pipeline.download_func = _mocked_download_func
    self.pipeline.open_spider(None)
def setUp(self):
    crawler = get_crawler(Spider)
    self.spider = Spider.from_crawler(crawler, name='foo')
    self.mw = HttpErrorMiddleware(Settings({}))
    self.req = Request('http://scrapytest.org')
    self.res200, self.res404 = _responses(self.req, [200, 404])
def test_enabled_from_settings(self):
    settings = Settings()
    mwman = TestMiddlewareManager.from_settings(settings)
    classes = [x.__class__ for x in mwman.middlewares]
    self.assertEqual(classes, [M1, M3])
"//*[contains(@itemprop, 'track')]//@href").extract() for link in link_to_lyrics: yield Request(url=''.join(('http://www.songteksten.nl', link)), callback=self.parse_lyrics) def parse_lyrics(self, response): hxs = Selector(response) item = LyricsSearchItem() item['lyrics'] = hxs.xpath( "//*[contains(@itemprop, 'description')]/text()").extract() return item def callback(spider, reason): stats = spider.crawler.stats.get_stats() reactor.stop() settings = Settings() settings.set('ITEM_PIPELINES', {'pipeline.LyricsSearchPipeline': 100}) def crawl(): crawler = Crawler(settings) spider = MySpider() crawler.signals.connect(callback, signal=signals.spider_closed) crawler.configure() crawler.crawl(spider) crawler.start() reactor.run()
def setUp(self):
    self.tmpname = self.mktemp()
    fd = open(self.tmpname + '^', 'w')
    fd.write('0123456789')
    fd.close()
    self.download_request = FileDownloadHandler(Settings()).download_request
SUBMIT_TYPES = {'submit button'}
DEFAULT_POST_HEADERS = {b'Content-Type': b'application/x-www-form-urlencoded'}

USER_AGENT = ('Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 '
              'Chrome/43.0.2357.130 Safari/537.36')

base_settings = Settings(values=dict(
    TELNETCONSOLE_ENABLED=False,
    ROBOTSTXT_OBEY=False,
    DOWNLOAD_DELAY=2.0,
    DEPTH_PRIORITY=1,
    CONCURRENT_REQUESTS=2,
    CONCURRENT_REQUESTS_PER_DOMAIN=2,
    SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleFifoDiskQueue',
    SCHEDULER_MEMORY_QUEUE='scrapy.squeues.FifoMemoryQueue',
    # DOWNLOADER_MIDDLEWARES are set in get_settings
    USER_AGENT=USER_AGENT,
    DOWNLOADER_MIDDLEWARES={
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        # Placed before splash middleware
        'autologin.middleware.ProxyMiddleware': 720,
    },
))


def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from bookparser import settings
from bookparser.spiders.book24 import Book24Spider
from bookparser.spiders.labirint import LabirintSpider

if __name__ == "__main__":
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)

    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(Book24Spider)
    process.crawl(LabirintSpider)
    process.start()
def setUp(self):
    self.spider = Spider('foo')
    self.settings = Settings()
    self.settings.setmodule(default_settings)
def setUp(self):
    self.spider = Spider('foo')
    self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
    self.req = Request('http://scrapytest.org')
    self.res200, self.res404, self.res402 = _responses(
        self.req, [200, 404, 402])