def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
    """Load the project specs from ``datadir`` and prepare the settings.

    ``spider_cls`` is resolved through ``load_object`` when given;
    otherwise ``IblSpider`` is used.

    NOTE: ``settings.copy()` is called unconditionally, so the ``None``
    default is not actually usable -- callers must pass a settings object.
    """
    if spider_cls:
        self.spider_cls = load_object(spider_cls)
    else:
        self.spider_cls = IblSpider
    self._specs = open_project_from_dir(datadir)

    # Operate on a copy with the frozen flag cleared, then store the
    # loaded plugins on that copy under the 'PLUGINS' key.
    own_settings = settings.copy()
    own_settings.frozen = False
    own_settings.set('PLUGINS', load_plugins(own_settings))
    self.settings = own_settings
def test_trained(self):
    """Requests produced for the offset=10 page target offsets 90, 80, 70."""
    url_for_offset = (
        'http://www.daft.ie/ireland/houses-for-sale/?offset={}'.format)
    start_url = url_for_offset(10)
    spec = {
        'start_urls': [start_url],
        'links_to_follow': 'auto',
        'respect_nofollow': False,
        'follow_patterns': [],
        'exclude_patterns': [],
        'init_requests': [],
        'templates': [daft_sample],
    }

    settings = Settings()
    settings.set('LOADED_PLUGINS', load_plugins(settings))
    spider = IblSpider('hn', spec, {}, {}, settings=settings)

    response = HtmlResponse(
        url=start_url,
        body=daft_body,
        request=Request(start_url),
        encoding="utf-8",
    )

    # Only Request objects count; anything else yielded is ignored.
    followed_urls = set()
    for result in spider.handle_html(response):
        if isinstance(result, Request):
            followed_urls.add(result.url)

    expected_urls = {url_for_offset(offset) for offset in (90, 80, 70)}
    self.assertEqual(expected_urls, followed_urls)
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    """Build the spider from ``spec``.

    The spec is deep-copied and every keyword argument in ``kw`` is
    written into the copy; string values whose key is in ``STRING_KEYS``
    are split into lines first.  Item templates (``page_type`` equal to
    ``'item'``, which is also the default) are ordered by their
    ``'scrapes'`` value.  One instance of each loaded plugin class is
    created, set up and stored under its plugin name.
    """
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)

    # Keyword arguments override the corresponding spec entries.
    for key, val in kw.items():
        multiline = isinstance(val, basestring) and key in STRING_KEYS
        spec[key] = val.splitlines() if multiline else val

    item_pages = [
        (template['scrapes'], template)
        for template in spec['templates']
        if template.get('page_type', 'item') == 'item'
    ]
    item_pages.sort(key=itemgetter(0))
    self._item_template_pages = item_pages
    self._templates = [template for _, template in item_pages]

    self.plugins = IndexedDict()
    plugin_classes = load_plugins(settings)
    plugin_names = load_plugin_names(settings)
    for plugin_cls, plugin_name in zip(plugin_classes, plugin_names):
        plugin = plugin_cls()
        plugin.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = plugin

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)

    # An empty/falsy domain list is stored as None.
    template_domains = self._get_allowed_domains(self._templates)
    self.allowed_domains = (
        spec.get('allowed_domains', template_domains) or None)
def _configure_plugins(self, settings, spec, schemas, extractors):
    """Instantiate and set up each loaded plugin, keyed by plugin name.

    Plugin classes come from ``load_plugins(settings)`` and are paired
    positionally with the names from ``load_plugin_names(settings)``.
    Returns the resulting ``IndexedDict``.
    """
    plugin_classes = load_plugins(settings)
    plugin_names = load_plugin_names(settings)

    configured = IndexedDict()
    for cls, name in zip(plugin_classes, plugin_names):
        plugin = cls()
        plugin.setup_bot(settings, spec, schemas, extractors)
        configured[name] = plugin
    return configured
def _configure_plugins(self, settings, spec, schemas, extractors):
    """Instantiate and set up each loaded plugin, keyed by plugin name.

    Plugin classes come from ``load_plugins(settings)`` and are paired
    positionally with the names from ``load_plugin_names(settings)``.
    Each plugin's ``setup_bot`` also receives this object's ``logger``.
    Returns the resulting ``IndexedDict``.
    """
    plugin_classes = load_plugins(settings)
    plugin_names = load_plugin_names(settings)

    configured = IndexedDict()
    for cls, name in zip(plugin_classes, plugin_names):
        plugin = cls()
        plugin.setup_bot(settings, spec, schemas, extractors, self.logger)
        configured[name] = plugin
    return configured
def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
    """Load the project specs from ``datadir`` and prepare the settings.

    When ``settings`` is ``None`` the result of ``get_project_settings()``
    is used.  ``spider_cls`` is resolved through ``load_object`` when
    given; otherwise ``IblSpider`` is used.
    """
    logging.info('Slybot %s Spider', slybot.__version__)
    base_settings = (
        get_project_settings() if settings is None else settings)

    if spider_cls:
        self.spider_cls = load_object(spider_cls)
    else:
        self.spider_cls = IblSpider
    self._specs = open_project_from_dir(datadir)

    # Operate on a copy with the frozen flag cleared, then store the
    # loaded plugins on that copy under the 'LOADED_PLUGINS' key.
    own_settings = base_settings.copy()
    own_settings.frozen = False
    own_settings.set('LOADED_PLUGINS', load_plugins(own_settings))
    self.settings = own_settings
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    """Build the spider from ``spec`` and ``settings``.

    The spec is deep-copied and every keyword argument in ``kw`` is
    written into the copy; string values whose key is in ``STRING_KEYS``
    are split into lines first.  Item templates (``page_type`` equal to
    ``'item'``, which is also the default) are ordered by their
    ``'scrapes'`` value.  One instance of each loaded plugin class is
    created, set up and stored under its plugin name.

    JavaScript rendering is only switched on when the ``SPLASH_URL``
    setting is truthy and the spec sets ``js_enabled``.

    NOTE: ``settings.get`` is called unconditionally, so the ``None``
    default is not actually usable -- callers must pass a settings object.
    """
    super(IblSpider, self).__init__(name, **kw)
    self._job_id = settings.get('JOB', '')
    spec = deepcopy(spec)

    # Keyword arguments override the corresponding spec entries.
    for key, val in kw.items():
        multiline = isinstance(val, six.string_types) and key in STRING_KEYS
        spec[key] = val.splitlines() if multiline else val

    item_pages = [
        (template['scrapes'], template)
        for template in spec['templates']
        if template.get('page_type', 'item') == 'item'
    ]
    item_pages.sort(key=itemgetter(0))
    self._item_template_pages = item_pages
    self._templates = [template for _, template in item_pages]

    self.plugins = IndexedDict()
    plugin_classes = load_plugins(settings)
    plugin_names = load_plugin_names(settings)
    for plugin_cls, plugin_name in zip(plugin_classes, plugin_names):
        plugin = plugin_cls()
        plugin.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = plugin

    self.js_enabled = False
    self.SPLASH_HOST = None
    splash_url = settings.get('SPLASH_URL')
    if splash_url:
        self.SPLASH_HOST = urlparse(splash_url).hostname
        self.js_enabled = spec.get('js_enabled', False)
    if self.js_enabled:
        # The auth header is only built when at least one credential
        # setting is present.
        has_credentials = (settings.get('SPLASH_PASS') is not None or
                           settings.get('SPLASH_USER') is not None)
        if has_credentials:
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
    self._filter_js_urls = self._build_js_url_filter(spec)

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)

    # An empty/falsy domain list is stored as None.
    template_domains = self._get_allowed_domains(self._templates)
    self.allowed_domains = (
        spec.get('allowed_domains', template_domains) or None)
    self.page_actions = spec.get('page_actions', [])
def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
    """Load the project specs from ``datadir`` and prepare the settings.

    ``datadir`` may be a zip archive, in which case it is extracted into
    a fresh temporary directory (removed at interpreter exit) and that
    directory is used instead.

    When ``settings`` is ``None`` the result of ``get_project_settings()``
    is used.  ``spider_cls`` is resolved through ``load_object`` when
    given; otherwise ``IblSpider`` is used.
    """
    logging.info('Slybot %s Spider', slybot.__version__)
    if is_zipfile(datadir):
        tempdir = tempfile.mkdtemp(prefix='slybot-')
        # Register the cleanup before extracting so the directory is
        # still removed if extraction fails part-way through.
        atexit.register(shutil.rmtree, tempdir)
        # Close the archive handle as soon as extraction is done.
        with ZipFile(datadir) as archive:
            archive.extractall(tempdir)
        datadir = tempdir
    if settings is None:
        settings = get_project_settings()
    self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider
    self._specs = open_project_from_dir(datadir)

    # Operate on a copy with the frozen flag cleared, then store the
    # loaded plugins on that copy under the 'LOADED_PLUGINS' key.
    settings = settings.copy()
    settings.frozen = False
    settings.set('LOADED_PLUGINS', load_plugins(settings))
    self.settings = settings
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    """Build the spider from ``spec`` and ``settings``.

    The spec is deep-copied and every keyword argument in ``kw`` is
    written into the copy; string values whose key is in ``STRING_KEYS``
    are split into lines first.  Item templates (``page_type`` equal to
    ``'item'``, which is also the default) are ordered by their
    ``'scrapes'`` value.  One instance of each loaded plugin class is
    created, set up and stored under its plugin name.

    JavaScript rendering is only switched on when the ``SPLASH_URL``
    setting is truthy and the spec sets ``js_enabled``.

    NOTE: ``settings.get`` is called unconditionally, so the ``None``
    default is not actually usable -- callers must pass a settings object.
    """
    super(IblSpider, self).__init__(name, **kw)
    self._job_id = settings.get('JOB', '')
    spec = deepcopy(spec)

    # Keyword arguments override the corresponding spec entries.
    for key, val in kw.items():
        if isinstance(val, six.string_types) and key in STRING_KEYS:
            spec[key] = val.splitlines()
        else:
            spec[key] = val

    item_templates = []
    for template in spec['templates']:
        if template.get('page_type', 'item') == 'item':
            item_templates.append((template['scrapes'], template))
    self._item_template_pages = sorted(item_templates, key=itemgetter(0))
    self._templates = [pair[1] for pair in self._item_template_pages]

    self.plugins = IndexedDict()
    named_plugins = zip(load_plugins(settings), load_plugin_names(settings))
    for plugin_cls, plugin_name in named_plugins:
        plugin = plugin_cls()
        plugin.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = plugin

    self.SPLASH_HOST = None
    self.js_enabled = False
    splash_url = settings.get('SPLASH_URL')
    if splash_url:
        self.SPLASH_HOST = urlparse(splash_url).hostname
        self.js_enabled = spec.get('js_enabled', False)
    # The auth header is only built when JS is enabled and at least one
    # credential setting is present.
    if self.js_enabled and not (settings.get('SPLASH_PASS') is None and
                                settings.get('SPLASH_USER') is None):
        splash_user = settings.get('SPLASH_USER', '')
        splash_pass = settings.get('SPLASH_PASS', '')
        self.splash_auth = basic_auth_header(splash_user, splash_pass)
    self._filter_js_urls = self._build_js_url_filter(spec)

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    init_requests = spec.get("init_requests", [])
    self._create_init_requests(init_requests)
    self._process_start_urls(spec)

    # An empty/falsy domain list is stored as None.
    fallback_domains = self._get_allowed_domains(self._templates)
    self.allowed_domains = (
        spec.get('allowed_domains', fallback_domains) or None)
    self.page_actions = spec.get('page_actions', [])
def test_trained(self):
    """Requests produced for the offset=10 page target offsets 90, 80, 70."""
    url_for_offset = (
        'http://www.daft.ie/ireland/houses-for-sale/?offset={}'.format)
    start_url = url_for_offset(10)
    spec = {
        'start_urls': [start_url],
        'links_to_follow': 'auto',
        'respect_nofollow': False,
        'follow_patterns': [],
        'exclude_patterns': [],
        'init_requests': [],
        'templates': [daft_sample],
    }

    settings = Settings()
    settings.set('LOADED_PLUGINS', load_plugins(settings))
    spider = IblSpider('hn', spec, {}, {}, settings=settings)

    response = UTF8HtmlResponse(
        url=start_url,
        body=daft_body,
        request=Request(start_url),
    )

    # Only Request objects count; anything else yielded is ignored.
    followed_urls = set()
    for result in spider.handle_html(response):
        if isinstance(result, Request):
            followed_urls.add(result.url)

    expected_urls = {url_for_offset(offset) for offset in (90, 80, 70)}
    self.assertEqual(expected_urls, followed_urls)
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    """Build the spider from ``spec``.

    The spec is deep-copied and every keyword argument in ``kw`` is
    written into the copy; string values whose key is in ``STRING_KEYS``
    are split into lines first.  Item templates (``page_type`` equal to
    ``'item'``, which is also the default) are ordered by their
    ``'scrapes'`` value.  One instance of each loaded plugin class is
    created, set up and stored under its plugin name.
    """
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)

    # Keyword arguments override the corresponding spec entries.
    for key, val in kw.items():
        if isinstance(val, basestring) and key in STRING_KEYS:
            spec[key] = val.splitlines()
        else:
            spec[key] = val

    item_templates = []
    for template in spec['templates']:
        if template.get('page_type', 'item') == 'item':
            item_templates.append((template['scrapes'], template))
    self._item_template_pages = sorted(item_templates, key=itemgetter(0))
    self._templates = [pair[1] for pair in self._item_template_pages]

    self.plugins = IndexedDict()
    named_plugins = zip(load_plugins(settings), load_plugin_names(settings))
    for plugin_cls, plugin_name in named_plugins:
        plugin = plugin_cls()
        plugin.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = plugin

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    init_requests = spec.get("init_requests", [])
    self._create_init_requests(init_requests)
    self._process_start_urls(spec)

    # An empty/falsy domain list is stored as None.
    fallback_domains = self._get_allowed_domains(self._templates)
    self.allowed_domains = (
        spec.get('allowed_domains', fallback_domains) or None)