def render_POST(self, request, merge=False):
    """Validate the posted JSON for the requested resource and save it.

    The first element of ``request.postpath`` selects the schema.  For
    ``spiders`` paths the number of path elements decides the action:

    * one element, or an empty second element: the body is handed to
      ``handle_spider_command`` and its result is returned directly;
    * two elements: the body is passed through ``clean_spider`` and
      validated as a ``spider``;
    * three elements: the body is validated as a ``template``; a missing
      ``original_body`` is copied from the stored template and the body
      is passed through ``add_plugin_data``.

    Args:
        request: request object providing ``project``, ``auth_info`` and
            ``postpath``.
        merge: not used by this method.

    Returns:
        ``''`` once the data has been validated and saved, or the result
        of ``handle_spider_command`` for spider commands.

    NOTE(review): ``not_found``/``bad_request``/``error`` are presumed to
    raise; if they return normally this method returns ``None`` -- confirm.
    NOTE(review): block nesting was reconstructed from a collapsed source
    line -- confirm against the original file.
    """
    obj = self.read_json(request)
    project_spec = self.spec_manager.project_spec(request.project,
                                                  request.auth_info)
    resource = None
    try:
        # validate the request path and data
        rpath = request.postpath
        resource = rpath[0]
        if resource == 'spiders':
            resource = 'spider'
            if len(rpath) == 1 or not rpath[1]:
                return self.handle_spider_command(project_spec, obj)
            elif len(rpath) == 2:
                clean_spider(obj)
            elif len(rpath) == 3:
                resource = 'template'
                if obj.get('original_body') is None:
                    templ = project_spec.template_json(rpath[1], rpath[2])
                    obj['original_body'] = templ.get('original_body', '')
                obj = add_plugin_data(obj, project_spec.plugins)
        get_schema_validator(resource).validate(obj)
    except (KeyError, IndexError):
        self.not_found()
    except (AssertionError, ValidationError) as ex:
        # AssertionError has no ``message`` attribute on Python 3; fall back
        # to str(ex) lazily so the handler itself can never fail.
        message = getattr(ex, 'message', None)
        if message is None:
            message = str(ex)
        self.bad_request(
            "The %s data was not valid. Validation failed with the error: %s."
            % (resource or 'input', message))
    except BaseHTTPError as ex:
        self.error(ex.status, ex.title, ex.body)
    else:
        project_spec.savejson(obj, request.postpath)
        return ''
def render_POST(self, request, merge=False):
    """Validate a posted JSON document and persist it in the project spec.

    ``request.postpath[0]`` names the resource whose schema is used.  A
    ``spiders`` path is refined by its length: a bare ``spiders`` path (or
    one with an empty second element) is dispatched to
    ``handle_spider_command``; a two-element path is cleaned with
    ``clean_spider`` and validated as ``spider``; a three-element path is
    validated as ``template`` after ``original_body`` has been filled in
    from the stored template (when missing) and ``add_plugin_data`` has
    been applied.

    Args:
        request: request object providing ``project``, ``auth_info`` and
            ``postpath``.
        merge: not used by this method.

    Returns:
        ``''`` after a successful save, or whatever
        ``handle_spider_command`` returns for spider commands.

    NOTE(review): the error helpers (``not_found``, ``bad_request``,
    ``error``) are presumed to raise -- confirm.
    NOTE(review): block nesting was reconstructed from a collapsed source
    line -- confirm against the original file.
    """
    obj = self.read_json(request)
    project_spec = self.spec_manager.project_spec(
        request.project, request.auth_info)
    resource = None
    try:
        # validate the request path and data
        rpath = request.postpath
        resource = rpath[0]
        if resource == 'spiders':
            resource = 'spider'
            if len(rpath) == 1 or not rpath[1]:
                return self.handle_spider_command(project_spec, obj)
            elif len(rpath) == 2:
                clean_spider(obj)
            elif len(rpath) == 3:
                resource = 'template'
                if obj.get('original_body') is None:
                    templ = project_spec.template_json(rpath[1], rpath[2])
                    obj['original_body'] = templ.get('original_body', '')
                obj = add_plugin_data(obj, project_spec.plugins)
        get_schema_validator(resource).validate(obj)
    except (KeyError, IndexError):
        self.not_found()
    except (AssertionError, ValidationError) as ex:
        # Not every exception caught here carries ``message`` (a plain
        # AssertionError does not on Python 3), so only use it when present.
        detail = getattr(ex, 'message', None)
        if detail is None:
            detail = str(ex)
        self.bad_request(
            "The %s data was not valid. Validation failed with the error: %s."
            % (resource or 'input', detail))
    except BaseHTTPError as ex:
        self.error(ex.status, ex.title, ex.body)
    else:
        project_spec.savejson(obj, request.postpath)
        return ''
def render_POST(self, request, merge=False):
    """Check the posted JSON against its schema and store it on success.

    The schema is chosen from ``request.postpath``.  Spider paths with a
    single element (or an empty second element) are forwarded to
    ``handle_spider_command``; two-element paths are cleaned with
    ``clean_spider``; three-element paths are treated as templates, which
    get their ``original_body`` filled in from the stored template when
    absent, are passed to ``annotate_template`` and lose their
    ``annotations`` field before validation.

    ``merge`` is not used by this method.

    NOTE(review): block nesting was reconstructed from a collapsed source
    line -- confirm against the original file.
    """
    payload = self.read_json(request)
    spec = self.spec_manager.project_spec(request.project, request.auth_info)
    try:
        # Work out which schema applies and prepare the payload for it.
        segments = request.postpath
        resource = segments[0]
        if resource == 'spiders':
            resource = 'spider'
            depth = len(segments)
            if depth == 1 or not segments[1]:
                return self.handle_spider_command(spec, payload)
            if depth == 2:
                clean_spider(payload)
            elif depth == 3:
                resource = 'template'
                if payload.get('original_body') is None:
                    stored = spec.template_json(segments[1], segments[2])
                    payload['original_body'] = stored.get('original_body', '')
                annotate_template(payload)
                # slybot does not use the annotations field, so drop it.
                payload.pop('annotations', None)
        get_schema_validator(resource).validate(payload)
    except (KeyError, IndexError):
        self.error(404, "Not Found", "No such resource")
    except ValidationError as err:
        self.bad_request("Json failed validation: %s" % err.message)
    except BaseHTTPError as err:
        self.error(err.status, err.title, err.body)
    else:
        spec.savejson(payload, request.postpath)
        return ''
def test_valid_url(self):
    """The spider schema accepts every listed start URL in both formats.

    The URLs are validated twice: as plain strings and after each one has
    been passed through ``start_url_schema``.
    """
    legacy_start_urls = [
        'http://www.example.com/',
        'http://www.example.com/經濟',
        'http://www.example.com/?q=經濟',
        'http://www.example.com/#經濟',
        'http://faß.de',
        'http://例.jp/',
        'http://[2001:0000:1234:0000:0000:C1C0:ABCD:0876]/foo/bar',
        'http://[2001::]/foo/bar',
        'http://8.8.8.8/foo/bar',
        'http://*****:*****@localhost:8080/foo/bar',
        'http://*****:*****@localhost:8080/foo/bar',
        # NOTE(review): "¶m2"/"¶m3" look like a mis-decoded "&param2" /
        # "&param3" -- confirm before changing the literal.
        'http://domain.com/path/file.html?param=FOO^111¶m2=bar¶m3=true&_param4=on',  # Anonymized URL from sentry d46840d2457c4042b1b58f2fa40e984b
        'https://domain.com/path/file.htm?param=foo#hash/foo/bar/baz:foo|bar:baz',  # Anonymized URL from sentry 01dd2fa09d9540b69ebd33372b2b3a2d
        'https://domain.com/path/file.htm?param=foo#hash/foo/bar/baz:foo|bar%5B%5D:12345',  # Anonymized URL from sentry 87d49ee751494c90a8941dcbdacea634
        'http://domain.com/path?bar[foo]=baz&foo[bar]=12345',  # Anonymized URL from sentry 9f6835f5decd4d57b9475f04f0a58bd4
    ]
    # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
    # which is not the list the spider document is expected to hold.
    start_urls = [start_url_schema(url) for url in legacy_start_urls]
    validator = get_schema_validator("spider")
    self.assertIsNone(validator.validate(spider_json(legacy_start_urls)))
    self.assertIsNone(validator.validate(spider_json(start_urls)))
def test_valid_fragments(self):
    """Every listed fragment combination validates in a generated start URL."""
    def fragment(kind, value):
        # Fresh dict per call so no fragment object is shared between cases.
        return {'type': kind, 'value': value}

    fragment_sets = (
        [fragment('fixed', 'domain.com'), fragment('range', '0-10')],
        [fragment('range', '0-10')],
        [fragment('list', 'a b c')],
        [fragment('list', 'one_element')],
    )
    validator = get_schema_validator("spider")
    for fragment_set in fragment_sets:
        generated = {
            'url': 'http://domain.com',
            'type': 'generated',
            'fragments': fragment_set,
        }
        outcome = validator.validate(spider_json([generated]))
        self.assertEqual(outcome, None)
def test_regex_formatting_ok(self):
    """The extractors schema accepts a well-formed regular expression."""
    obj = {
        "0": {
            # Raw string: "\d" is not a valid escape in an ordinary literal
            # and newer Python versions warn about it.  The value is the same.
            "regular_expression": r"Item: (\d+)",
        },
    }
    validator = get_schema_validator("extractors")
    self.assertIsNone(validator.validate(obj))
def test_valid_url(self):
    """A spider with a single absolute http start URL passes validation."""
    start_urls = ['http://www.example.com/']
    spider = {
        "start_urls": start_urls,
        "links_to_follow": "none",
        "respect_nofollow": True,
        "templates": [],
    }
    outcome = get_schema_validator("spider").validate(spider)
    self.assertEqual(outcome, None)
def test_regex_formatting_wrong(self):
    """The extractors schema rejects a regex with an unclosed group.

    The raised ``ValidationError`` must mention "Invalid regular expression".
    """
    obj = {
        "0": {
            # Raw string: "\d" is not a valid escape in an ordinary literal.
            "regular_expression": r"Item: (\d+",
        },
    }
    validator = get_schema_validator("extractors")
    # ``assertRaisesRegexp`` is a deprecated alias that recent unittest
    # versions no longer provide; use the current name when it exists and
    # fall back to the old one otherwise.
    assert_raises_regex = (getattr(self, 'assertRaisesRegex', None)
                           or self.assertRaisesRegexp)
    assert_raises_regex(ValidationError, "Invalid regular expression",
                        validator.validate, obj)
def test_regex_formatting_wrong(self):
    """The extractors schema rejects a regex with an unclosed group."""
    obj = {
        "0": {
            # Raw string: "\d" is not a valid escape in an ordinary literal
            # and newer Python versions warn about it.  The value is the same.
            "regular_expression": r"Item: (\d+",
        },
    }
    validator = get_schema_validator("extractors")
    with self.assertRaises(ValidationError):
        validator.validate(obj)
def render_POST(self, request):
    """Validate the posted JSON for the requested resource and save it.

    ``request.postpath[0]`` names the resource.  For ``spiders`` a path
    with a single element (or an empty second element) is handed to
    ``handle_spider_command`` and its result returned; otherwise the body
    is passed to ``annotate_templates`` and validated as a ``spider``.

    A missing path element or unknown key is reported through
    ``self.error`` (404) and a schema failure through ``self.bad_request``.
    The data is saved only when validation raised nothing.

    Returns:
        ``''``, or the result of ``handle_spider_command`` for commands.
    """
    obj = self.read_json(request)
    project_spec = self.spec_manager.project_spec(request.project)
    try:
        # validate the request path and data
        resource = request.postpath[0]
        if resource == 'spiders':
            if len(request.postpath) == 1 or not request.postpath[1]:
                return self.handle_spider_command(project_spec, obj)
            annotate_templates(obj)
            resource = 'spider'
        get_schema_validator(resource).validate(obj)
    except (KeyError, IndexError):
        self.error(404, "Not Found", "No such resource")
    except ValidationError as ex:
        self.bad_request("Json failed validation: %s" % ex.message)
    else:
        # Persist only data that passed validation, even if the error
        # helpers above return instead of raising.
        project_spec.savejson(obj, request.postpath)
    return ''
def verify_data(self, path=None, obj=None, project_spec=None):
    """Prepare ``obj`` for the resource named by ``path`` and validate it.

    Raises ``self.errors.BadRequest`` when ``path`` is empty or when
    ``obj`` or ``project_spec`` is ``None``.  Spider paths with one element
    (or an empty second element) are forwarded to
    ``handle_spider_command`` and its result is returned.  Otherwise the
    (possibly replaced) object is returned after schema validation.

    NOTE(review): block nesting was reconstructed from a collapsed source
    line -- confirm against the original file.
    """
    missing_input = not path or obj is None or project_spec is None
    if missing_input:
        raise self.errors.BadRequest('No path received')

    head = path[0]
    if head != 'spiders':
        # Any other resource is validated against the schema of its own name.
        get_schema_validator(head).validate(obj)
        return obj

    depth = len(path)
    if depth == 1 or not path[1]:
        return self.handle_spider_command(project_spec, obj)

    resource = 'spider'
    if depth == 2:
        clean_spider(obj)
    elif depth == 3:
        resource = 'template'
        if obj.get('original_body') is None:
            # Reuse the body stored with the existing template.
            stored = project_spec.template_json(path[1], path[2])
            obj['original_body'] = stored.get('original_body', '')
        obj = add_plugin_data(obj, project_spec.plugins)
    get_schema_validator(resource).validate(obj)
    return obj
def test_invalid_url(self):
    """A start URL without a scheme is rejected with an "Invalid url:" error."""
    obj = {
        "start_urls": ['www.example.com'],
        "links_to_follow": "none",
        "respect_nofollow": True,
        "templates": [],
    }
    validator = get_schema_validator("spider")
    # ``assertRaisesRegexp`` is a deprecated alias that recent unittest
    # versions no longer provide; use the current name when it exists and
    # fall back to the old one otherwise.
    assert_raises_regex = (getattr(self, 'assertRaisesRegex', None)
                           or self.assertRaisesRegexp)
    assert_raises_regex(ValidationError, "Invalid url:",
                        validator.validate, obj)
def test_valid_mixed_fragments(self):
    """A spider mixing a plain URL and a generated URL passes validation."""
    plain_url = {'type': 'url', 'url': 'http://www.example.com/'}
    generated_url = {
        'type': 'generated',
        'url': 'http://',
        'fragments': [
            {'type': 'fixed', 'value': 'http://'},
        ],
    }
    mixed = [plain_url, generated_url]
    validator = get_schema_validator("spider")
    outcome = validator.validate(spider_json(mixed))
    self.assertEqual(outcome, None)
def test_schema_format(self):
    """Validate a spider once for each value in ``self.specs``.

    The same spider dict is reused; only its ``generated_urls`` entry is
    replaced before each validation.  The test passes when no validation
    raises.
    """
    validator = get_schema_validator('spider')
    spider = {
        'start_urls_type': 'generated_urls',
        'start_urls': [],
        'links_to_follow': 'none',
        'respect_nofollow': True,
    }
    for generated_spec in self.specs.values():
        spider.update({'generated_urls': generated_spec})
        validator.validate(spider)
def test_invalid_url(self):
    """Each malformed start URL makes spider validation raise."""
    rejected = (
        12345,  # Not a string
        'example.com',  # Lacks protocol
        'http://[:::1]/foo/bar',  # Bad IPv6 addr
        'http://http://foo.com/bar',  # Double protocol
        'spotify:foobar',  # Not http/s protocol
        '/foo',  # relative
        '?foo',  # relative
        '#foo',  # relative
    )
    validator = get_schema_validator("spider")

    def validate_single(candidate):
        # Building the spider document stays inside the checked call.
        validator.validate(spider_json([candidate]))

    for candidate in rejected:
        self.assertRaises(ValidationError, validate_single, candidate)
def test_invalid_url(self):
    """Each malformed start URL makes spider validation raise."""
    invalid_urls = (
        12345,  # Not a string
        'example.com',  # Lacks protocol
        'http://[:::1]/foo/bar',  # Bad IPv6 addr
        '/foo',  # relative
        '?foo',  # relative
        '#foo',  # relative
    )
    # The validator does not depend on the URL under test, so fetch it once
    # instead of once per iteration.
    validator = get_schema_validator("spider")
    for invalid_url in invalid_urls:
        obj = {
            "start_urls": [invalid_url],
            "links_to_follow": "none",
            "respect_nofollow": True,
            "templates": [],
        }
        with self.assertRaises(ValidationError):
            validator.validate(obj)
def test_valid_fragments(self):
    """Every listed fragment combination validates in a generated start URL."""
    # Each case is a sequence of (type, value) pairs describing one URL.
    cases = (
        (('fixed', 'domain.com'), ('range', '0-10')),
        (('range', '0-10'),),
        (('list', 'a b c'),),
        (('list', 'one_element'),),
    )
    validator = get_schema_validator("spider")
    for case in cases:
        fragments = [{'type': kind, 'value': value} for kind, value in case]
        start_url = [{
            'url': 'http://domain.com',
            'type': 'generated',
            'fragments': fragments,
        }]
        outcome = validator.validate(spider_json(start_url))
        self.assertEqual(outcome, None)
def test_valid_url(self):
    """A spider whose start URLs are all in the list below passes validation.

    The list covers non-ASCII paths, queries, fragments and host names as
    well as IPv6 and IPv4 literals.
    """
    accepted_urls = [
        'http://www.example.com/',
        'http://www.example.com/經濟',
        'http://www.example.com/?q=經濟',
        'http://www.example.com/#經濟',
        'http://faß.de',
        'http://例.jp/',
        'http://[2001:0000:1234:0000:0000:C1C0:ABCD:0876]/foo/bar',
        'http://[2001::]/foo/bar',
        'http://8.8.8.8/foo/bar',
    ]
    spider = {
        "start_urls": accepted_urls,
        "links_to_follow": "none",
        "respect_nofollow": True,
        "templates": [],
    }
    outcome = get_schema_validator("spider").validate(spider)
    self.assertEqual(outcome, None)
def test_regex_formatting_wrong(self):
    """The extractors schema rejects a regex with an unclosed group.

    The raised ``ValidationError`` must mention "Invalid regular expression".
    """
    # Raw string: "\d" is not a valid escape in an ordinary literal.
    obj = {"0": {"regular_expression": r"Item: (\d+"}}
    validator = get_schema_validator("extractors")
    # ``assertRaisesRegexp`` is a deprecated alias that recent unittest
    # versions no longer provide; use the current name when it exists and
    # fall back to the old one otherwise.
    assert_raises_regex = (getattr(self, 'assertRaisesRegex', None)
                           or self.assertRaisesRegexp)
    assert_raises_regex(ValidationError, "Invalid regular expression",
                        validator.validate, obj)
def validump_resource(jsonres, restype):
    """Validate ``jsonres`` against the ``restype`` schema, then serialize it.

    Returns the text produced by ``json.dumps``.  Anything the validator
    raises propagates before serialization is attempted.
    """
    validator = get_schema_validator(restype)
    validator.validate(jsonres)
    serialized = json.dumps(jsonres)
    return serialized