def test_can_get_default_violations_values(self):
    config = Config()
    config.BLACKLIST_DOMAIN = ['a.com']

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    validator = BlackListValidator(reviewer)

    violations_values = validator.get_default_violations_values(config)

    expect(violations_values).to_include('blacklist.domains')
    expect(violations_values['blacklist.domains']).to_length(2)
    expect(violations_values['blacklist.domains']).to_be_like({
        'value': config.BLACKLIST_DOMAIN,
        'description': config.get_description('BLACKLIST_DOMAIN')
    })
def test_add_violation_when_sitemap_with_good_link(self):
    page = PageFactory.create(url='http://globo.com')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = SitemapValidator(reviewer)
    validator.review.data['sitemap.files.size'] = {
        'http://g1.globo.com/sitemap.xml': 10
    }
    validator.review.data['sitemap.data'] = {
        'http://g1.globo.com/sitemap.xml': Mock(
            status_code=200,
            text='data',
            url='http://g1.globo.com/%C3%BCmlat.php&q=name'
        )
    }
    validator.review.data['sitemap.files.urls'] = {
        'http://g1.globo.com/sitemap.xml': 20
    }
    validator.review.data['sitemap.urls'] = {
        'http://g1.globo.com/sitemap.xml': [
            'http://g1.globo.com/%C3%BCmlat.php&q=name'
        ]
    }

    validator.add_violation = Mock()
    validator.flush = Mock()

    validator.validate()

    expect(validator.add_violation.call_count).to_equal(0)
    expect(validator.flush.call_count).to_equal(1)
def test_add_violation_when_sitemap_is_too_large(self):
    page = PageFactory.create(url='http://globo.com')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = SitemapValidator(reviewer)
    validator.review.data['sitemap.files.size'] = {
        'http://g1.globo.com/sitemap.xml': 10241
    }
    validator.review.data['sitemap.data'] = {
        'http://g1.globo.com/sitemap.xml': Mock(status_code=200, text='data')
    }
    validator.review.data['sitemap.files.urls'] = {
        'http://g1.globo.com/sitemap.xml': 10
    }
    validator.review.data['sitemap.urls'] = {
        'http://g1.globo.com/sitemap.xml': []
    }

    validator.add_violation = Mock()

    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='total.size.sitemap',
        value={
            'url': 'http://g1.globo.com/sitemap.xml',
            'size': 10.0009765625  # 10241 / 1024
        },
        points=10
    )
def test_can_get_default_violations_values(self):
    config = Config()
    config.REQUIRED_META_TAGS = ['description']

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    validator = RequiredMetaTagsValidator(reviewer)

    violations_values = validator.get_default_violations_values(config)

    expect(violations_values).to_include('absent.meta.tags')
    expect(violations_values['absent.meta.tags']).to_length(2)
    expect(violations_values['absent.meta.tags']).to_be_like({
        'value': config.REQUIRED_META_TAGS,
        'description': config.get_description('REQUIRED_META_TAGS')
    })
def get_reviewer(
        self, api_url=None, page_uuid=None, page_url='http://page.url',
        page_score=0.0, config=None):
    if api_url is None:
        api_url = self.get_url('/')

    if page_uuid is None:
        page_uuid = str(uuid4())

    if config is None:
        config = self.config

    return Reviewer(
        api_url=api_url,
        page_uuid=str(page_uuid),
        page_url=page_url,
        page_score=page_score,
        config=config,
        validators=self.validators,
        facters=self.facters,
        search_provider=self.search_provider,
        wait=self.otto.wait,
        wait_timeout=0,  # max time to wait for all requests to finish
        db=self.db,
        cache=self.cache,
        publish=self.publish,
        async_get=self.async_get,
        fact_definitions=self.fact_definitions,
        violation_definitions=self.violation_definitions,
    )
def test_can_validate_with_headers(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = LastModifiedValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {
        'page.last_modified': datetime.datetime(2014, 1, 13, 1, 16, 10)
    }
    validator.review.facts = {
        'page.last_modified': datetime.datetime(2014, 1, 13, 1, 16, 10)
    }

    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_can_get_default_violations_values(self):
    config = Config()
    config.FORCE_CANONICAL = False

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    validator = LinkWithRelCanonicalValidator(reviewer)

    violations_values = validator.get_default_violations_values(config)

    expect(violations_values).to_include('absent.meta.canonical')
    expect(violations_values['absent.meta.canonical']).to_length(2)
    expect(violations_values['absent.meta.canonical']).to_be_like({
        'value': config.FORCE_CANONICAL,
        'description': config.get_description('FORCE_CANONICAL')
    })
def test_validate(self):
    config = Config()

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    content = '<html><body></body></html>'
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content)
    }
    reviewer.responses[page.url] = result
    reviewer.get_response = Mock(return_value=result)

    validator = SchemaOrgItemTypeValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {'page.body': [{}]}

    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(key='absent.schema.itemscope', value=None, points=10))
    expect(validator.add_violation.call_args_list).to_include(
        call(key='absent.schema.itemtype', value=None, points=10))
def test_can_get_default_violations_values(self):
    config = Config()
    config.MAX_TITLE_SIZE = 70

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    validator = TitleValidator(reviewer)

    violations_values = validator.get_default_violations_values(config)

    expect(violations_values).to_include('page.title.size')
    expect(violations_values['page.title.size']).to_length(2)
    expect(violations_values['page.title.size']).to_be_like({
        'value': config.MAX_TITLE_SIZE,
        'description': config.get_description('MAX_TITLE_SIZE')
    })
def test_handle_sitemap_url_loaded(self):
    page = PageFactory.create(url="http://g1.globo.com/")
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )
    reviewer.enqueue = Mock()

    content = self.get_file('url_sitemap.xml')
    response = Mock(status_code=200, text=content)

    facter = SitemapFacter(reviewer)
    facter.async_get = Mock()
    facter.get_facts()

    facter.handle_sitemap_loaded("http://g1.globo.com/sitemap.xml", response)

    expect(
        facter.review.data['sitemap.files.size']["http://g1.globo.com/sitemap.xml"]
    ).to_equal(0.296875)
    expect(
        facter.review.data['sitemap.urls']["http://g1.globo.com/sitemap.xml"]
    ).to_equal(set(['http://domain.com/1.html', 'http://domain.com/2.html']))
    expect(facter.review.facts['total.size.sitemap']['value']).to_equal(0.296875)
    expect(facter.review.facts['total.size.sitemap.gzipped']['value']).to_equal(0.1494140625)
    expect(facter.review.data['total.size.sitemap']).to_equal(0.296875)
    expect(facter.review.data['total.size.sitemap.gzipped']).to_equal(0.1494140625)
    expect(
        facter.review.data['sitemap.files.urls']["http://g1.globo.com/sitemap.xml"]
    ).to_equal(2)
    expect(facter.review.facts['total.sitemap.urls']['value']).to_equal(2)
def test_can_get_default_violations_values(self):
    config = Config()
    config.SCHEMA_ORG_ITEMTYPE = [
        'http://schema.org/WebPage',
        'http://schema.org/AboutPage',
    ]

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    validator = SchemaOrgItemTypeValidator(reviewer)

    violations_values = validator.get_default_violations_values(config)

    expect(violations_values).to_include('invalid.schema.itemtype')
    expect(violations_values['invalid.schema.itemtype']).to_length(2)
    expect(violations_values['invalid.schema.itemtype']).to_equal({
        'value': config.SCHEMA_ORG_ITEMTYPE,
        'description': config.get_description('SCHEMA_ORG_ITEMTYPE')
    })
def test_can_get_facts(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[]
    )

    content = '<html><body class="test"></body></html>'
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content)
    }
    reviewer.responses[page.url] = result
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    response = Mock(status_code=200, text=content, headers={})
    reviewer.content_loaded(page.url, response)

    facter = BodyFacter(reviewer)
    facter.add_fact = Mock()

    facter.get_facts()

    expect(facter.review.data).to_length(1)
    expect(facter.review.data).to_include('page.body')
    expect(facter.review.data['page.body'][0].tag).to_equal('body')
    expect(facter.add_fact.called).to_be_false()
def test_validate(self):
    page = PageFactory.create(url='http://globo.com/')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = RobotsValidator(reviewer)
    response = Mock(status_code=200, text='key:value')
    validator.review.data['robots.response'] = response
    validator.add_violation = Mock()

    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='robots.sitemap.not_found',
            value=None,
            points=100
        ))
    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='robots.disallow.not_found',
            value=None,
            points=100
        ))
def test_can_get_default_violations_values(self):
    config = Config()
    config.MAX_HEADING_HIEARARCHY_SIZE = 150

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    validator = HeadingHierarchyValidator(reviewer)

    violations_values = validator.get_default_violations_values(config)

    expect(violations_values).to_include('page.heading_hierarchy.size')
    expect(violations_values['page.heading_hierarchy.size']).to_length(2)
    expect(violations_values['page.heading_hierarchy.size']).to_equal({
        'value': config.MAX_HEADING_HIEARARCHY_SIZE,
        'description': config.get_description('MAX_HEADING_HIEARARCHY_SIZE')
    })
def test_can_validate_without_meta_tags(self):
    config = Config()

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    content = '<html></html>'
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content)
    }
    reviewer.responses[page.url] = result
    reviewer.get_response = Mock(return_value=result)

    validator = OpenGraphValidator(reviewer)
    validator.add_violation = Mock()

    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_can_validate_no_title_tag(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    content = '<html></html>'
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content)
    }
    reviewer.responses[page.url] = result
    reviewer.get_response = Mock(return_value=result)

    validator = TitleValidator(reviewer)
    validator.add_violation = Mock()

    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='page.title.not_found',
        value=page.url,
        points=50
    )
def test_can_validate_last_modified(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = LastModifiedValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {
        'page.last_modified': None
    }
    validator.review.facts = {
        'page.last_modified': None
    }

    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='page.last_modified.not_found',
        value=page.url,
        points=50
    )
def test_handle_url_loaded_with_empty_content(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[]
    )

    content = ''
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': content
    }
    reviewer.responses[page.url] = result
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    response = Mock(status_code=200, text=content, headers={})
    reviewer.content_loaded(page.url, response)

    facter = JSFacter(reviewer)
    facter.async_get = Mock()
    facter.get_facts()

    facter.handle_url_loaded(page.url, response)

    expect(facter.review.data).to_include('total.size.js')
    expect(facter.review.data['total.size.js']).to_equal(0)
    expect(facter.review.data).to_include('total.size.js.gzipped')
    expect(facter.review.data['total.size.js.gzipped']).to_equal(0)
def test_validate(self):
    config = Config()

    page = PageFactory.create(url='http://globo.com/1?item=test')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    content = '<html><head></head></html>'
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content)
    }
    reviewer.responses[page.url] = result
    reviewer.get_response = Mock(return_value=result)

    validator = LinkWithRelCanonicalValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {'page.head': [{}]}

    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(key='absent.meta.canonical', value=None, points=30))
def test_can_get_default_violations_values(self):
    config = Config()
    config.METATAG_DESCRIPTION_MAX_SIZE = 300

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    validator = MetaTagsValidator(reviewer)

    violations_values = validator.get_default_violations_values(config)

    expect(violations_values).to_include('page.metatags.description_too_big')
    expect(violations_values['page.metatags.description_too_big']).to_length(2)
    expect(violations_values['page.metatags.description_too_big']).to_be_like({
        'value': config.METATAG_DESCRIPTION_MAX_SIZE,
        'description': config.get_description('METATAG_DESCRIPTION_MAX_SIZE')
    })
def test_query_string_without_params(self):
    config = Config()
    config.FORCE_CANONICAL = False

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    content = '<html><head></head></html>'
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content)
    }
    reviewer.responses[page.url] = result
    reviewer.get_response = Mock(return_value=result)

    validator = LinkWithRelCanonicalValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {'page.head': [{}]}

    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_can_validate_page_with_metatag_description_too_long(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache
    )
    reviewer.violation_definitions = {
        'page.metatags.description_too_big': {'default_value': 300},
    }

    validator = MetaTagsValidator(reviewer)
    validator.add_violation = Mock()

    # One character over the 300-character limit adds the violation.
    validator.review.data['meta.tags'] = [
        {'content': 'X' * 301, 'property': 'name', 'key': 'description'},
    ]
    validator.validate()
    validator.add_violation.assert_called_once_with(
        key='page.metatags.description_too_big',
        value={'max_size': 300},
        points=20
    )

    # Exactly at the limit, no violation is added.
    validator.add_violation = Mock()
    validator.review.data['meta.tags'] = [
        {'content': 'X' * 300, 'property': 'name', 'key': 'description'},
    ]
    validator.validate()
    expect(validator.add_violation.called).to_be_false()
def test_get_robots_from_root_domain(self):
    page = PageFactory.create(url="http://www.globo.com")
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    facter = RobotsFacter(reviewer)
    facter.async_get = Mock()
    facter.add_fact = Mock()
    facter.get_facts()

    robots_url = 'http://www.globo.com/robots.txt'

    expect(facter.review.data).to_length(1)
    expect(facter.review.data['robots.response']).to_equal(None)

    facter.async_get.assert_called_once_with(
        robots_url, facter.handle_robots_loaded)

    response = Mock(status_code=200, text='', headers={})
    facter.handle_robots_loaded(robots_url, response)
    expect(facter.review.data['robots.response']).to_equal(response)

    expect(facter.add_fact.call_args_list).to_include(
        call(
            key='robots.url',
            value=robots_url,
        ))
def test_can_validate_css_requests_empty_html(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache
    )
    reviewer.violation_definitions = {
        'total.requests.css': {'default_value': 1},
        'total.size.css': {'default_value': 0.0},
    }

    result = {
        'url': page.url,
        'status': 200,
        'content': None,
        'html': None
    }
    reviewer.responses[page.url] = result
    reviewer.get_response = Mock(return_value=result)

    validator = CSSRequestsValidator(reviewer)
    validator.add_violation = Mock()

    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_add_violation_when_sitemap_has_links_that_not_need_to_be_encoded(
        self):
    page = PageFactory.create(url='http://globo.com')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = SitemapValidator(reviewer)
    validator.review.data['sitemap.files.size'] = {
        'http://g1.globo.com/sitemap.xml': 10
    }
    validator.review.data['sitemap.data'] = {
        'http://g1.globo.com/sitemap.xml': Mock(status_code=200, text='data')
    }
    validator.review.data['sitemap.files.urls'] = {
        'http://g1.globo.com/sitemap.xml': 20
    }
    validator.review.data['sitemap.urls'] = {
        'http://g1.globo.com/sitemap.xml': ['http://g1.globo.com/1.html']
    }

    validator.add_violation = Mock()

    validator.validate()

    expect(validator.add_violation.call_count).to_equal(0)
def test_can_validate_css_requests_on_globo_html(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache
    )
    reviewer.violation_definitions = {
        'total.requests.css': {'default_value': 1},
        'total.size.css': {'default_value': 0.0},
    }

    content = self.get_file('globo.html')
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content)
    }
    reviewer.responses[page.url] = result
    reviewer.get_response = Mock(return_value=result)

    validator = CSSRequestsValidator(reviewer)
    css = {
        'url': 'some_style.css',
        'status': 200,
        'content': '#id{display:none}',
        'html': None
    }
    validator.get_response = Mock(return_value=css)
    validator.add_violation = Mock()
    validator.review.data = {
        'total.requests.css': 7,
        'total.size.css.gzipped': 0.05
    }

    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='total.requests.css',
            value={'over_limit': 6, 'total_css_files': 7},
            points=30
        ))
    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='total.size.css',
            value=0.05,
            points=0
        ))
def test_add_violation_when_404(self):
    page = PageFactory.create(url='http://globo.com')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = SitemapValidator(reviewer)
    validator.review.data['sitemap.files.size'] = {
        'http://g1.globo.com/sitemap.xml': 10
    }
    validator.review.data['sitemap.data'] = {
        'http://g1.globo.com/sitemap.xml': Mock(status_code=404, text=None)
    }

    validator.add_violation = Mock()

    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='sitemap.not_found',
        value='http://g1.globo.com/sitemap.xml',
        points=100
    )
def test_can_load_url_with_empty_headers(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[]
    )

    content = '<html></html>'
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content),
    }
    reviewer.responses[page.url] = result
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    response = Mock(status_code=200, text=content, headers={})
    reviewer.content_loaded(page.url, response)

    facter = LastModifiedFacter(reviewer)
    facter.add_fact = Mock()

    facter.get_facts()

    expect(facter.review.data).to_length(0)
    expect(facter.review.data).to_be_like({})
    expect(facter.add_fact.called).to_be_false()
def _start_reviewer(self, job):
    if job:
        if count_url_levels(job['url']) > self.config.MAX_URL_LEVELS:
            self.info('Max URL levels! Details: %s' % job['url'])
            return

        self.debug('Starting Review for [%s]' % job['url'])
        reviewer = Reviewer(
            api_url=self.config.HOLMES_API_URL,
            page_uuid=job['page'],
            page_url=job['url'],
            page_score=0,
            config=self.config,
            validators=self.validators,
            facters=self.facters,
            search_provider=self.search_provider,
            async_get=self.async_get,
            wait=self.otto.wait,
            wait_timeout=0,  # max time to wait for all requests to finish
            db=self.db,
            cache=self.cache,
            publish=self.publish,
            girl=self.girl,
            fact_definitions=self.fact_definitions,
            violation_definitions=self.violation_definitions
        )

        reviewer.review()
def test_handle_url_loaded(self):
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[]
    )

    content = self.get_file('globo.html')
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content)
    }
    reviewer.responses[page.url] = result
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    response = Mock(status_code=200, text=content, headers={})
    reviewer.content_loaded(page.url, response)

    facter = LinkFacter(reviewer)
    facter.async_get = Mock()
    facter.get_facts()

    facter.handle_url_loaded(page.url, response)

    expect(facter.review.data).to_include('page.links')
    data = set([(page.url, response)])
    expect(facter.review.data['page.links']).to_equal(data)