def test_raises_if_attribute_not_found(self):
    # The site is created without any attributes, so asking the extractor
    # for 'name' is expected to raise NoValueExtracted.
    info = ResourceInfo(site=Site('http://example.org'))
    name_extractor = SiteAttributeExtractor('name')

    self.assertRaises(
        NoValueExtracted, name_extractor.extract_value, info)
def test_retrieves_attribute_from_site(self):
    # The extractor must hand back the site's 'name' attribute, and the
    # returned value must be a unicode string.
    site = Site('http://example.org', attributes={'name': 'My Site'})
    extractor = SiteAttributeExtractor('name')
    resource_info = ResourceInfo(site=site)

    extracted_value = extractor.extract_value(resource_info)

    self.assertEqual(u'My Site', extracted_value)
    self.assertIsInstance(extracted_value, unicode)
def test_applies_site_config_extractors_to_site(self):
    # A field backed by a SiteAttributeExtractor should end up with the
    # matching attribute of the site attached to the resource info.
    field = Field('EXAMPLE', extractor=SiteAttributeExtractor('name'))
    site = Site('http://example.org', attributes={'name': 'My Site'})
    resource_info = ResourceInfo(site=site)
    engine = self._create_engine(
        fields=[field], resource_info=resource_info)

    self.assertEqual(
        {'EXAMPLE': u'My Site'}, engine.extract_field_values())
def test_slack_can_format_json_from_data(self):
    # generateAttdata() must return a JSON string; the first element of
    # the decoded document is expected to carry, in its first three
    # fields, the site URL, the exception class name and the message.
    ex = Exception('error message')
    site = Site('http://some-url.com/')

    data = json.loads(self.slacklogger.generateAttdata(ex, site))[0]
    fields = data['fields']

    self.assertEqual("http://some-url.com/", fields[0]['value'])
    self.assertEqual("Exception", fields[1]['value'])
    self.assertEqual("error message", fields[2]['value'])
def setUp(self):
    # Prepare every individual value that goes into the Config fixture and
    # keep each one on the test case so tests can compare against them.
    CrawlerTestCase.setUp(self)

    # Dummy service endpoints and Slack settings.
    self.tika = 'http://localhost:9998'
    self.solr = 'http://localhost:8983/solr'
    self.slacktoken = 'token'
    self.slackchannel = '#channel'

    # Names of the special fields.
    self.unique_field = 'UID'
    self.url_field = 'url'
    self.last_modified_field = 'modified'

    # One site and one field are enough for this fixture.
    self.site = Site('http://example.org')
    self.field = Field('foo', extractor=Extractor())

    # Arguments are passed positionally, in the order Config expects.
    self.config = Config(
        [self.site],
        self.unique_field,
        self.url_field,
        self.last_modified_field,
        [self.field],
        self.tika,
        self.solr,
        self.slacktoken,
        self.slackchannel,
    )
def test_supports_absolute_sitemap_index_urls(self, request):
    # `request` is presumably a mock injected by a patch decorator that is
    # not part of this block; each configured sitemap URL is answered with
    # a canned sitemap index response.
    responses = {
        'http://example.org/foo/bar/sitemap1.xml': MockResponse(
            status_code=200, content=SITEMAP_INDEX),
        'http://example.org/foo/bar/sitemap2.xml': MockResponse(
            status_code=200, content=SITEMAP_INDEX),
    }
    request.side_effect = lambda url, **kwargs: responses[url]

    # The sitemap URLs are given in absolute form on the site.
    site = Site('http://example.org/foo/', sitemap_urls=[
        'http://example.org/foo/bar/sitemap1.xml',
        'http://example.org/foo/bar/sitemap2.xml',
    ])
    sm_idx_fetcher = SitemapIndexFetcher(site)
    sitemap_index = sm_idx_fetcher.fetch()

    self.assertEqual(2, len(sitemap_index.sitemaps))
def test_sleeps_and_retries_when_too_many_requests(self, request):
    # Two 429 responses are served before a successful one. The fetcher is
    # expected to log one warning per 429 (announcing 0.1s and then 0.2s)
    # and finally expose the headers of the successful response.
    # `request` is presumably a mock injected by a patch decorator that is
    # not part of this block.
    responses = [
        MockResponse(status_code=429),
        MockResponse(status_code=429),
        MockResponse(content='', headers={'Content-Type': 'text/html'}),
    ]
    # Every call consumes the next canned response, in order.
    request.side_effect = lambda url, **kw: responses.pop(0)

    resource_info = ResourceInfo(
        url_info={'loc': 'http://example.org/'},
        site=Site('http://example.org/'))
    fetcher = self._create_fetcher(resource_info)

    # logging.disable() alters process-wide state. Remember the level in
    # effect before this test and restore it afterwards, so the change
    # cannot leak into tests that run later.
    previous_disable_level = logging.root.manager.disable
    self.addCleanup(logging.disable, previous_disable_level)
    logging.disable(logging.INFO)

    with LogCapture() as log:
        resource_info = fetcher.fetch()

    log.check(
        ('ftw.crawler.fetcher', 'WARNING',
         u'429 Too Many Requests, sleeping for 0.1s'),
        ('ftw.crawler.fetcher', 'WARNING',
         u'429 Too Many Requests, sleeping for 0.2s'))
    self.assertEqual({'Content-Type': 'text/html'}, resource_info.headers)
def setUp(self):
    CrawlerTestCase.setUp(self)
    # TODO: Refactor this testcase

    self.resource_info = ResourceInfo()
    self.mapping = {'travel': 'TRAVEL', 'music': 'MUSIC'}

    # 'subcategory' always yields the constant 'travel'; 'category' is
    # presumably resolved by looking that value up in self.mapping.
    subcategory_field = Field(
        'subcategory', extractor=ConstantExtractor('travel'))
    category_field = Field(
        'category',
        extractor=FieldMappingExtractor('subcategory', self.mapping))

    # Only sites and fields are configured; every other setting is None.
    self.config = Config(
        sites=[Site('http://example.org')],
        fields=[category_field, subcategory_field],
        unique_field=None,
        url_field=None,
        last_modified_field=None,
        tika=None,
        solr=None,
    )
def test_site_stores_attributes(self):
    # Attributes handed to the constructor must be readable back from the
    # site's `attributes`.
    url = 'http://example.org'
    attributes = {'name': 'My Site'}
    site = Site(url, attributes=attributes)

    self.assertEqual({'name': 'My Site'}, site.attributes)
def test_site_stores_url(self):
    # The URL handed to the constructor must be readable back from the
    # site's `url`.
    url = 'http://example.org'
    site = Site(url)

    self.assertEqual(url, site.url)
def test_site_requires_url(self):
    # Calling Site with no arguments at all must raise a TypeError.
    self.assertRaises(TypeError, Site)
} OBJECT_TYPE_MAPPING = { 'ContentPage': 'CONTENT_PAGE', 'File': 'FILE', } # May be overriden via command line arguments TIKA_URL = 'http://localhost:9998/' SOLR_URL = 'http://localhost:8983/solr' SLACK_TOKEN = 'this-is-a-slack-token' SLACK_CHANNEL = '#slack-channel' CONFIG = Config( sites=[ Site('https://www.sportamt-bern.ch/', attributes={'site_area': 'Sportamt Bern'}), Site('http://www.sitemapxml.co.uk/', attributes={'site_area': 'Sitemap XML'}), Site('http://www.pctipp.ch/', attributes={'site_area': 'PCtipp'}), Site('http://mailchimp.com', attributes={'site_area': 'MailChimp'}), Site('https://bgs.zg.ch', attributes={'site_area': 'Gesetzessammlung'}), ], unique_field='UID', url_field='path_string', last_modified_field='modified', fields=[ Field('allowedRolesAndUsers', extractor=ConstantExtractor(['Anonymous']), multivalued=True), Field('created', extractor=LastModifiedExtractor(), type_=datetime),
def setUp(self):
    # Run the SitemapTestCase set-up first, then provide the site and a
    # canned response built from SITEMAP.
    SitemapTestCase.setUp(self)

    site_url = 'http://example.org/'
    self.site = Site(site_url)

    sitemap_response = MockResponse(SITEMAP)
    self.response = sitemap_response
def setUp(self):
    # Let the parent class prepare its fixture, then provide the site.
    super(SitemapTestCase, self).setUp()

    site_url = 'http://example.org/'
    self.site = Site(site_url)