Пример #1
0
    def test_raises_if_attribute_not_found(self):
        """Extracting an attribute the site does not define must raise.

        The site is built without an ``attributes`` argument, and the
        extractor is asked for ``'name'``; ``NoValueExtracted`` is expected.
        """
        bare_site = Site('http://example.org')
        name_extractor = SiteAttributeExtractor('name')
        info = ResourceInfo(site=bare_site)

        # Callable form of assertRaises: invokes extract_value(info) and
        # fails the test unless NoValueExtracted is raised.
        self.assertRaises(
            NoValueExtracted, name_extractor.extract_value, info)
Пример #2
0
    def test_retrieves_attribute_from_site(self):
        """The extractor returns the site's ``name`` attribute as unicode."""
        site = Site('http://example.org', attributes={'name': 'My Site'})
        extractor = SiteAttributeExtractor('name')
        resource_info = ResourceInfo(site=site)

        extracted_value = extractor.extract_value(resource_info)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(u'My Site', extracted_value)
        # Besides comparing equal, the result has to be a unicode object.
        self.assertIsInstance(extracted_value, unicode)
Пример #3
0
    def test_applies_site_config_extractors_to_site(self):
        """A field backed by a site attribute extractor yields that attribute.

        The engine is created with a single ``EXAMPLE`` field whose extractor
        reads the ``name`` attribute, and a resource info pointing at a site
        that defines ``name``.
        """
        field = Field('EXAMPLE', extractor=SiteAttributeExtractor('name'))
        site = Site('http://example.org', attributes={'name': 'My Site'})
        resource_info = ResourceInfo(site=site)
        engine = self._create_engine(fields=[field],
                                     resource_info=resource_info)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual({'EXAMPLE': u'My Site'},
                         engine.extract_field_values())
Пример #4
0
    def test_slack_can_format_json_from_data(self):
        """``generateAttdata`` serializes site URL, error type and message.

        The return value is parsed as JSON; its first element is expected to
        carry a ``fields`` list whose first three ``value`` entries are the
        site URL, the exception class name and the exception message.
        """
        ex = Exception('error message')
        site = Site('http://some-url.com/')

        data = json.loads(self.slacklogger.generateAttdata(ex, site))[0]
        fields = data['fields']

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual("http://some-url.com/", fields[0]['value'])
        self.assertEqual("Exception", fields[1]['value'])
        self.assertEqual("error message", fields[2]['value'])
Пример #5
0
    def setUp(self):
        """Build a ``Config`` from one site, one field and fixed settings.

        Every value handed to ``Config`` is also stored on the test case.
        """
        CrawlerTestCase.setUp(self)

        # Service endpoints and Slack settings.
        self.tika = 'http://localhost:9998'
        self.solr = 'http://localhost:8983/solr'
        self.slacktoken = 'token'
        self.slackchannel = '#channel'

        # Field names handed to Config.
        self.unique_field = 'UID'
        self.url_field = 'url'
        self.last_modified_field = 'modified'

        self.site = Site('http://example.org')
        self.field = Field('foo', extractor=Extractor())

        # Positional order is significant: it mirrors the original call.
        config_args = (
            [self.site],
            self.unique_field,
            self.url_field,
            self.last_modified_field,
            [self.field],
            self.tika,
            self.solr,
            self.slacktoken,
            self.slackchannel,
        )
        self.config = Config(*config_args)
Пример #6
0
    def test_supports_absolute_sitemap_index_urls(self, request):
        """Absolute sitemap URLs configured on a site are requested as given.

        ``request`` is the injected mock for the HTTP call. It only answers
        the two configured absolute URLs; any other URL raises ``KeyError``
        from the lookup below.
        """
        # Single source of truth for the URLs used by both the mock and the
        # site configuration.
        sitemap_urls = [
            'http://example.org/foo/bar/sitemap1.xml',
            'http://example.org/foo/bar/sitemap2.xml',
        ]
        responses = dict(
            (url, MockResponse(status_code=200, content=SITEMAP_INDEX))
            for url in sitemap_urls)
        request.side_effect = lambda url, **kwargs: responses[url]

        site = Site('http://example.org/foo/', sitemap_urls=sitemap_urls)
        sm_idx_fetcher = SitemapIndexFetcher(site)
        sitemap_index = sm_idx_fetcher.fetch()

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(2, len(sitemap_index.sitemaps))
Пример #7
0
    def test_sleeps_and_retries_when_too_many_requests(self, request):
        """Two 429 responses are retried, each with a logged warning.

        ``request`` is the injected mock for the HTTP call. It returns two
        429 responses followed by an HTML response; the fetcher is expected
        to log one warning per 429 and finally return the third response's
        headers on the resource info.
        """
        responses = [
            MockResponse(status_code=429),
            MockResponse(status_code=429),
            MockResponse(content='', headers={'Content-Type': 'text/html'}),
        ]
        # Each call consumes the next canned response in order.
        request.side_effect = lambda url, **kw: responses.pop(0)

        resource_info = ResourceInfo(url_info={'loc': 'http://example.org/'},
                                     site=Site('http://example.org/'))
        fetcher = self._create_fetcher(resource_info)

        # logging.disable() is process-wide; remember the current threshold
        # and restore it after the test so other tests are not affected.
        previous_disable_level = logging.root.manager.disable
        logging.disable(logging.INFO)
        self.addCleanup(logging.disable, previous_disable_level)

        with LogCapture() as log:
            resource_info = fetcher.fetch()

        log.check(('ftw.crawler.fetcher', 'WARNING',
                   u'429 Too Many Requests, sleeping for 0.1s'),
                  ('ftw.crawler.fetcher', 'WARNING',
                   u'429 Too Many Requests, sleeping for 0.2s'))

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual({'Content-Type': 'text/html'}, resource_info.headers)
Пример #8
0
    def setUp(self):
        """Prepare a config with a constant field and a field mapped from it.

        ``subcategory`` always extracts ``'travel'``; ``category`` uses a
        ``FieldMappingExtractor`` on ``subcategory`` with ``self.mapping``.
        """
        CrawlerTestCase.setUp(self)
        # TODO: Refactor this testcase
        example_site = Site('http://example.org')
        self.resource_info = ResourceInfo()
        self.mapping = {'travel': 'TRAVEL', 'music': 'MUSIC'}

        constant_travel = ConstantExtractor('travel')
        subcategory_field = Field('subcategory', extractor=constant_travel)

        mapped_from_subcategory = FieldMappingExtractor(
            'subcategory', self.mapping)
        category_field = Field('category', extractor=mapped_from_subcategory)

        config_kwargs = {
            'sites': [example_site],
            'tika': None,
            'solr': None,
            'unique_field': None,
            'url_field': None,
            'last_modified_field': None,
            # category is listed before the subcategory field it reads from.
            'fields': [category_field, subcategory_field],
        }
        self.config = Config(**config_kwargs)
Пример #9
0
 def test_site_stores_attributes(self):
     """Attributes passed to ``Site`` are exposed as ``site.attributes``."""
     url = 'http://example.org'
     attributes = {'name': 'My Site'}
     site = Site(url, attributes=attributes)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual({'name': 'My Site'}, site.attributes)
Пример #10
0
 def test_site_stores_url(self):
     """The URL passed to ``Site`` is exposed unchanged as ``site.url``."""
     url = 'http://example.org'
     site = Site(url)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(url, site.url)
Пример #11
0
 def test_site_requires_url(self):
     """Calling ``Site`` without arguments must raise ``TypeError``."""
     # Callable form of assertRaises: calls Site() with no arguments.
     self.assertRaises(TypeError, Site)
Пример #12
0
}

# Lookup table from object type name to its upper-case identifier.
OBJECT_TYPE_MAPPING = dict(
    ContentPage='CONTENT_PAGE',
    File='FILE',
)

# Default service settings; may be overridden via command line arguments.
TIKA_URL = 'http://localhost:9998/'
SOLR_URL = 'http://localhost:8983/solr'
SLACK_TOKEN = 'this-is-a-slack-token'
SLACK_CHANNEL = '#slack-channel'

CONFIG = Config(
    sites=[
        Site('https://www.sportamt-bern.ch/',
             attributes={'site_area': 'Sportamt Bern'}),
        Site('http://www.sitemapxml.co.uk/',
             attributes={'site_area': 'Sitemap XML'}),
        Site('http://www.pctipp.ch/', attributes={'site_area': 'PCtipp'}),
        Site('http://mailchimp.com', attributes={'site_area': 'MailChimp'}),
        Site('https://bgs.zg.ch', attributes={'site_area':
                                              'Gesetzessammlung'}),
    ],
    unique_field='UID',
    url_field='path_string',
    last_modified_field='modified',
    fields=[
        Field('allowedRolesAndUsers',
              extractor=ConstantExtractor(['Anonymous']),
              multivalued=True),
        Field('created', extractor=LastModifiedExtractor(), type_=datetime),
Пример #13
0
 def setUp(self):
     """Store an example site and a ``MockResponse`` built from ``SITEMAP``."""
     SitemapTestCase.setUp(self)
     site_url = 'http://example.org/'
     self.site = Site(site_url)
     self.response = MockResponse(SITEMAP)
Пример #14
0
 def setUp(self):
     """Run the parent ``setUp`` and store an example site on the test case."""
     super(SitemapTestCase, self).setUp()
     site_url = 'http://example.org/'
     self.site = Site(site_url)