示例#1
0
    def test_handles_non_ascii_content(self):
        # The title and text may arrive as unicode, as UTF-8 encoded byte
        # strings, or as a mix of both. In every case the test expects the
        # unicode snippet u'graben' for title "Baeren" / text "Baerengraben".
        extractor = SnippetTextExtractor()

        def assert_snippet_is_graben(title, text):
            # Shared arrange/act/assert; each call below stays on its own
            # line so a failure traceback identifies the offending case.
            resource_info = ResourceInfo(metadata={'title': title},
                                         text=text,
                                         headers={})
            extracted_value = extractor.extract_value(resource_info)

            self.assertEqual(u'graben', extracted_value)
            self.assertIsInstance(extracted_value, unicode)

        # Both text and title unicode
        assert_snippet_is_graben(u'B\xe4ren', u'B\xe4rengraben')

        # Both text and title utf-8
        assert_snippet_is_graben('B\xc3\xa4ren', 'B\xc3\xa4rengraben')

        # Mix of unicode and utf-8
        assert_snippet_is_graben(u'B\xe4ren', 'B\xc3\xa4rengraben')
示例#2
0
    def test_uid_is_different_for_different_urls(self):
        # Two resources whose 'loc' differs must not yield the same UID.
        extractor = UIDExtractor()

        uid_for_root = extractor.extract_value(
            ResourceInfo(url_info={'loc': 'http://example.org'}))
        uid_for_foo = extractor.extract_value(
            ResourceInfo(url_info={'loc': 'http://example.org/foo'}))

        self.assertNotEqual(uid_for_root, uid_for_foo)
示例#3
0
    def test_maps_header_to_value(self):
        # Each content-type header value present in the mapping is expected
        # to be translated to its mapped value and returned as unicode.
        mapping = {'text/html': 'HTML', 'image/png': 'IMAGE'}
        extractor = HeaderMappingExtractor('content-type', mapping)

        html_info = ResourceInfo(headers={'content-type': 'text/html'})
        html_value = extractor.extract_value(html_info)
        self.assertEqual('HTML', html_value)
        self.assertIsInstance(html_value, unicode)

        png_info = ResourceInfo(headers={'content-type': 'image/png'})
        png_value = extractor.extract_value(png_info)
        self.assertEqual('IMAGE', png_value)
        self.assertIsInstance(png_value, unicode)
示例#4
0
    def test_returns_given_text(self):
        # The text stored on the resource info is expected back unchanged,
        # as a unicode object.
        extractor = PlainTextExtractor()
        extracted_value = extractor.extract_value(
            ResourceInfo(text=u'foobar'))

        self.assertEqual(u'foobar', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#5
0
    def test_sets_text_from_converter_on_resource_info(self):
        # After the engine is created, resource_info.text is expected to
        # hold whatever the converter's extract_text() returned.
        converter = MagicMock()
        converter.extract_text = MagicMock(return_value=u'foo bar')
        resource_info = ResourceInfo()

        self._create_engine(resource_info=resource_info, converter=converter)

        self.assertEqual(u'foo bar', resource_info.text)
示例#6
0
    def test_set_metadata_from_converter_on_resource_info(self):
        # After the engine is created, resource_info.metadata is expected to
        # hold whatever the converter's extract_metadata() returned.
        converter = MagicMock()
        converter.extract_metadata = MagicMock(return_value={'foo': 'bar'})
        resource_info = ResourceInfo()

        self._create_engine(resource_info=resource_info, converter=converter)

        self.assertEqual({'foo': 'bar'}, resource_info.metadata)
示例#7
0
    def test_raises_if_redirect(self, request):
        # `request` is handed in by the caller (presumably a patch decorator
        # outside this view -- confirm). It is stubbed to answer with a 301
        # response flagged as a redirect, which fetch() must reject.
        redirect_response = MockResponse(status_code=301, is_redirect=True)
        request.return_value = redirect_response

        fetcher = self._create_fetcher(
            ResourceInfo(url_info={'loc': 'http://example.org/'}))

        with self.assertRaises(AttemptedRedirect):
            fetcher.fetch()
示例#8
0
    def test_extracts_title_from_metadata(self):
        # With a 'title' entry in the metadata and no headers, that title is
        # expected back as a unicode object.
        extractor = TitleExtractor()
        extracted_value = extractor.extract_value(
            ResourceInfo(metadata={'title': u'value'}, headers={}))

        self.assertEqual(u'value', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#9
0
 def test_falls_back_to_http_last_modified(self):
     # url_info carries no 'lastmod' here, so the value is expected to come
     # from the HTTP 'last-modified' header instead.
     extractor = LastModifiedExtractor()
     resource_info = ResourceInfo(
         url_info={},
         headers={'last-modified': 'Wed, 31 Dec 2014 15:45:30 GMT'})

     expected = to_utc(datetime(2014, 12, 31, 15, 45, 30))
     self.assertEqual(expected, extractor.extract_value(resource_info))
示例#10
0
    def test_extracts_creator(self):
        # The 'creator' metadata entry (a byte string here) is expected back
        # as an equal unicode value.
        extractor = CreatorExtractor()
        extracted_value = extractor.extract_value(
            ResourceInfo(metadata={'creator': 'John Doe'}))

        self.assertEqual(u'John Doe', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#11
0
    def test_returns_constant_value(self):
        # An extractor built around the constant 42 and bound to a field is
        # expected to return 42 for an empty resource info.
        extractor = ConstantExtractor(42)
        field = Field('example', extractor)
        extractor.bind(field)

        extracted_value = extractor.extract_value(ResourceInfo())

        self.assertEqual(42, extracted_value)
示例#12
0
    def test_uses_default_if_header_not_mapped(self):
        # The 'pragma' header is present, but the (empty) mapping has no
        # entry for its value, so the configured default is expected.
        extractor = HeaderMappingExtractor('pragma', {}, default='DEFAULT')
        extracted_value = extractor.extract_value(
            ResourceInfo(headers={'pragma': 'no-cache'}))

        self.assertEqual('DEFAULT', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#13
0
    def test_extracts_url_from_urlinfo(self):
        # The 'loc' entry of url_info is expected back as a unicode URL.
        extractor = URLExtractor()
        extracted_value = extractor.extract_value(
            ResourceInfo(url_info={'loc': 'http://example.org'}))

        self.assertEqual(u'http://example.org', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#14
0
    def test_defaults_to_loc_if_no_target_given(self):
        # url_info only carries 'loc' (no explicit target), so the 'loc'
        # value is expected back as a unicode URL.
        extractor = TargetURLExtractor()
        extracted_value = extractor.extract_value(
            ResourceInfo(url_info={'loc': 'http://example.org'}))

        self.assertEqual(u'http://example.org', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#15
0
    def test_raises_if_attribute_not_found(self):
        # Asking for the 'name' attribute of a site created with only a URL
        # is expected to raise NoValueExtracted.
        extractor = SiteAttributeExtractor('name')
        resource_info = ResourceInfo(site=Site('http://example.org'))

        with self.assertRaises(NoValueExtracted):
            extractor.extract_value(resource_info)
示例#16
0
    def test_defaults_to_index_html_for_empty_basename(self):
        # A URL whose path is just '/' is expected to produce the slug
        # u'index-html'.
        extractor = SlugExtractor()
        extracted_value = extractor.extract_value(
            ResourceInfo(url_info={'loc': 'http://example.org/'}))

        self.assertEqual(u'index-html', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#17
0
    def test_extracts_description(self):
        # The 'description' metadata entry (a byte string here) is expected
        # back as an equal unicode value.
        extractor = DescriptionExtractor()
        extracted_value = extractor.extract_value(
            ResourceInfo(metadata={'description': 'value'}))

        self.assertEqual(u'value', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#18
0
    def test_raises_if_not_200_ok(self, request):
        # `request` is handed in by the caller (presumably a patch decorator
        # outside this view -- confirm). It is stubbed to answer with a 404,
        # which fetch() must turn into a FetchingError.
        not_found_response = MockResponse(status_code=404)
        request.return_value = not_found_response

        fetcher = self._create_fetcher(
            ResourceInfo(url_info={'loc': 'http://example.org/'}))

        with self.assertRaises(FetchingError):
            fetcher.fetch()
示例#19
0
    def test_lastmod_from_urlinfo(self):
        # The 'lastmod' entry is given with a +01:00 offset (16:45:30); the
        # extracted value is expected to equal to_utc() of 15:45:30.
        extractor = LastModifiedExtractor()
        resource_info = ResourceInfo(
            url_info={'lastmod': '2014-12-31T16:45:30+01:00'})

        expected = to_utc(datetime(2014, 12, 31, 15, 45, 30))
        self.assertEqual(expected, extractor.extract_value(resource_info))
示例#20
0
    def test_applies_urlinfo_extractors_to_urlinfo(self):
        # With a single field backed by ExampleURLInfoExtractor, the engine
        # is expected to report the url_info 'loc' under the field's name.
        field = Field('EXAMPLE', extractor=ExampleURLInfoExtractor())
        resource_info = ResourceInfo(url_info={'loc': 'http://example.org'})
        engine = self._create_engine(resource_info=resource_info,
                                     fields=[field])

        field_values = engine.extract_field_values()

        self.assertEqual({'EXAMPLE': u'http://example.org'}, field_values)
示例#21
0
    def test_deals_with_non_ascii_characters_unicode(self):
        # A unicode URL whose last path segment contains a-umlaut is expected
        # to produce the ASCII-only slug u'barengraben'.
        extractor = SlugExtractor()
        extracted_value = extractor.extract_value(ResourceInfo(
            url_info={'loc': u'http://example.org/b\xe4rengraben'}))

        self.assertEqual(u'barengraben', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#22
0
    def test_deals_with_url_encoding(self):
        # A percent-encoded last path segment is expected to produce the
        # slug u'foo-bar'.
        # NOTE(review): the URL below contains '%%20' (doubled percent sign)
        # rather than '%20' -- possibly an escaping artifact; confirm that
        # this is the intended input.
        extractor = SlugExtractor()
        extracted_value = extractor.extract_value(ResourceInfo(
            url_info={'loc': 'http://example.org/foo%%20bar'}))

        self.assertEqual(u'foo-bar', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#23
0
    def test_deals_with_trailing_slash(self):
        # A trailing slash after the last path segment must not affect the
        # slug: '/foo/bar/' is expected to yield u'bar'.
        extractor = SlugExtractor()
        extracted_value = extractor.extract_value(ResourceInfo(
            url_info={'loc': 'http://example.org/foo/bar/'}))

        self.assertEqual(u'bar', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#24
0
    def test_builds_uid_based_on_url(self):
        # Pins the exact unicode UID expected for 'http://example.org'.
        extractor = UIDExtractor()
        extracted_value = extractor.extract_value(
            ResourceInfo(url_info={'loc': 'http://example.org'}))

        expected_uid = u'dab521de-65f9-250b-4cca-7383feef67dc'
        self.assertEqual(expected_uid, extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#25
0
    def test_uses_default_if_header_not_found(self):
        # The resource has no headers at all, so the configured default is
        # expected, as a unicode object.
        extractor = HeaderMappingExtractor('content-type', {},
                                           default='DEFAULT')
        extracted_value = extractor.extract_value(ResourceInfo(headers={}))

        self.assertEqual('DEFAULT', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#26
0
    def test_applies_http_header_extractors_to_headers(self):
        # With a single field backed by an HTTP header extractor, the engine
        # is expected to report that header's value under the field's name.
        field = Field('EXAMPLE',
                      extractor=ExampleHTTPHeaderExtractor('example-header'))
        resource_info = ResourceInfo(headers={'example-header': 'value'})
        engine = self._create_engine(fields=[field],
                                     resource_info=resource_info)

        field_values = engine.extract_field_values()

        self.assertEqual({'EXAMPLE': u'value'}, field_values)
示例#27
0
    def test_equals_basename_for_simple_urls(self):
        # For a plain path the slug is expected to be the last path segment:
        # '/foo/bar' yields u'bar'.
        extractor = SlugExtractor()
        extracted_value = extractor.extract_value(ResourceInfo(
            url_info={'loc': 'http://example.org/foo/bar'}))

        self.assertEqual(u'bar', extracted_value)
        self.assertIsInstance(extracted_value, unicode)
示例#28
0
 def _create_resource(self, asset_name):
     # Build a ResourceInfo whose filename points at `asset_name` inside
     # the 'ftw.crawler.tests.assets' package, with empty metadata and
     # headers and a 'text/html' content type.
     # NOTE(review): the 'loc' value 'http//example.org' has no colon
     # after the scheme -- confirm whether that is intentional.
     asset_path = resource_filename('ftw.crawler.tests.assets', asset_name)
     return ResourceInfo(metadata={},
                         url_info={'loc': 'http//example.org'},
                         headers={},
                         filename=asset_path,
                         content_type='text/html')
示例#29
0
    def test_extracts_whitespace_separated_keywords(self):
        # Keywords separated by one or more spaces are expected back as a
        # list of unicode strings, with runs of whitespace collapsed.
        extractor = KeywordsExtractor()
        extracted_value = extractor.extract_value(
            ResourceInfo(metadata={'keywords': u'Foo Bar     Baz'}))

        self.assertEqual([u'Foo', u'Bar', u'Baz'], extracted_value)
        for keyword in extracted_value:
            self.assertIsInstance(keyword, unicode)
示例#30
0
    def test_returns_http_headers(self, request):
        # `request` is handed in by the caller (presumably a patch decorator
        # outside this view -- confirm). The stubbed response's headers are
        # expected to appear on the resource info returned by fetch().
        response_headers = {'Content-Type': 'text/html'}
        request.return_value = MockResponse(
            content='', headers={'Content-Type': 'text/html'})
        resource_info = ResourceInfo(url_info={'loc': 'http://example.org/'})
        fetcher = self._create_fetcher(resource_info=resource_info)

        fetched_info = fetcher.fetch()

        self.assertEqual(response_headers, fetched_info.headers)