def test_returns_constant_value(self):
    """A bound ConstantExtractor returns the constant it was built with."""
    extractor = ConstantExtractor(42)
    field = Field('example', extractor)
    extractor.bind(field)
    resource_info = ResourceInfo()

    self.assertEqual(42, extractor.extract_value(resource_info))
def test_returns_unicode_for_string_constant(self):
    """A string constant is extracted as a ``unicode`` object.

    The extractor is built with the literal ``'foo'``; the extracted
    value must compare equal to ``u'foo'`` and be a ``unicode`` instance.
    """
    extractor = ConstantExtractor('foo')
    field = Field('example', extractor)
    extractor.bind(field)
    resource_info = ResourceInfo()

    extracted_value = extractor.extract_value(resource_info)

    self.assertEqual(u'foo', extracted_value)
    # Equality alone is not enough: the type itself must be unicode.
    self.assertIsInstance(extracted_value, unicode)
def test_returns_unicode_for_multivalued_string_constant(self):
    """Every item of a multivalued string constant is ``unicode``.

    The extractor is built with ``['foo', 'bar']`` and bound to a
    multivalued field; the extracted list must equal
    ``[u'foo', u'bar']`` and each element must be a ``unicode`` instance.
    """
    extractor = ConstantExtractor(['foo', 'bar'])
    field = Field('example', extractor, multivalued=True)
    extractor.bind(field)
    resource_info = ResourceInfo()

    extracted_value = extractor.extract_value(resource_info)

    self.assertEqual([u'foo', u'bar'], extracted_value)
    # Check the type of each element, not just the list's equality.
    for item in extracted_value:
        self.assertIsInstance(item, unicode)
def test_raises_if_no_default_and_field_value_not_mapped(self):
    """Extraction raises NoValueExtracted for the constant ``'physics'``.

    The ``subcategory`` field of the fixture config is switched to a
    constant ``'physics'`` extractor, and no default is set on the
    ``category`` extractor by this test.
    """
    category_field = self.config.get_field('category')
    subcategory_field = self.config.get_field('subcategory')

    # Replace the subcategory extractor with a constant 'physics' and bind it.
    subcategory_field.extractor = ConstantExtractor('physics')
    subcategory_field.extractor.bind(subcategory_field)

    with self.assertRaises(NoValueExtracted):
        category_field.extractor.extract_value(self.resource_info)
def test_raises_if_no_default_and_field_doesnt_return_value(self):
    """Extraction raises NoValueExtracted when the source constant is None.

    The ``subcategory`` field of the fixture config is switched to a
    ``ConstantExtractor(None)``, and no default is set on the
    ``category`` extractor by this test.
    """
    category_field = self.config.get_field('category')
    subcategory_field = self.config.get_field('subcategory')

    # Replace the subcategory extractor with a constant None and bind it.
    subcategory_field.extractor = ConstantExtractor(None)
    subcategory_field.extractor.bind(subcategory_field)

    with self.assertRaises(NoValueExtracted):
        category_field.extractor.extract_value(self.resource_info)
def test_asserts_proper_type_for_multivalued_extractors(self):
    """A multivalued ``int`` field holding ``[42]`` is extracted as-is.

    The engine is created with a single multivalued field declared as
    ``type_=int``; ``extract_field_values()`` must return
    ``{'int_field': [42]}``.
    """
    extractor = ConstantExtractor([42])
    field = Field('int_field', extractor=extractor, type_=int,
                  multivalued=True)
    engine = self._create_engine(fields=[field])

    self.assertEqual({'int_field': [42]}, engine.extract_field_values())
def test_asserts_proper_type_for_extractors(self):
    """Extracting a string constant for an ``int`` field raises ExtractionError.

    The field is declared with ``type_=int`` while its extractor is a
    constant ``'foo'``.
    """
    string_extractor = ConstantExtractor('foo')
    int_field = Field('int_field', extractor=string_extractor, type_=int)
    engine = self._create_engine(fields=[int_field])

    with self.assertRaises(ExtractionError):
        engine.extract_field_values()
def test_uses_default_if_field_value_not_mapped(self):
    """The configured default is returned, as ``unicode``.

    The ``category`` extractor gets ``default = 'DEFAULT'`` and the
    ``subcategory`` field is switched to a constant ``'physics'``
    extractor. The extracted category must equal ``u'DEFAULT'`` and be a
    ``unicode`` instance.
    """
    category = self.config.get_field('category')
    category.extractor.default = 'DEFAULT'

    # Replace the subcategory extractor with a constant 'physics' and bind it.
    subcategory = self.config.get_field('subcategory')
    subcategory.extractor = ConstantExtractor('physics')
    subcategory.extractor.bind(subcategory)

    extracted_value = category.extractor.extract_value(self.resource_info)

    self.assertEqual(u'DEFAULT', extracted_value)
    self.assertIsInstance(extracted_value, unicode)
def setUp(self):
    """Build the fixture: a resource info, a mapping and a two-field config.

    Sets ``self.resource_info``, ``self.mapping`` and ``self.config``.
    The config has one site and two fields: ``subcategory`` (constant
    ``'travel'``) and ``category`` (a FieldMappingExtractor over
    ``'subcategory'`` using ``self.mapping``).
    """
    CrawlerTestCase.setUp(self)

    # TODO: Refactor this testcase
    example_site = Site('http://example.org')
    self.resource_info = ResourceInfo()
    self.mapping = {'travel': 'TRAVEL', 'music': 'MUSIC'}

    travel_extractor = ConstantExtractor('travel')
    subcategory_field = Field('subcategory', extractor=travel_extractor)

    mapping_extractor = FieldMappingExtractor('subcategory', self.mapping)
    category_field = Field('category', extractor=mapping_extractor)

    self.config = Config(
        sites=[example_site],
        tika=None,
        solr=None,
        unique_field=None,
        url_field=None,
        last_modified_field=None,
        fields=[category_field, subcategory_field],
    )
sites=[ Site('https://www.sportamt-bern.ch/', attributes={'site_area': 'Sportamt Bern'}), Site('http://www.sitemapxml.co.uk/', attributes={'site_area': 'Sitemap XML'}), Site('http://www.pctipp.ch/', attributes={'site_area': 'PCtipp'}), Site('http://mailchimp.com', attributes={'site_area': 'MailChimp'}), Site('https://bgs.zg.ch', attributes={'site_area': 'Gesetzessammlung'}), ], unique_field='UID', url_field='path_string', last_modified_field='modified', fields=[ Field('allowedRolesAndUsers', extractor=ConstantExtractor(['Anonymous']), multivalued=True), Field('created', extractor=LastModifiedExtractor(), type_=datetime), Field('Creator', extractor=CreatorExtractor()), Field('Description', extractor=DescriptionExtractor()), Field('effective', extractor=IndexingTimeExtractor(), type_=datetime), Field('expires', extractor=ConstantExtractor(datetime(2050, 12, 31)), type_=datetime), Field('getId', extractor=SlugExtractor()), Field('getRemoteUrl', extractor=TargetURLExtractor()), Field('modified', extractor=LastModifiedExtractor(), type_=datetime), Field('object_type', extractor=FieldMappingExtractor('portal_type', OBJECT_TYPE_MAPPING, default='File')),