def apply_extractors(descriptor, template_extractors, all_extractors):
    """Attach the extractors selected by a template to a descriptor's fields.

    Each id in ``template_extractors`` is looked up in ``all_extractors`` to
    obtain an extractor definition. The definition's ``"field_name"`` selects
    the entry of ``descriptor.attribute_map`` to update; when no entry exists
    yet, one is created with the ``"text"`` type processor. The entry's
    ``extractor`` attribute is then set to a regex extractor when the
    definition has a ``"regular_expression"`` key, and otherwise to the
    attribute of ``ExtractorTypes`` named by ``"builtin_extractor"``.

    Args:
        descriptor: Object exposing a mutable ``attribute_map`` keyed by
            field name; it is modified in place.
        template_extractors: Iterable of extractor ids; a falsy value
            (``None``, empty) means there is nothing to apply.
        all_extractors: Container subscripted by extractor id to obtain the
            extractor definitions.
    """
    type_manager = FieldTypeManager()
    if not template_extractors:
        return

    for extractor_id in template_extractors:
        spec = all_extractors[extractor_id]
        name = spec["field_name"]

        # Fields the descriptor does not know yet default to the "text" type.
        if name not in descriptor.attribute_map:
            text_processor = type_manager.type_processor_class("text")
            descriptor.attribute_map[name] = SlybotFieldDescriptor(
                name, name, text_processor
            )

        # A regular expression takes precedence over a builtin extractor.
        if "regular_expression" in spec:
            extractor = create_regex_extractor(spec["regular_expression"])
        else:
            extractor = getattr(ExtractorTypes, spec["builtin_extractor"])
        descriptor.attribute_map[name].extractor = extractor
def test_regex_extractor(self):
    """Check that a regex extractor yields the expected text for a price.

    The pattern has two capture groups (integer part and decimal part); the
    test expects the extractor to return them joined as ``u"45.50"`` and the
    text field processor's ``adapt`` to leave that value unchanged.
    """
    # Raw string: "\d" and "\." are not valid string escapes, so a plain
    # literal triggers an invalid-escape warning on modern Python. The
    # resulting pattern text is identical.
    extractor = create_regex_extractor(r"(\d+).*(\.\d+)")
    extracted = extractor(
        u"The price of this product is <div>45</div> </div class='small'>.50</div> pounds"
    )
    self.assertEqual(extracted, u"45.50")

    processor = TextFieldTypeProcessor()
    self.assertEqual(processor.adapt(extracted, None), u"45.50")