예제 #1
0
def apply_extractors(descriptor, template_extractors, all_extractors):
    """Attach the extractors referenced by a template to a descriptor's fields.

    For each id in ``template_extractors`` the matching definition is looked
    up in ``all_extractors``.  If the definition's ``"field_name"`` is not yet
    present in ``descriptor.attribute_map``, a new ``SlybotFieldDescriptor``
    using the ``"text"`` type processor is registered under that name.  The
    field's ``extractor`` attribute is then set to either a regex extractor
    (when the definition has a ``"regular_expression"`` key) or the attribute
    of ``ExtractorTypes`` named by ``"builtin_extractor"``.

    Args:
        descriptor: object exposing a mutable ``attribute_map`` mapping of
            field name -> field descriptor; it is modified in place.
        template_extractors: iterable of extractor ids; a falsy value
            (``None``, empty) means there is nothing to apply.
        all_extractors: mapping of extractor id -> extractor definition dict.

    Raises:
        KeyError: if an id is missing from ``all_extractors``, or a definition
            lacks ``"field_name"`` or (without a regular expression)
            ``"builtin_extractor"``.
    """
    type_manager = FieldTypeManager()
    for extractor_id in template_extractors or ():
        spec = all_extractors[extractor_id]
        name = spec["field_name"]
        fields = descriptor.attribute_map

        if name not in fields:
            # Unknown field: register it as a plain text field first.
            fields[name] = SlybotFieldDescriptor(
                name, name, type_manager.type_processor_class("text")
            )

        # A regular expression takes precedence over a builtin extractor.
        if "regular_expression" in spec:
            chosen = create_regex_extractor(spec["regular_expression"])
        else:
            chosen = getattr(ExtractorTypes, spec["builtin_extractor"])
        fields[name].extractor = chosen
예제 #2
0
 def test_regex_extractor(self):
     """Check a two-group regex extractor and its text-processor round trip.

     The pattern captures an integer part and a fractional part that are
     separated by markup in the input; the test expects the extractor to
     return the two captured groups joined together (``45.50``), and the
     text field processor's ``adapt`` to leave that value unchanged.
     """
     # Raw string: "\d" and "\." are invalid escape sequences in a normal
     # literal and trigger Deprecation/SyntaxWarning on modern Python.
     extractor = create_regex_extractor(r"(\d+).*(\.\d+)")
     extracted = extractor(u"The price of this product is <div>45</div> </div class='small'>.50</div> pounds")
     self.assertEqual(extracted, u"45.50")
     processor = TextFieldTypeProcessor()
     self.assertEqual(processor.adapt(extracted, None), u"45.50")