Exemplo n.º 1
0
def apply_extractors(descriptor, template_extractors, extractors):
    """Attach template-defined extractors to the fields of *descriptor*.

    Args:
        descriptor: object exposing an ``attribute_map`` dict mapping
            field name -> SlybotFieldDescriptor.
        template_extractors: mapping of field name -> list of extractor ids.
        extractors: mapping of extractor id -> extractor definition dict.
            A definition carries either a ``"regular_expression"`` key
            (queued into a pipeline) or a ``"type_extractor"`` key
            (replaces the field's descriptor).

    Mutates ``descriptor.attribute_map`` in place; returns None.
    """
    field_type_manager = FieldTypeManager()

    for field_name, field_extractors in template_extractors.items():
        equeue = []
        for eid in field_extractors:
            # Unknown extractor ids are skipped (consistent with the other
            # apply_extractors implementations) instead of raising KeyError.
            extractor_doc = extractors.get(eid, {})
            if "regular_expression" in extractor_doc:
                equeue.append(
                    create_regex_extractor(
                        extractor_doc["regular_expression"]))
            elif "type_extractor" in extractor_doc:  # overrides default one
                descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                    field_name, field_name,
                    field_type_manager.type_processor_class(
                        extractor_doc["type_extractor"])())
        if field_name not in descriptor.attribute_map:
            # If no type extractor was defined, use the "text" type by
            # default, as it is by far the most commonly used.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")())

        if equeue:
            # Run the field's existing extractor first, then the regexes.
            equeue.insert(0, descriptor.attribute_map[field_name].extractor)
            descriptor.attribute_map[field_name].extractor = PipelineExtractor(
                *equeue)
Exemplo n.º 2
0
 def _load_extractors(self, field, schema, modifiers):
     field, _meta = self._field, self._meta
     extractors = []
     try:
         field_extraction = schema.attribute_map.get(field)
     except AttributeError:
         field_extraction = None
     if field_extraction is None:
         field_extraction = SlybotFieldDescriptor(field, field,
                                                  _DEFAULT_EXTRACTOR)
     if u'pre_text' in _meta or u'post_text' in _meta:
         text_extractor = TextRegionDataExtractor(
             _meta.get(u'pre_text', u''),
             _meta.get(u'post_text', u''))
         field_extraction = copy.deepcopy(field_extraction)
         field_extraction.extractor = _compose(
             field_extraction.extractor, text_extractor.extract)
     extractors = _meta.get(u'extractors', [])
     if isinstance(extractors, dict):
         extractors = extractors.get(field, [])
     adaptors = []
     for extractor in extractors:
         if extractor in modifiers:
             adaptors.append(modifiers[extractor])
     return field_extraction, adaptors
Exemplo n.º 3
0
def apply_extractors(descriptor, template_extractors, extractors):
    """Install regex and type extractors from a template onto *descriptor*.

    ``template_extractors`` maps field names to lists of extractor ids
    (a dict or an iterable of pairs); ``extractors`` maps each id to its
    definition.  ``descriptor.attribute_map`` is updated in place.
    """
    make_processor = FieldTypeManager().type_processor_class
    pairs = (template_extractors.items()
             if isinstance(template_extractors, dict)
             else template_extractors)
    fields = descriptor.attribute_map
    for name, extractor_ids in pairs:
        regex_queue = []
        for extractor_id in extractor_ids:
            spec = extractors.get(extractor_id, {})
            if "regular_expression" in spec:
                regex_queue.append(
                    create_regex_extractor(spec["regular_expression"]))
            elif "type_extractor" in spec:  # overrides default one
                try:
                    label = fields[name].description
                except KeyError:
                    label = name
                fields[name] = SlybotFieldDescriptor(
                    name, label, make_processor(spec["type_extractor"])())
        if name not in fields:
            # No type extractor defined: fall back to the ubiquitous
            # "text" type.
            fields[name] = SlybotFieldDescriptor(
                name, name, make_processor("text")())

        if regex_queue:
            # Existing extractor runs first, regexes after it.
            regex_queue.insert(0, fields[name].extractor)
            fields[name].extractor = PipelineExtractor(*regex_queue)
Exemplo n.º 4
0
def apply_extractors(descriptor, template_extractors, extractors):
    """Apply template-defined extractors to the fields of *descriptor*.

    Accepts ``template_extractors`` as a dict (field -> extractor ids) or
    an iterable of (field, extractor ids) pairs.  Regex extractors are
    chained into a pipeline after the field's current extractor; a type
    extractor replaces the field's descriptor outright.
    """
    processor_for = FieldTypeManager().type_processor_class
    if isinstance(template_extractors, dict):
        template_extractors = template_extractors.items()
    amap = descriptor.attribute_map
    for field_name, extractor_ids in template_extractors:
        pending = []
        for extractor_id in extractor_ids:
            doc = extractors.get(extractor_id, {})
            if "regular_expression" in doc:
                pending.append(
                    create_regex_extractor(doc["regular_expression"]))
            elif "type_extractor" in doc:  # overrides default one
                try:
                    shown = amap[field_name].description
                except KeyError:
                    shown = field_name
                amap[field_name] = SlybotFieldDescriptor(
                    field_name, shown,
                    processor_for(doc["type_extractor"])())
        if field_name not in amap:
            # Default to the "text" type when no type extractor was
            # defined — it is by far the most commonly used.
            amap[field_name] = SlybotFieldDescriptor(
                field_name, field_name, processor_for("text")())

        if pending:
            pipeline = [amap[field_name].extractor] + pending
            amap[field_name].extractor = PipelineExtractor(*pipeline)
Exemplo n.º 5
0
 def _process_fields(self, annotations, regions, htmlpage):
     """Extract values for each annotation in *annotations*.

     Yields ``(descriptor, values)`` pairs, where ``descriptor`` is a
     SlybotFieldDescriptor (or, for legacy 'variants', the annotation name
     itself).  Raises MissingRequiredError when a required annotation
     produced no values.
     """
     for annotation in arg_to_iter(annotations):
         if isinstance(annotation, dict):
             # New-style annotation: a dict carrying the field name plus
             # optional pre/post text, extractors and the required flag.
             field = annotation['field']
             try:
                 field_extraction = self.schema.attribute_map.get(field)
             except AttributeError:
                 field_extraction = None
             if field_extraction is None:
                 field_extraction = SlybotFieldDescriptor(
                     field, field, _DEFAULT_EXTRACTOR)
             if annotation.get('pre_text') or annotation.get('post_text'):
                 text_extractor = TextRegionDataExtractor(
                     annotation.get('pre_text', ''),
                     annotation.get('post_text', ''))
                 # Deep-copy so the descriptor shared via attribute_map is
                 # not mutated when the extractor is wrapped below.
                 field_extraction = copy.deepcopy(field_extraction)
                 field_extraction.extractor = _compose(
                     field_extraction.extractor, text_extractor.extract)
             extracted = self._process_values(regions, htmlpage,
                                              field_extraction)
             # Apply each named modifier to every extracted value in turn.
             for extractor in annotation.get('extractors', []):
                 custom_extractor_func = self.modifiers.get(extractor)
                 if custom_extractor_func and extracted:
                     extracted = [
                         custom_extractor_func(s, htmlpage)
                         for s in extracted
                     ]
             if annotation.get('required') and not extracted:
                 raise MissingRequiredError()
             yield (field_extraction, extracted)
         else:
             # Legacy spiders have per attribute pipline extractors
             if self.legacy and annotation == 'variants':
                 yield (annotation,
                        self._process_variants(regions, htmlpage))
                 continue
             try:
                 extraction_func = self.schema.attribute_map.get(annotation)
             except AttributeError:
                 extraction_func = None
             if extraction_func is None:
                 extraction_func = SlybotFieldDescriptor(
                     annotation, annotation, _DEFAULT_EXTRACTOR)
             values = self._process_values(regions, htmlpage,
                                           extraction_func)
             yield (extraction_func, values)
Exemplo n.º 6
0
 def _process_fields(self, annotations, regions, htmlpage):
     """Yield ``(descriptor, values)`` for each annotation.

     Dict annotations may wrap the extractor with pre/post text handling
     and named modifiers; non-dict annotations are field names (legacy
     'variants' is dispatched separately).  Raises MissingRequiredError
     when a required annotation yields no values.
     """
     for annotation in arg_to_iter(annotations):
         if not isinstance(annotation, dict):
             # Legacy spiders attach per-attribute pipeline extractors.
             if self.legacy and annotation == 'variants':
                 yield (annotation,
                        self._process_variants(regions, htmlpage))
                 continue
             try:
                 descriptor = self.schema.attribute_map.get(annotation)
             except AttributeError:
                 descriptor = None
             if descriptor is None:
                 descriptor = SlybotFieldDescriptor(
                     annotation, annotation, _DEFAULT_EXTRACTOR)
             yield (descriptor,
                    self._process_values(regions, htmlpage, descriptor))
             continue
         field = annotation['field']
         try:
             descriptor = self.schema.attribute_map.get(field)
         except AttributeError:
             descriptor = None
         if descriptor is None:
             descriptor = SlybotFieldDescriptor(
                 field, field, _DEFAULT_EXTRACTOR)
         if annotation.get('pre_text') or annotation.get('post_text'):
             region_extractor = TextRegionDataExtractor(
                 annotation.get('pre_text', ''),
                 annotation.get('post_text', ''))
             # Copy so the shared schema descriptor is left untouched.
             descriptor = copy.deepcopy(descriptor)
             descriptor.extractor = _compose(
                 descriptor.extractor, region_extractor.extract)
         values = self._process_values(regions, htmlpage, descriptor)
         for modifier_name in annotation.get('extractors', []):
             modifier = self.modifiers.get(modifier_name)
             if modifier and values:
                 values = [modifier(v, htmlpage) for v in values]
         if annotation.get('required') and not values:
             raise MissingRequiredError()
         yield (descriptor, values)
Exemplo n.º 7
0
 def _process_fields(self, annotations, regions, htmlpage):
     """Extract values for each annotation, yielding ``(name, values)``.

     The yielded name is the field name (or the descriptor's description
     when it differs from its name).  Raises MissingRequiredError when a
     required annotation produced no values.
     """
     for annotation in arg_to_iter(annotations):
         if isinstance(annotation, dict):
             field = annotation['field']
             try:
                 field_extraction = self.schema.attribute_map.get(field)
             except AttributeError:
                 field_extraction = None
             if field_extraction is None:
                 field_extraction = SlybotFieldDescriptor(
                     '', '', _DEFAULT_EXTRACTOR)
             if annotation.get('pre_text') or annotation.get('post_text'):
                 text_extractor = TextRegionDataExtractor(
                     annotation.get('pre_text', ''),
                     annotation.get('post_text', ''))
                 # Deep-copy before rebinding the extractor: mutating the
                 # descriptor stored in schema.attribute_map would stack a
                 # fresh _compose wrapper on every call for this field.
                 field_extraction = copy.deepcopy(field_extraction)
                 # NOTE(review): sibling implementations compose
                 # text_extractor.extract rather than the extractor object;
                 # confirm what _compose expects here before changing it.
                 field_extraction.extractor = _compose(
                     field_extraction.extractor, text_extractor)
             extracted = self._process_values(
                 regions, htmlpage, field_extraction
             )
             # Modifiers here receive the whole value list at once.
             for extractor in annotation.get('extractors', []):
                 custom_extractor_func = self.modifiers.get(extractor)
                 if custom_extractor_func and extracted:
                     extracted = custom_extractor_func(extracted, htmlpage)
             if annotation.get('required') and not extracted:
                 raise MissingRequiredError()
             if field_extraction.name != field_extraction.description:
                 field = field_extraction.description
             yield (field, extracted)
         else:
             # Legacy spiders have per attribute pipline extractors
             try:
                 extraction_func = self.schema.attribute_map.get(annotation)
             except AttributeError:
                 extraction_func = None
             if extraction_func is None:
                 extraction_func = SlybotFieldDescriptor(
                     '', '', _DEFAULT_EXTRACTOR)
             values = self._process_values(regions, htmlpage,
                                           extraction_func)
             if extraction_func.name != extraction_func.description:
                 annotation = extraction_func.description
             yield (annotation, values)