def apply_extractors(descriptor, template_extractors, extractors):
    """Attach per-field extractors from a template to *descriptor*.

    :param descriptor: item descriptor exposing an ``attribute_map`` dict of
        field name -> ``SlybotFieldDescriptor``.
    :param template_extractors: mapping of field name -> list of extractor ids.
    :param extractors: mapping of extractor id -> extractor definition dict.
    """
    field_type_manager = FieldTypeManager()
    for field_name, field_extractors in template_extractors.items():
        equeue = []
        for eid in field_extractors:
            # Tolerate dangling extractor ids instead of raising KeyError
            # (consistent with the other apply_extractors variants).
            extractor_doc = extractors.get(eid, {})
            if "regular_expression" in extractor_doc:
                equeue.append(
                    create_regex_extractor(
                        extractor_doc["regular_expression"]))
            elif "type_extractor" in extractor_doc:  # overrides default one
                descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                    field_name, field_name,
                    field_type_manager.type_processor_class(
                        extractor_doc["type_extractor"])())
        if field_name not in descriptor.attribute_map:
            # If no type extractor is defined, use the "text" type by
            # default, as it is by far the most commonly used.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")())
        if equeue:
            # Run the field's base extractor first, then the regex chain.
            equeue.insert(0, descriptor.attribute_map[field_name].extractor)
            descriptor.attribute_map[field_name].extractor = \
                PipelineExtractor(*equeue)
def _load_extractors(self, field, schema, modifiers):
    """Resolve the extraction descriptor and modifier chain for this field.

    NOTE: the *field* argument is immediately replaced by ``self._field``;
    the instance's own field name is what is actually looked up.

    Returns a ``(field descriptor, list of modifier callables)`` pair.
    """
    field, meta = self._field, self._meta
    try:
        extraction = schema.attribute_map.get(field)
    except AttributeError:
        extraction = None
    if extraction is None:
        extraction = SlybotFieldDescriptor(field, field, _DEFAULT_EXTRACTOR)
    if u'pre_text' in meta or u'post_text' in meta:
        # Wrap the base extractor so it only sees the region between the
        # configured pre/post text markers. Deep-copy first so the shared
        # schema descriptor is not mutated in place.
        region = TextRegionDataExtractor(
            meta.get(u'pre_text', u''), meta.get(u'post_text', u''))
        extraction = copy.deepcopy(extraction)
        extraction.extractor = _compose(extraction.extractor, region.extract)
    extractor_ids = meta.get(u'extractors', [])
    if isinstance(extractor_ids, dict):
        extractor_ids = extractor_ids.get(field, [])
    # Keep only the ids that have a registered modifier, in order.
    adaptors = [modifiers[eid] for eid in extractor_ids if eid in modifiers]
    return extraction, adaptors
def apply_extractors(descriptor, template_extractors, extractors):
    """Install regex and type extractors onto *descriptor*'s fields.

    ``template_extractors`` maps field name -> extractor ids (a dict or an
    iterable of pairs); ``extractors`` maps extractor id -> definition dict.
    Unknown extractor ids are ignored.
    """
    make_processor = FieldTypeManager().type_processor_class
    if isinstance(template_extractors, dict):
        template_extractors = template_extractors.items()
    attribute_map = descriptor.attribute_map
    for field, extractor_ids in template_extractors:
        pipeline = []
        for extractor_id in extractor_ids:
            definition = extractors.get(extractor_id, {})
            if "regular_expression" in definition:
                pipeline.append(
                    create_regex_extractor(definition["regular_expression"]))
            elif "type_extractor" in definition:
                # An explicit type overrides the field's default descriptor,
                # keeping any display name it already had.
                try:
                    display = attribute_map[field].description
                except KeyError:
                    display = field
                processor = make_processor(definition["type_extractor"])()
                attribute_map[field] = SlybotFieldDescriptor(
                    field, display, processor)
        if field not in attribute_map:
            # No explicit type configured: fall back to the "text" type,
            # by far the most commonly used.
            attribute_map[field] = SlybotFieldDescriptor(
                field, field, make_processor("text")())
        if pipeline:
            # Base extractor runs first, then each regex in declared order.
            pipeline.insert(0, attribute_map[field].extractor)
            attribute_map[field].extractor = PipelineExtractor(*pipeline)
def apply_extractors(descriptor, template_extractors, extractors):
    """Wire template-configured extractors into *descriptor*.

    Accepts ``template_extractors`` either as a mapping or as an iterable
    of ``(field name, extractor ids)`` pairs. Extractor ids missing from
    ``extractors`` are silently skipped.
    """
    manager = FieldTypeManager()
    pairs = (template_extractors.items()
             if isinstance(template_extractors, dict)
             else template_extractors)
    fields = descriptor.attribute_map
    for name, ids in pairs:
        regex_chain = []
        for extractor_id in ids:
            spec = extractors.get(extractor_id, {})
            if "regular_expression" in spec:
                regex_chain.append(
                    create_regex_extractor(spec["regular_expression"]))
            elif "type_extractor" in spec:
                # A type extractor replaces the default descriptor for the
                # field; preserve an existing display name when present.
                try:
                    label = fields[name].description
                except KeyError:
                    label = name
                fields[name] = SlybotFieldDescriptor(
                    name, label,
                    manager.type_processor_class(spec["type_extractor"])())
        if name not in fields:
            # Default to the ubiquitous "text" type when no type extractor
            # was configured for this field.
            fields[name] = SlybotFieldDescriptor(
                name, name, manager.type_processor_class("text")())
        if regex_chain:
            # The field's base extractor feeds the regex pipeline.
            base = fields[name].extractor
            fields[name].extractor = PipelineExtractor(base, *regex_chain)
def _process_fields(self, annotations, regions, htmlpage):
    """Yield ``(field descriptor, extracted values)`` for each annotation.

    Dict annotations carry the field name plus optional pre/post-text
    trimming, modifier extractor ids, and a ``required`` flag (raising
    ``MissingRequiredError`` when nothing was extracted). Plain-name
    annotations are looked up in the schema directly.
    """
    def lookup(name):
        # Fall back to a default descriptor when the schema has no entry
        # for this name (or exposes no attribute_map at all).
        try:
            found = self.schema.attribute_map.get(name)
        except AttributeError:
            found = None
        if found is None:
            found = SlybotFieldDescriptor(name, name, _DEFAULT_EXTRACTOR)
        return found

    for annotation in arg_to_iter(annotations):
        if not isinstance(annotation, dict):
            # Legacy spiders have per attribute pipeline extractors.
            if self.legacy and annotation == 'variants':
                yield (annotation,
                       self._process_variants(regions, htmlpage))
                continue
            descriptor = lookup(annotation)
            yield (descriptor,
                   self._process_values(regions, htmlpage, descriptor))
            continue
        field = annotation['field']
        descriptor = lookup(field)
        if annotation.get('pre_text') or annotation.get('post_text'):
            # Restrict extraction to the region between the markers,
            # deep-copying so the shared schema descriptor stays untouched.
            trimmer = TextRegionDataExtractor(
                annotation.get('pre_text', ''),
                annotation.get('post_text', ''))
            descriptor = copy.deepcopy(descriptor)
            descriptor.extractor = _compose(
                descriptor.extractor, trimmer.extract)
        values = self._process_values(regions, htmlpage, descriptor)
        for modifier_name in annotation.get('extractors', []):
            modifier = self.modifiers.get(modifier_name)
            if modifier and values:
                values = [modifier(v, htmlpage) for v in values]
        if annotation.get('required') and not values:
            raise MissingRequiredError()
        yield (descriptor, values)
def _process_fields(self, annotations, regions, htmlpage):
    """Walk *annotations* and yield ``(field descriptor, values)`` pairs.

    Raises ``MissingRequiredError`` when a dict annotation marked
    ``required`` produces no values.
    """
    for annotation in arg_to_iter(annotations):
        described = isinstance(annotation, dict)
        if not described and self.legacy and annotation == 'variants':
            # Legacy spiders have per attribute pipeline extractors.
            yield (annotation, self._process_variants(regions, htmlpage))
            continue
        name = annotation['field'] if described else annotation
        try:
            extraction = self.schema.attribute_map.get(name)
        except AttributeError:
            extraction = None
        if extraction is None:
            extraction = SlybotFieldDescriptor(
                name, name, _DEFAULT_EXTRACTOR)
        if described and (annotation.get('pre_text') or
                          annotation.get('post_text')):
            # Restrict extraction to the text between the markers, working
            # on a deep copy so the schema's descriptor is left unchanged.
            region = TextRegionDataExtractor(
                annotation.get('pre_text', ''),
                annotation.get('post_text', ''))
            extraction = copy.deepcopy(extraction)
            extraction.extractor = _compose(extraction.extractor,
                                            region.extract)
        values = self._process_values(regions, htmlpage, extraction)
        if described:
            for extractor_id in annotation.get('extractors', []):
                func = self.modifiers.get(extractor_id)
                if func and values:
                    values = [func(value, htmlpage) for value in values]
            if annotation.get('required') and not values:
                raise MissingRequiredError()
        yield (extraction, values)
def _process_fields(self, annotations, regions, htmlpage):
    """Walk *annotations* and yield ``(field name, extracted values)``.

    Dict annotations support pre/post-text trimming, modifier extractors
    and a ``required`` flag (``MissingRequiredError`` when nothing was
    extracted). When a schema descriptor declares a display name that
    differs from its internal name, the display name is yielded instead.
    """
    for annotation in arg_to_iter(annotations):
        if isinstance(annotation, dict):
            field = annotation['field']
            try:
                field_extraction = self.schema.attribute_map.get(field)
            except AttributeError:
                field_extraction = None
            if field_extraction is None:
                field_extraction = SlybotFieldDescriptor(
                    '', '', _DEFAULT_EXTRACTOR)
            if annotation.get('pre_text') or annotation.get('post_text'):
                text_extractor = TextRegionDataExtractor(
                    annotation.get('pre_text', ''),
                    annotation.get('post_text', ''))
                # BUGFIX: deep-copy before composing — otherwise the shared
                # schema descriptor is mutated in place and the pre/post
                # trimming accumulates on every call (sibling
                # implementations in this file all deep-copy here).
                field_extraction = copy.deepcopy(field_extraction)
                # BUGFIX: compose with the bound ``extract`` method, not
                # the TextRegionDataExtractor instance itself.
                field_extraction.extractor = _compose(
                    field_extraction.extractor, text_extractor.extract)
            extracted = self._process_values(
                regions, htmlpage, field_extraction)
            for extractor in annotation.get('extractors', []):
                custom_extractor_func = self.modifiers.get(extractor)
                # NOTE(review): the modifier receives the whole extracted
                # list here, while sibling implementations apply it per
                # value — confirm the modifier contract for this class.
                if custom_extractor_func and extracted:
                    extracted = custom_extractor_func(extracted, htmlpage)
            if annotation.get('required') and not extracted:
                raise MissingRequiredError()
            if field_extraction.name != field_extraction.description:
                # Prefer the descriptor's display name when it differs.
                field = field_extraction.description
            yield (field, extracted)
        else:
            # Legacy spiders have per attribute pipeline extractors
            try:
                extraction_func = self.schema.attribute_map.get(annotation)
            except AttributeError:
                extraction_func = None
            if extraction_func is None:
                extraction_func = SlybotFieldDescriptor(
                    '', '', _DEFAULT_EXTRACTOR)
            values = self._process_values(
                regions, htmlpage, extraction_func)
            if extraction_func.name != extraction_func.description:
                annotation = extraction_func.description
            yield (annotation, values)