def apply_extractors(descriptor, template_extractors, extractors):
    """Wire template-defined extractors onto *descriptor*'s fields.

    ``template_extractors`` maps a field name to a list of extractor ids;
    ``extractors`` maps each id to its definition document.
    """
    types = FieldTypeManager()
    attr_map = descriptor.attribute_map
    for field_name, extractor_ids in template_extractors.items():
        regex_queue = []
        for extractor_id in extractor_ids:
            doc = extractors[extractor_id]
            if "regular_expression" in doc:
                regex_queue.append(
                    create_regex_extractor(doc["regular_expression"]))
            elif "type_extractor" in doc:
                # An explicit type extractor replaces the default descriptor.
                attr_map[field_name] = SlybotFieldDescriptor(
                    field_name, field_name,
                    types.type_processor_class(doc["type_extractor"])())
        if field_name not in attr_map:
            # No type extractor defined: default to "text", by far the most
            # commonly used field type.
            attr_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                types.type_processor_class("text")())
        if regex_queue:
            # Run the field's base extractor first, then each regex in order.
            regex_queue.insert(0, attr_map[field_name].extractor)
            attr_map[field_name].extractor = PipelineExtractor(*regex_queue)
def create_slybot_item_descriptor(schema):
    """Build an ItemDescriptor from a schema's ``fields`` mapping.

    Each field entry must provide ``type`` and ``required``.
    """
    types = FieldTypeManager()
    field_descriptors = [
        SlybotFieldDescriptor(
            name, name,
            types.type_processor_class(meta['type'])(),
            meta['required'])
        for name, meta in schema['fields'].items()
    ]
    return ItemDescriptor("", "", field_descriptors)
def create_type_extractor(_type):
    """Return a callable that extracts and adapts values as field type *_type*."""
    types = FieldTypeManager()
    extractor = types.type_processor_class(_type)()
    def _extractor(txt, htmlpage=None):
        # NOTE(review): other variants of this function call
        # ``extractor.extract``; confirm this processor version really
        # exposes an ``extractor`` method.
        data = extractor.extractor(txt)
        if data:
            return extractor.adapt(data, htmlpage)
    # NOTE(review): .encode() makes __name__ a byte string — valid on
    # Python 2 (str == bytes) but raises TypeError on Python 3; confirm
    # the target interpreter.
    _extractor.__name__ = ("Type Extractor: %s" % _type).encode('utf-8')
    return _extractor
def create_slybot_item_descriptor(schema):
    """Build an ItemDescriptor from *schema*; a None schema yields an
    empty "item" descriptor."""
    if schema is None:
        schema = {"id": "item", "properties": ()}
    types = FieldTypeManager()
    fields = []
    # "properties" is iterated as (name, metadata) pairs.
    for name, meta in schema.get("properties", ()):
        processor = types.type_processor_class(meta.get("type"))()
        # A field is required unless explicitly marked optional.
        fields.append(SlybotFieldDescriptor(
            name, meta.get("description"), processor,
            not meta.get("optional", True)))
    return ItemDescriptor(schema["id"], schema.get("description"), fields)
def create_slybot_item_descriptor(schema):
    """Build an ItemDescriptor from *schema*, defaulting to an empty
    "item" schema when None is given."""
    schema = schema if schema is not None else {'id': 'item', 'properties': ()}
    types = FieldTypeManager()

    def build_field(name, meta):
        # Required is the inverse of the "optional" flag (optional by default).
        return SlybotFieldDescriptor(
            name,
            meta.get('description'),
            types.type_processor_class(meta.get('type'))(),
            not meta.get('optional', True))

    descriptors = [build_field(name, meta)
                   for name, meta in schema.get('properties', ())]
    return ItemDescriptor(schema['id'], schema.get('description'), descriptors)
def create_slybot_item_descriptor(schema, schema_name=""):
    """Build a SlybotItemDescriptor from the schema's ``fields`` mapping."""
    types = FieldTypeManager()
    fields = [
        SlybotFieldDescriptor(
            key,
            meta.get('name', key),  # display name falls back to the key
            types.type_processor_class(meta['type'])(),
            meta['required'])
        for key, meta in schema.get('fields', {}).items()
    ]
    return SlybotItemDescriptor(
        schema_name, schema.get('name', schema_name), fields)
def apply_extractors(descriptor, template_extractors, all_extractors):
    """Apply the template's extractor definitions (given by id) to *descriptor*.

    Each extractor doc names its target ``field_name`` and supplies either a
    ``regular_expression`` or a ``builtin_extractor``.
    """
    field_type_manager = FieldTypeManager()
    for eid in template_extractors or ():
        extractor_doc = all_extractors[eid]
        field_name = extractor_doc["field_name"]
        if field_name not in descriptor.attribute_map:
            # Default to the "text" type for fields with no descriptor yet.
            # Bug fix: instantiate the processor class — every other variant
            # in this module passes an instance, not the class object.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")()
            )
        if "regular_expression" in extractor_doc:
            descriptor.attribute_map[field_name].extractor = \
                create_regex_extractor(extractor_doc["regular_expression"])
        else:
            # Fall back to a named builtin extractor.
            descriptor.attribute_map[field_name].extractor = getattr(
                ExtractorTypes, extractor_doc["builtin_extractor"])
def create_slybot_item_descriptor(schema, schema_name=""):
    """Build a SlybotItemDescriptor from *schema*; KeyError if ``fields``
    is absent."""
    types = FieldTypeManager()
    descriptors = []
    for field_key, field_meta in schema['fields'].items():
        processor = types.type_processor_class(field_meta['type'])()
        display = field_meta.get('name', field_key)  # fall back to the key
        descriptors.append(
            SlybotFieldDescriptor(field_key, display, processor,
                                  field_meta['required']))
    title = schema.get('name', schema_name)
    return SlybotItemDescriptor(schema_name, title, descriptors)
def create_type_extractor(_type):
    """Build a closure that extracts and adapts values of field type *_type*."""
    processor = FieldTypeManager().type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        if txt is None:
            return
        # Accept either a page object or a wrapper exposing .htmlpage.
        page = getattr(htmlpage, 'htmlpage', htmlpage)
        if not hasattr(txt, 'text_content'):
            # Promote plain text to a page region before extraction.
            txt = HtmlPageRegion(page, txt)
        extracted = processor.extract(txt)
        if extracted:
            return processor.adapt(extracted, page)

    _extractor.__name__ = ("Type Extractor: %s" % _type)
    return _extractor
def create_type_extractor(_type):
    """Return a callable that extracts and adapts values of field type *_type*."""
    types = FieldTypeManager()
    extractor = types.type_processor_class(_type)()
    def _extractor(txt, htmlpage=None):
        if txt is None:
            return
        # Accept either a page object or a wrapper exposing .htmlpage.
        page = getattr(htmlpage, 'htmlpage', htmlpage)
        if not hasattr(txt, 'text_content'):
            # Wrap plain text so the processor receives a page region.
            txt = HtmlPageRegion(page, txt)
        data = extractor.extract(txt)
        if data:
            return extractor.adapt(data, page)
    # NOTE(review): .encode() makes __name__ a byte string — valid on
    # Python 2 (str == bytes) but raises TypeError on Python 3; confirm
    # the target interpreter before keeping this.
    _extractor.__name__ = ("Type Extractor: %s" % _type).encode('utf-8')
    return _extractor
def apply_extractors(descriptor, template_extractors, extractors):
    """Attach per-field extractors from a template onto *descriptor*.

    ``template_extractors`` is a mapping (or iterable of pairs) from field
    name to a list of extractor ids; ``extractors`` maps id -> definition.
    """
    make_processor = FieldTypeManager().type_processor_class
    if isinstance(template_extractors, dict):
        template_extractors = template_extractors.items()
    fields = descriptor.attribute_map
    for name, extractor_ids in template_extractors:
        regexes = []
        for extractor_id in extractor_ids:
            spec = extractors.get(extractor_id, {})
            if "regular_expression" in spec:
                regexes.append(
                    create_regex_extractor(spec["regular_expression"]))
            elif "type_extractor" in spec:
                # A type extractor overrides the default descriptor while
                # keeping an existing display name when one is present.
                try:
                    display = fields[name].description
                except KeyError:
                    display = name
                fields[name] = SlybotFieldDescriptor(
                    name, display, make_processor(spec["type_extractor"])())
        if name not in fields:
            # No explicit type extractor: default to "text", by far the
            # most commonly used type.
            fields[name] = SlybotFieldDescriptor(
                name, name, make_processor("text")())
        if regexes:
            # Run the field's base extractor first, then each regex in order.
            regexes.insert(0, fields[name].extractor)
            fields[name].extractor = PipelineExtractor(*regexes)
def apply_extractors(descriptor, template_extractors_ids, extractors):
    """Apply the template's extractors (given by id) to *descriptor*."""
    field_type_manager = FieldTypeManager()
    template_extractors = [extractors[eid] for eid in template_extractors_ids]
    # NOTE(review): itertools.groupby only groups *consecutive* entries —
    # this assumes extractor docs for the same field are adjacent in
    # template_extractors_ids; confirm callers preserve that ordering.
    for field_name, field_extractors in groupby(template_extractors or (), lambda x: x["field_name"]):
        equeue = []
        for extractor_doc in field_extractors:
            if "regular_expression" in extractor_doc:
                equeue.append(create_regex_extractor(extractor_doc["regular_expression"]))
            elif "type_extractor" in extractor_doc:  # overrides default one
                descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name, field_name, field_type_manager.type_processor_class(extractor_doc["type_extractor"])())
        if not field_name in descriptor.attribute_map:
            # if not defined type extractor, use text type by default, as it is by far the most commonly used
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name, field_name, field_type_manager.type_processor_class("text")())
        if equeue:
            # run the field's base extractor first, then the regex pipeline
            equeue.insert(0, descriptor.attribute_map[field_name].extractor)
            descriptor.attribute_map[field_name].extractor = PipelineExtractor(*equeue)
class IblItem(cls, Item):
    # Dynamically built item class: ``cls`` and ``schema`` are free names
    # from an enclosing factory scope not visible in this chunk.
    ftm = FieldTypeManager()
    fields = defaultdict(dict)
    version_fields = []
    _display_name = schema.get('name')
    for _name, _meta in schema.get('fields', {}).items():
        # Display name falls back to the field key.
        name = _meta.get('name', _name)
        serializer = ftm.type_processor_serializer(_meta.get('type'))
        if serializer:
            _meta['serializer'] = serializer
        fields[name] = Field(_meta)
        # Only fields NOT marked "vary" are collected into version_fields
        # (presumably used for item versioning — confirm against callers).
        if not _meta.get("vary", False):
            version_fields.append(name)
    version_fields = sorted(version_fields)
from storage.backends import ContentFile
from .base import Model
from .decorators import pre_load, post_dump
from .exceptions import PathResolutionError
from .fields import (
    Boolean, Domain, Integer, List, Regexp, String, Url, DependantField,
    BelongsTo, HasMany, HasOne, CASCADE, CLEAR, PROTECT, StartUrl)
from .snapshots import ModelSnapshots
from .utils import unwrap_envelopes, short_guid, wrap_envelopes, strip_json
from .validators import OneOf

# Matches slybot annotation attributes (data-scrapy-*, data-tagid) in
# scraped HTML — presumably used to strip them; verify at call sites.
_CLEAN_ANNOTATED_HTML = re.compile('( data-scrapy-[a-z]+="[^"]+")|'
                                   '( data-tagid="\d+")')
# Three dash-joined groups of four hex digits, case-insensitive.
_ID_RE = re.compile('-'.join(['[a-f0-9]{4}'] * 3), re.I)
# Names of all field types known to the type manager.
FIELD_TYPES = FieldTypeManager().available_type_names()


class Project(Model):
    # TODO: override storage for hosted version, return generated project.json
    id = String(primary_key=True)
    name = String()
    # Child collections are cleared (not cascaded) when the project goes away.
    spiders = HasMany('Spider', related_name='project', on_delete=CLEAR,
                      ignore_in_file=True)
    schemas = HasMany('Schema', related_name='project', on_delete=CLEAR,
                      ignore_in_file=True)
    extractors = HasMany('Extractor', related_name='project', on_delete=CLEAR,
                         ignore_in_file=True)

    class Meta:
        path = u'project.json'
RecordExtractor, BasicTypeExtractor, TraceExtractor, TemplatePageExtractor, RepeatedDataExtractor, AdjacentVariantExtractor, TextRegionDataExtractor, labelled_element, _compose) from scrapely.extraction.similarity import (similar_region, longest_unique_subsequence, first_longest_subsequence) from scrapely.htmlpage import HtmlTagType, HtmlPageParsedRegion, HtmlPageRegion from scrapy.utils.spider import arg_to_iter from slybot.fieldtypes import FieldTypeManager from slybot.item import SlybotFieldDescriptor MAX_SEARCH_DISTANCE_MULTIPLIER = 3 MIN_TOKEN_LENGTH_BEFORE_TRUNCATE = 3 MIN_JUMP_DISTANCE = 0.7 MAX_RELATIVE_SEPARATOR_MULTIPLIER = 0.7 _DEFAULT_EXTRACTOR = FieldTypeManager().type_processor_class('text')() Region = namedtuple('Region', ['score', 'start_index', 'end_index']) container_id = lambda x: x.annotation.metadata.get('container_id') class MissingRequiredError(Exception): pass def group_tree(tree, container_annotations): result = {} get_first = itemgetter(0) for name, value in groupby(sorted(tree, key=get_first), get_first): value = list(value) if len(value) == 1: result[name] = container_annotations.get(name)
""" def _exec(x): ret = g(x) if ret is not None: ret = HtmlPageRegion(ret.htmlpage, remove_tags(ret.text_content)) return f(ret) return None return _exec MAX_SEARCH_DISTANCE_MULTIPLIER = 3 MIN_TOKEN_LENGTH_BEFORE_TRUNCATE = 3 MIN_JUMP_DISTANCE = 0.7 MAX_RELATIVE_SEPARATOR_MULTIPLIER = 0.7 _DEFAULT_EXTRACTOR = FieldTypeManager().type_processor_class('raw html')() Region = namedtuple('Region', ['score', 'start_index', 'end_index']) container_id = lambda x: x.annotation.metadata.get('container_id') def _int_cmp(a, op, b): op = getattr(operator, op) a = -float('inf') if a is None else a b = -float('inf') if b is None else b return op(a, b) class MissingRequiredError(Exception): pass