예제 #1
0
def apply_extractors(descriptor, template_extractors, extractors):
    """Attach per-field extractors from *template_extractors* to *descriptor*.

    For each field: regex extractor documents are queued, a "type_extractor"
    document replaces the field's type processor, and fields with no type
    extractor default to the "text" type. Queued regexes are chained after
    the field's base extractor with a PipelineExtractor.
    """
    field_type_manager = FieldTypeManager()

    for field_name, field_extractors in template_extractors.items():
        equeue = []
        for eid in field_extractors:
            extractor_doc = extractors[eid]
            if "regular_expression" in extractor_doc:
                equeue.append(
                    create_regex_extractor(
                        extractor_doc["regular_expression"]))
            elif "type_extractor" in extractor_doc:  # overrides default one
                descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                    field_name, field_name,
                    field_type_manager.type_processor_class(
                        extractor_doc["type_extractor"])())
        if field_name not in descriptor.attribute_map:
            # No type extractor defined: default to "text", by far the most
            # commonly used type.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")())

        if equeue:
            # Run the field's base extractor first, then the regexes in order.
            equeue.insert(0, descriptor.attribute_map[field_name].extractor)
            descriptor.attribute_map[field_name].extractor = PipelineExtractor(
                *equeue)
예제 #2
0
def create_slybot_item_descriptor(schema):
    """Build an ItemDescriptor covering every field declared in *schema*."""
    type_processor_class = FieldTypeManager().type_processor_class
    descriptors = [
        SlybotFieldDescriptor(
            name, name,
            type_processor_class(meta['type'])(),
            meta['required'])
        for name, meta in schema['fields'].items()
    ]
    return ItemDescriptor("", "", descriptors)
예제 #3
0
def create_type_extractor(_type):
    """Return a callable that extracts *txt* with the *_type* field-type
    processor and adapts any non-empty result against *htmlpage*."""
    types = FieldTypeManager()
    extractor = types.type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        # NOTE(review): other versions of this function call
        # extractor.extract(txt); confirm this processor exposes .extractor.
        data = extractor.extractor(txt)
        if data:
            return extractor.adapt(data, htmlpage)
    # __name__ must be a str: assigning bytes (via .encode('utf-8'))
    # raises TypeError on Python 3.
    _extractor.__name__ = "Type Extractor: %s" % _type
    return _extractor
예제 #4
0
파일: item.py 프로젝트: 9thSymfony/slybot
def create_slybot_item_descriptor(schema):
    """Create an ItemDescriptor with one field descriptor per schema field."""
    manager = FieldTypeManager()
    field_descriptors = []
    for field_name, field_meta in schema['fields'].items():
        processor_cls = manager.type_processor_class(field_meta['type'])
        field_descriptors.append(
            SlybotFieldDescriptor(field_name, field_name, processor_cls(),
                                  field_meta['required']))
    return ItemDescriptor("", "", field_descriptors)
예제 #5
0
def create_slybot_item_descriptor(schema):
    """Build an ItemDescriptor from *schema*; a None schema yields an empty
    "item" descriptor."""
    manager = FieldTypeManager()
    if schema is None:
        schema = {"id": "item", "properties": ()}
    descriptors = []
    for name, meta in schema.get("properties", ()):
        optional = meta.get("optional", True)
        processor = manager.type_processor_class(meta.get("type"))()
        descriptors.append(
            SlybotFieldDescriptor(name, meta.get("description"), processor,
                                  not optional))
    return ItemDescriptor(schema["id"], schema.get("description"), descriptors)
예제 #6
0
def create_slybot_item_descriptor(schema):
    """Build an ItemDescriptor from *schema* (None -> empty 'item' schema)."""
    field_type_manager = FieldTypeManager()
    if schema is None:
        schema = {'id': 'item', 'properties': ()}
    type_processor_class = field_type_manager.type_processor_class
    descriptors = [
        SlybotFieldDescriptor(
            name,
            meta.get('description'),
            type_processor_class(meta.get('type'))(),
            not meta.get('optional', True))
        for name, meta in schema.get('properties', ())
    ]
    return ItemDescriptor(schema['id'], schema.get('description'), descriptors)
예제 #7
0
def create_slybot_item_descriptor(schema, schema_name=""):
    """Build a SlybotItemDescriptor for *schema*, labelled *schema_name*."""
    manager = FieldTypeManager()
    fields = []
    for field_id, meta in schema.get('fields', {}).items():
        display_name = meta.get('name', field_id)
        processor = manager.type_processor_class(meta['type'])()
        fields.append(SlybotFieldDescriptor(field_id, display_name, processor,
                                            meta['required']))
    return SlybotItemDescriptor(schema_name, schema.get('name', schema_name),
                                fields)
예제 #8
0
def apply_extractors(descriptor, template_extractors, all_extractors):
    """Wire extractors referenced by *template_extractors* into *descriptor*.

    Each extractor document names its target field; fields not yet in the
    attribute map get a default "text" descriptor first. A document with a
    "regular_expression" installs a regex extractor; anything else installs
    the builtin extractor named by its "builtin_extractor" key.
    """
    field_type_manager = FieldTypeManager()
    for eid in template_extractors or ():
        extractor_doc = all_extractors[eid]
        field_name = extractor_doc["field_name"]
        if field_name not in descriptor.attribute_map:
            # NOTE(review): the "text" processor class is passed without being
            # called here, unlike sibling versions that instantiate it --
            # confirm SlybotFieldDescriptor accepts a class.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name, field_type_manager.type_processor_class("text")
            )
        if "regular_expression" in extractor_doc:
            descriptor.attribute_map[field_name].extractor = create_regex_extractor(extractor_doc["regular_expression"])
        else:
            descriptor.attribute_map[field_name].extractor = getattr(ExtractorTypes, extractor_doc["builtin_extractor"])
예제 #9
0
def create_slybot_item_descriptor(schema):
    """Translate *schema* into an ItemDescriptor (None -> empty 'item')."""
    manager = FieldTypeManager()
    if schema is None:
        schema = {'id': 'item', 'properties': ()}
    field_descriptors = []
    for prop_name, prop_meta in schema.get('properties', ()):
        is_required = not prop_meta.get('optional', True)
        processor = manager.type_processor_class(prop_meta.get('type'))()
        field_descriptors.append(
            SlybotFieldDescriptor(prop_name, prop_meta.get('description'),
                                  processor, is_required))
    return ItemDescriptor(schema['id'], schema.get('description'),
                          field_descriptors)
예제 #10
0
파일: item.py 프로젝트: FFFFFurry/portia
def create_slybot_item_descriptor(schema, schema_name=""):
    """Create a SlybotItemDescriptor describing every field in *schema*."""
    type_processor_class = FieldTypeManager().type_processor_class
    descriptors = [
        SlybotFieldDescriptor(
            field_id,
            meta.get('name', field_id),
            type_processor_class(meta['type'])(),
            meta['required'])
        for field_id, meta in schema['fields'].items()
    ]
    return SlybotItemDescriptor(schema_name,
                                schema.get('name', schema_name),
                                descriptors)
예제 #11
0
def create_type_extractor(_type):
    """Return a function that extracts and adapts text via the *_type*
    field-type processor. None input is passed through as None."""
    processor = FieldTypeManager().type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        if txt is None:
            return
        # Unwrap wrappers that carry the real page as an attribute.
        page = getattr(htmlpage, 'htmlpage', htmlpage)
        if not hasattr(txt, 'text_content'):
            txt = HtmlPageRegion(page, txt)
        extracted = processor.extract(txt)
        if extracted:
            return processor.adapt(extracted, page)
    _extractor.__name__ = ("Type Extractor: %s" % _type)
    return _extractor
예제 #12
0
def create_type_extractor(_type):
    """Return a callable that extracts text with the *_type* processor and
    adapts any non-empty result against the page. None input yields None."""
    types = FieldTypeManager()
    extractor = types.type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        if txt is None:
            return
        # Unwrap wrappers that carry the real page as an attribute.
        page = getattr(htmlpage, 'htmlpage', htmlpage)
        if not hasattr(txt, 'text_content'):
            txt = HtmlPageRegion(page, txt)
        data = extractor.extract(txt)
        if data:
            return extractor.adapt(data, page)

    # __name__ must be a str: assigning bytes (via .encode('utf-8'))
    # raises TypeError on Python 3.
    _extractor.__name__ = "Type Extractor: %s" % _type
    return _extractor
예제 #13
0
def apply_extractors(descriptor, template_extractors, extractors):
    """Install regex and type extractors on *descriptor*'s fields.

    Regex documents are chained after the field's existing extractor; a
    "type_extractor" document replaces the field's processor, preserving any
    existing display name. Unknown fields default to the "text" type.
    """
    type_processor_class = FieldTypeManager().type_processor_class
    if isinstance(template_extractors, dict):
        template_extractors = template_extractors.items()
    attribute_map = descriptor.attribute_map
    for field_name, extractor_ids in template_extractors:
        regex_queue = []
        for extractor_id in extractor_ids:
            doc = extractors.get(extractor_id, {})
            if "regular_expression" in doc:
                regex_queue.append(
                    create_regex_extractor(doc["regular_expression"]))
            elif "type_extractor" in doc:  # overrides the default processor
                try:
                    display_name = attribute_map[field_name].description
                except KeyError:
                    display_name = field_name
                processor = type_processor_class(doc["type_extractor"])()
                attribute_map[field_name] = SlybotFieldDescriptor(
                    field_name, display_name, processor)
        if field_name not in attribute_map:
            # Default to the "text" type when no type extractor is defined --
            # it is by far the most commonly used.
            attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name, type_processor_class("text")())

        if regex_queue:
            # Keep the base extractor first, then the regexes in order.
            regex_queue.insert(0, attribute_map[field_name].extractor)
            attribute_map[field_name].extractor = PipelineExtractor(
                *regex_queue)
예제 #14
0
def apply_extractors(descriptor, template_extractors_ids, extractors):
    """Apply the extractors referenced by *template_extractors_ids* to
    *descriptor*, grouping the extractor documents by target field.

    NOTE(review): itertools.groupby only merges *adjacent* documents with the
    same field_name, so extractor docs for one field must be contiguous in
    template_extractors_ids -- confirm callers guarantee this ordering.
    """
    field_type_manager = FieldTypeManager()
    template_extractors = [extractors[eid] for eid in template_extractors_ids]
    for field_name, field_extractors in groupby(template_extractors or (),
                                                lambda x: x["field_name"]):
        equeue = []
        for extractor_doc in field_extractors:
            if "regular_expression" in extractor_doc:
                equeue.append(create_regex_extractor(
                    extractor_doc["regular_expression"]))
            elif "type_extractor" in extractor_doc:  # overrides default one
                descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                    field_name, field_name,
                    field_type_manager.type_processor_class(
                        extractor_doc["type_extractor"])())
        if field_name not in descriptor.attribute_map:
            # Default to "text", by far the most commonly used field type.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")())

        if equeue:
            # Chain the regexes after the field's existing extractor.
            equeue.insert(0, descriptor.attribute_map[field_name].extractor)
            descriptor.attribute_map[field_name].extractor = (
                PipelineExtractor(*equeue))
예제 #15
0
 class IblItem(cls, Item):
     # Item class generated from a schema; `cls`, `Item` and `schema` come
     # from the enclosing scope (not visible here) -- confirm full source.
     ftm = FieldTypeManager()
     # Maps display name -> Field carrying the raw schema metadata.
     fields = defaultdict(dict)
     version_fields = []
     _display_name = schema.get('name')
     for _name, _meta in schema.get('fields', {}).items():
         name = _meta.get('name', _name)
         # Attach a serializer when the field type defines one.
         serializer = ftm.type_processor_serializer(_meta.get('type'))
         if serializer:
             _meta['serializer'] = serializer
         fields[name] = Field(_meta)
         # Fields not marked "vary" are collected (sorted) as version fields.
         if not _meta.get("vary", False):
             version_fields.append(name)
     version_fields = sorted(version_fields)
예제 #16
0
from storage.backends import ContentFile

from .base import Model
from .decorators import pre_load, post_dump
from .exceptions import PathResolutionError
from .fields import (
    Boolean, Domain, Integer, List, Regexp, String, Url, DependantField,
    BelongsTo, HasMany, HasOne, CASCADE, CLEAR, PROTECT, StartUrl)
from .snapshots import ModelSnapshots
from .utils import unwrap_envelopes, short_guid, wrap_envelopes, strip_json
from .validators import OneOf

# Strips scrapy annotation attributes (data-scrapy-*, data-tagid) from HTML.
# Raw strings: "\d" in a plain literal is an invalid escape sequence
# (DeprecationWarning on Python 3, a SyntaxError in future versions).
_CLEAN_ANNOTATED_HTML = re.compile(r'( data-scrapy-[a-z]+="[^"]+")|'
                                   r'( data-tagid="\d+")')
# Three dash-separated groups of four hex digits, case-insensitive.
_ID_RE = re.compile('-'.join(['[a-f0-9]{4}'] * 3), re.I)
FIELD_TYPES = FieldTypeManager().available_type_names()


class Project(Model):
    """Top-level project model grouping spiders, schemas and extractors."""
    # TODO: override storage for hosted version, return generated project.json
    id = String(primary_key=True)
    name = String()
    # Child collections; NOTE(review): exact semantics of on_delete=CLEAR and
    # ignore_in_file inferred from names -- confirm against the fields module.
    spiders = HasMany('Spider', related_name='project', on_delete=CLEAR,
                      ignore_in_file=True)
    schemas = HasMany('Schema', related_name='project', on_delete=CLEAR,
                      ignore_in_file=True)
    extractors = HasMany('Extractor', related_name='project', on_delete=CLEAR,
                         ignore_in_file=True)

    class Meta:
        # Serialized location of this model within the project storage.
        path = u'project.json'
예제 #17
0
    RecordExtractor, BasicTypeExtractor, TraceExtractor, TemplatePageExtractor,
    RepeatedDataExtractor, AdjacentVariantExtractor, TextRegionDataExtractor,
    labelled_element, _compose)
from scrapely.extraction.similarity import (similar_region,
                                            longest_unique_subsequence,
                                            first_longest_subsequence)
from scrapely.htmlpage import HtmlTagType, HtmlPageParsedRegion, HtmlPageRegion
from scrapy.utils.spider import arg_to_iter
from slybot.fieldtypes import FieldTypeManager
from slybot.item import SlybotFieldDescriptor

# Tuning constants for locating candidate regions during extraction.
MAX_SEARCH_DISTANCE_MULTIPLIER = 3
MIN_TOKEN_LENGTH_BEFORE_TRUNCATE = 3
MIN_JUMP_DISTANCE = 0.7
MAX_RELATIVE_SEPARATOR_MULTIPLIER = 0.7
# Fallback extractor used when a field defines no specific type.
_DEFAULT_EXTRACTOR = FieldTypeManager().type_processor_class('text')()
Region = namedtuple('Region', ['score', 'start_index', 'end_index'])


# PEP 8 (E731): a named def instead of a lambda assignment, for better
# tracebacks and a real __name__.
def container_id(annotated):
    """Return the container_id from the annotation's metadata (None if unset)."""
    return annotated.annotation.metadata.get('container_id')


class MissingRequiredError(Exception):
    """Raised when required data is missing (see raise sites for specifics)."""
    pass


def group_tree(tree, container_annotations):
    # Groups tree entries by their first element (entries must compare/sort
    # by that key).
    # NOTE(review): this snippet appears truncated -- no handling for
    # len(value) != 1 and no return statement in view; confirm full source.
    result = {}
    get_first = itemgetter(0)
    for name, value in groupby(sorted(tree, key=get_first), get_first):
        value = list(value)
        if len(value) == 1:
            # Single entry: map the name straight to its container annotation.
            result[name] = container_annotations.get(name)
예제 #18
0
    """
    def _exec(x):
        # Apply g, strip markup from its region, then hand the cleaned
        # region to f; propagate None when g finds nothing.
        # (f, g, remove_tags come from the enclosing scope, not in view.)
        ret = g(x)
        if ret is not None:
            ret = HtmlPageRegion(ret.htmlpage, remove_tags(ret.text_content))
            return f(ret)
        return None

    return _exec


# Tuning constants for locating candidate regions during extraction.
MAX_SEARCH_DISTANCE_MULTIPLIER = 3
MIN_TOKEN_LENGTH_BEFORE_TRUNCATE = 3
MIN_JUMP_DISTANCE = 0.7
MAX_RELATIVE_SEPARATOR_MULTIPLIER = 0.7
# Fallback extractor: the 'raw html' type processor.
_DEFAULT_EXTRACTOR = FieldTypeManager().type_processor_class('raw html')()
Region = namedtuple('Region', ['score', 'start_index', 'end_index'])


# PEP 8 (E731): a named def instead of a lambda assignment, for better
# tracebacks and a real __name__.
def container_id(annotated):
    """Return the container_id from the annotation's metadata (None if unset)."""
    return annotated.annotation.metadata.get('container_id')


def _int_cmp(a, op, b):
    op = getattr(operator, op)
    a = -float('inf') if a is None else a
    b = -float('inf') if b is None else b
    return op(a, b)


class MissingRequiredError(Exception):
    """Raised when required data is missing (see raise sites for specifics)."""
    pass