예제 #1
0
def apply_extractors(descriptor, template_extractors, extractors):
    """Install the template's extractors on ``descriptor.attribute_map``.

    ``template_extractors`` maps a field name to an iterable of extractor
    ids; each id is looked up in ``extractors`` to get its definition.

    * A definition with a ``"regular_expression"`` key adds a regex
      extractor for the field.
    * Otherwise, a definition with a ``"type_extractor"`` key replaces the
      field's descriptor with one built for that type.

    A field that still has no descriptor afterwards gets a ``"text"`` one.
    When the field has regex extractors, its extractor becomes a
    ``PipelineExtractor`` running the descriptor's own extractor first and
    then the regex extractors in the order they were listed.
    """
    type_manager = FieldTypeManager()

    for field_name, extractor_ids in template_extractors.items():
        regex_stages = []
        for extractor_id in extractor_ids:
            spec = extractors[extractor_id]
            if "regular_expression" in spec:
                regex_stages.append(
                    create_regex_extractor(spec["regular_expression"]))
                continue
            if "type_extractor" not in spec:
                continue
            # An explicit type extractor overrides the default descriptor.
            processor = type_manager.type_processor_class(
                spec["type_extractor"])()
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name, processor)

        if field_name not in descriptor.attribute_map:
            # No type extractor was given: fall back to the text type, by
            # far the most commonly used one.
            text_processor = type_manager.type_processor_class("text")()
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name, text_processor)

        if not regex_stages:
            continue
        field_descriptor = descriptor.attribute_map[field_name]
        field_descriptor.extractor = PipelineExtractor(
            field_descriptor.extractor, *regex_stages)
예제 #2
0
파일: item.py 프로젝트: 9thSymfony/slybot
def create_slybot_item_descriptor(schema):
    """Build an ``ItemDescriptor`` from the ``'fields'`` mapping of ``schema``.

    Every entry of ``schema['fields']`` must provide ``'required'`` and
    ``'type'``; the type is resolved to a processor class through
    ``FieldTypeManager`` and instantiated. The field name doubles as the
    field description. The resulting item descriptor has an empty name and
    an empty description.
    """
    type_manager = FieldTypeManager()

    def build_field(name, spec):
        # Read 'required' before 'type', matching the lookup order callers
        # may observe through KeyError.
        is_required = spec['required']
        processor = type_manager.type_processor_class(spec['type'])()
        return SlybotFieldDescriptor(name, name, processor, is_required)

    field_descriptors = [
        build_field(name, spec) for name, spec in schema['fields'].items()
    ]
    return ItemDescriptor("", "", field_descriptors)
예제 #3
0
def create_slybot_item_descriptor(schema):
    """Return an ``ItemDescriptor`` with one field descriptor per schema field.

    ``schema['fields']`` maps each field name to a dict carrying at least
    ``'required'`` and ``'type'``. Each field descriptor uses the field
    name as both name and description, plus a fresh processor instance for
    the declared type. The item descriptor itself is created with empty
    name and description strings.
    """
    manager = FieldTypeManager()

    def iter_field_descriptors():
        for field_name, field_spec in schema['fields'].items():
            must_exist = field_spec['required']
            processor_cls = manager.type_processor_class(field_spec['type'])
            yield SlybotFieldDescriptor(
                field_name, field_name, processor_cls(), must_exist)

    return ItemDescriptor("", "", list(iter_field_descriptors()))
예제 #4
0
def apply_extractors(descriptor, template_extractors_ids, extractors):
    """Install the template's extractors on ``descriptor.attribute_map``.

    ``template_extractors_ids`` is an iterable of extractor ids (``None``
    or empty means "no extractors"); each id is looked up in
    ``extractors`` and the resulting definitions are grouped by their
    ``"field_name"``.

    For each field:

    * a definition with a ``"regular_expression"`` key adds a regex
      extractor;
    * otherwise, a definition with a ``"type_extractor"`` key replaces the
      field's descriptor with one built for that type;
    * if the field still has no descriptor, a ``"text"`` one is created;
    * if there are regex extractors, the field's extractor becomes a
      ``PipelineExtractor`` of the descriptor's own extractor followed by
      the regex extractors in their original relative order.
    """
    field_type_manager = FieldTypeManager()

    def by_field_name(extractor_doc):
        return extractor_doc["field_name"]

    # Apply the ``or ()`` guard to the ids themselves: iterating ``None``
    # in the comprehension would raise before any later guard could help.
    template_extractors = [
        extractors[eid] for eid in template_extractors_ids or ()]
    # groupby() only merges *adjacent* items, so sort first. The sort is
    # stable, which keeps the extractors of a field in their given order.
    template_extractors.sort(key=by_field_name)

    for field_name, field_extractors in groupby(template_extractors,
                                                by_field_name):
        equeue = []
        for extractor_doc in field_extractors:
            if "regular_expression" in extractor_doc:
                equeue.append(create_regex_extractor(
                    extractor_doc["regular_expression"]))
            elif "type_extractor" in extractor_doc:  # overrides default one
                descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                    field_name, field_name,
                    field_type_manager.type_processor_class(
                        extractor_doc["type_extractor"])())
        if field_name not in descriptor.attribute_map:
            # No type extractor defined: use the text type by default, as
            # it is by far the most commonly used.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")())

        if equeue:
            field_descriptor = descriptor.attribute_map[field_name]
            field_descriptor.extractor = PipelineExtractor(
                field_descriptor.extractor, *equeue)
예제 #5
0
def create_type_extractor(_type):
    """Return an extractor function backed by the processor for ``_type``.

    The processor class is resolved through ``FieldTypeManager`` and
    instantiated once. The returned function takes ``txt`` and an optional
    ``htmlpage``; it returns ``processor.adapt(data, htmlpage)`` when the
    processor yields truthy data for ``txt`` and ``None`` otherwise.

    The function's ``__name__`` is set to ``"Type Extractor: <_type>"``.
    """
    types = FieldTypeManager()
    extractor = types.type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        # NOTE(review): assumes the processor exposes an ``extractor``
        # callable -- confirm against the field-type classes.
        data = extractor.extractor(txt)
        if data:
            return extractor.adapt(data, htmlpage)

    name = "Type Extractor: %s" % _type
    if not isinstance(name, str):
        # Python 2 with a unicode ``_type``: ``__name__`` must be a native
        # (byte) string there. On Python 3 the name is already ``str`` and
        # encoding it to bytes would make the assignment raise TypeError.
        name = name.encode('utf-8')
    _extractor.__name__ = name
    return _extractor
예제 #6
0
def create_slybot_item_descriptor(schema, schema_name=""):
    """Build a ``SlybotItemDescriptor`` from ``schema``.

    ``schema.get('fields', {})`` maps each field id to a dict that must
    contain ``'required'`` and ``'type'`` and may contain a display
    ``'name'`` (defaulting to the field id). The item descriptor is named
    ``schema_name`` and described by ``schema['name']`` when present,
    otherwise by ``schema_name``.
    """
    type_manager = FieldTypeManager()

    def build_field(field_id, spec):
        is_required = spec['required']
        label = spec.get('name', field_id)
        processor = type_manager.type_processor_class(spec['type'])()
        return SlybotFieldDescriptor(field_id, label, processor, is_required)

    field_descriptors = [
        build_field(field_id, spec)
        for field_id, spec in schema.get('fields', {}).items()
    ]
    item_label = schema.get('name', schema_name)
    return SlybotItemDescriptor(schema_name, item_label, field_descriptors)
예제 #7
0
def create_slybot_item_descriptor(schema):
    """Build an ``ItemDescriptor`` from a ``'properties'``-style schema.

    ``schema`` may be ``None``, in which case an item with id ``'item'``
    and no properties is described. ``schema['properties']`` is iterated
    as ``(name, spec)`` pairs; each spec may carry ``'description'``,
    ``'optional'`` and ``'type'``. A property is required only when its
    ``'optional'`` value is falsy; a missing ``'optional'`` key means the
    property is optional.
    """
    type_manager = FieldTypeManager()
    effective = {'id': 'item', 'properties': ()} if schema is None else schema

    def build_field(name, spec):
        text = spec.get('description')
        is_required = not spec.get('optional', True)
        processor = type_manager.type_processor_class(spec.get('type'))()
        return SlybotFieldDescriptor(name, text, processor, is_required)

    field_descriptors = [
        build_field(name, spec)
        for name, spec in effective.get('properties', ())
    ]
    return ItemDescriptor(
        effective['id'], effective.get('description'), field_descriptors)
예제 #8
0
def apply_extractors(descriptor, template_extractors, all_extractors):
    """Set the extractor of each field referenced by the template.

    ``template_extractors`` is an iterable of extractor ids (``None`` or
    empty means nothing to do); each id is looked up in ``all_extractors``
    to get a definition carrying a ``"field_name"``.

    A field missing from ``descriptor.attribute_map`` first gets a default
    ``"text"`` descriptor. The field's extractor is then replaced by a
    regex extractor when the definition has a ``"regular_expression"``
    key, and otherwise by the ``ExtractorTypes`` attribute named by the
    definition's ``"builtin_extractor"`` value.

    Raises ``KeyError`` when an id is unknown or a definition has neither
    ``"regular_expression"`` nor ``"builtin_extractor"``.
    """
    field_type_manager = FieldTypeManager()
    for eid in template_extractors or ():
        extractor_doc = all_extractors[eid]
        field_name = extractor_doc["field_name"]
        if field_name not in descriptor.attribute_map:
            # Hand SlybotFieldDescriptor a processor *instance*, not the
            # processor class itself.
            # NOTE(review): confirm against SlybotFieldDescriptor's
            # constructor that an instance is what it expects.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")())
        if "regular_expression" in extractor_doc:
            descriptor.attribute_map[field_name].extractor = (
                create_regex_extractor(extractor_doc["regular_expression"]))
        else:
            descriptor.attribute_map[field_name].extractor = getattr(
                ExtractorTypes, extractor_doc["builtin_extractor"])
예제 #9
0
def create_slybot_item_descriptor(schema):
    """Return an ``ItemDescriptor`` describing the properties of ``schema``.

    A ``None`` schema is treated as an item with id ``"item"`` and no
    properties. Each ``(name, spec)`` pair found under ``"properties"``
    becomes a field descriptor whose description comes from
    ``spec["description"]``, whose processor is an instance of the class
    registered for ``spec["type"]``, and which is required only when
    ``spec["optional"]`` is present and falsy.
    """
    manager = FieldTypeManager()
    if schema is None:
        schema = {"id": "item", "properties": ()}

    def iter_field_descriptors():
        for prop_name, prop_spec in schema.get("properties", ()):
            prop_description = prop_spec.get("description")
            is_optional = prop_spec.get("optional", True)
            processor_cls = manager.type_processor_class(prop_spec.get("type"))
            yield SlybotFieldDescriptor(
                prop_name, prop_description, processor_cls(), not is_optional)

    field_descriptors = list(iter_field_descriptors())
    return ItemDescriptor(
        schema["id"], schema.get("description"), field_descriptors)
예제 #10
0
파일: item.py 프로젝트: FFFFFurry/portia
def create_slybot_item_descriptor(schema, schema_name=""):
    """Return a ``SlybotItemDescriptor`` for the fields of ``schema``.

    ``schema['fields']`` is required and maps each field id to a dict with
    ``'required'`` and ``'type'`` plus an optional display ``'name'``
    (defaulting to the field id). The item descriptor is named
    ``schema_name``; its description is ``schema['name']`` if present and
    ``schema_name`` otherwise.
    """
    manager = FieldTypeManager()

    def iter_field_descriptors():
        for field_id, field_spec in schema['fields'].items():
            must_exist = field_spec['required']
            shown_as = field_spec.get('name', field_id)
            processor_cls = manager.type_processor_class(field_spec['type'])
            yield SlybotFieldDescriptor(
                field_id, shown_as, processor_cls(), must_exist)

    field_descriptors = list(iter_field_descriptors())
    return SlybotItemDescriptor(
        schema_name, schema.get('name', schema_name), field_descriptors)
예제 #11
0
def create_slybot_item_descriptor(schema):
    """Translate ``schema`` into an ``ItemDescriptor``.

    When ``schema`` is ``None`` a placeholder with id ``'item'`` and no
    properties is used. ``'properties'`` is read as an iterable of
    ``(name, spec)`` pairs. For each pair, ``spec`` supplies the field
    description (``'description'``), the type whose processor class is
    instantiated (``'type'``) and whether the field is optional
    (``'optional'``, defaulting to ``True``, i.e. not required).
    """
    type_manager = FieldTypeManager()
    source = schema
    if source is None:
        source = {'id': 'item', 'properties': ()}

    field_descriptors = []
    for prop_name, prop_spec in source.get('properties', ()):
        summary = prop_spec.get('description')
        optional = prop_spec.get('optional', True)
        processor = type_manager.type_processor_class(prop_spec.get('type'))()
        field_descriptors.append(
            SlybotFieldDescriptor(prop_name, summary, processor, not optional))

    item_id = source['id']
    return ItemDescriptor(item_id, source.get('description'),
                          field_descriptors)
예제 #12
0
def create_type_extractor(_type):
    """Return an extractor function backed by the processor for ``_type``.

    The returned function accepts ``txt`` and an optional ``htmlpage``:

    * ``None`` input yields ``None``;
    * ``htmlpage`` is unwrapped through its ``htmlpage`` attribute when it
      has one;
    * input lacking a ``text_content`` attribute is wrapped in an
      ``HtmlPageRegion`` of that page;
    * the processor's ``extract`` result, when truthy, is passed through
      ``adapt`` together with the page; otherwise ``None`` is returned.

    The function's ``__name__`` is set to ``"Type Extractor: <_type>"``.
    """
    processor = FieldTypeManager().type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        if txt is None:
            return None
        page = getattr(htmlpage, 'htmlpage', htmlpage)
        if hasattr(txt, 'text_content'):
            region = txt
        else:
            region = HtmlPageRegion(page, txt)
        data = processor.extract(region)
        return processor.adapt(data, page) if data else None

    _extractor.__name__ = "Type Extractor: %s" % _type
    return _extractor
예제 #13
0
def create_type_extractor(_type):
    """Return an extractor function backed by the processor for ``_type``.

    The processor class is resolved through ``FieldTypeManager`` and
    instantiated once. The returned function accepts ``txt`` and an
    optional ``htmlpage``:

    * ``None`` input yields ``None``;
    * ``htmlpage`` is unwrapped through its ``htmlpage`` attribute when it
      has one;
    * input lacking a ``text_content`` attribute is wrapped in an
      ``HtmlPageRegion`` of that page;
    * the processor's ``extract`` result, when truthy, is passed through
      ``adapt`` together with the page; otherwise ``None`` is returned.

    The function's ``__name__`` is set to ``"Type Extractor: <_type>"``.
    """
    types = FieldTypeManager()
    extractor = types.type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        if txt is None:
            return
        page = getattr(htmlpage, 'htmlpage', htmlpage)
        if not hasattr(txt, 'text_content'):
            txt = HtmlPageRegion(page, txt)
        data = extractor.extract(txt)
        if data:
            return extractor.adapt(data, page)

    name = "Type Extractor: %s" % _type
    if not isinstance(name, str):
        # Python 2 with a unicode ``_type``: ``__name__`` must be a native
        # (byte) string there. On Python 3 the name is already ``str`` and
        # encoding it to bytes would make the assignment raise TypeError.
        name = name.encode('utf-8')
    _extractor.__name__ = name
    return _extractor