Example #1
def __init__(self, name, description, field_type_processor, required=False):
    """Create a new SlybotFieldDescriptor with the given name and description.
    The field_type_processor is used for extraction and is publicly available.
    """
    FieldDescriptor.__init__(self, name, description,
                             field_type_processor.extract, required)
    # add an adapt method
    self.adapt = field_type_processor.adapt
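For context, a minimal usage sketch (not from the example's project): SlybotFieldDescriptor only needs a processor object exposing `extract` and `adapt`, so the hypothetical TextProcessor below stands in for a real slybot field-type processor.

# Hypothetical processor: any object with `extract` and `adapt` works here.
class TextProcessor(object):
    def extract(self, text):
        # pull the raw value out of an extracted region
        return text.strip()

    def adapt(self, text, htmlpage):
        # post-process the value relative to the source page
        return text

price_field = SlybotFieldDescriptor('price', 'product price',
                                    TextProcessor(), required=True)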
Example #2
    def __init__(self, fields=None):
        attributes = []
        if not fields:
            fields = ['name', 'price', 'sku', 'brand', 'category']
        for field_name in fields:
            attributes.append(FieldDescriptor(field_name, field_name))

        attributes.append(
            FieldDescriptor('image', 'image', extractor=image_url))
        super(MyItemDescriptor, self).__init__('myitem', 'testing description',
                                               attributes)
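A short usage sketch (hypothetical field names): omitting `fields` falls back to the default list, and the `image` field is always appended. scrapely's ItemDescriptor keeps the fields in its `attribute_map`.

descriptor = MyItemDescriptor()                           # default field list
descriptor = MyItemDescriptor(fields=['title', 'price'])  # custom fields
# field names -> FieldDescriptor objects
print(sorted(descriptor.attribute_map))                   # ['image', 'price', 'title']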
Example #3
def get_extractor(site_id):
    sds = ScraperDescriptor.objects.filter(site__id=site_id)
    if not sds.exists():
        return None

    tmpls = []

    for s in sds:
        items = s.items.filter(descriptor__target__symbol='ProductInfo')
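        # `types` (defined elsewhere) maps a type symbol to a factory that
        # builds the field's extractor from its configured value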
        idesc = ItemDescriptor('', '', [
            FieldDescriptor(i.descriptor.symbol,
                            i.descriptor.desc,
                            extractor=types[i.descriptor.typ.symbol](i.value))
            for i in items
        ])
        ts = load_templates(s.id)
        tmpls += [(t, idesc) for _, t in ts]
    if tmpls:
        ex = InstanceBasedLearningExtractor(tmpls)

        def extractor(response):
            page = HtmlPage(response.url,
                            headers=response.headers,
                            body=response.body.decode(response.encoding),
                            encoding=response.encoding)
            extract = ex.extract(page)
            if extract[0] is not None:
                for e in extract[0]:
                    yield e

        return extractor
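A sketch of how the returned closure could be wired into a Scrapy spider callback; `self.site_id` is a hypothetical spider attribute, not part of the example.

# inside a Scrapy spider (sketch)
def parse(self, response):
    extractor = get_extractor(self.site_id)  # hypothetical spider attribute
    if extractor is None:
        return  # no ScraperDescriptor rows for this site
    for item in extractor(response):
        yield item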
Example #4
def get_visual_tool_item_descriptor(fields_spec):
    field_descriptors = []
    for field_name, spec in fields_spec.items():
        required = spec['required']
        extractor_name = spec['extractor']
        extractor_func = get_extractor_function(extractor_name)
        field_descriptors.append(
            FieldDescriptor(field_name, field_name, extractor_func, required))

    return ItemDescriptor("", "", field_descriptors)
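The expected `fields_spec` shape can be read off the loop: a mapping from field name to a dict with `required` and `extractor` keys. A hypothetical call (the extractor names depend on what `get_extractor_function` supports):

fields_spec = {
    'title': {'required': True, 'extractor': 'text'},
    'price': {'required': True, 'extractor': 'price'},
    'image': {'required': False, 'extractor': 'image_url'},
}
item_descriptor = get_visual_tool_item_descriptor(fields_spec)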
Example #5
File: scrape.py Project: I-TREND/SASF
if len(sys.argv) > 1:  # assumed guard; the original snippet starts mid-if
    url = sys.argv[1]
else:
    url = 'http://www.rc-chem.eu/'  # http://mefedronprodej.webnode.cz/.

group = Group.objects.get(id=1)
site = SiteData.objects.get(site__url=url, group=group)
sds = ScraperDescriptor.objects.filter(site=site)

tmpls = []

if sds.exists():
    for s in sds:
        items = s.items.filter(descriptor__target__symbol='ProductInfo')
        idesc = ItemDescriptor('', '', [
            FieldDescriptor(i.descriptor.symbol,
                            i.descriptor.desc,
                            extractor=types[i.descriptor.typ.symbol](i.value))
            for i in items
        ])
        ts = load_templates('scraper.json', 'scraper-%d' % s.id)
        if not ts:
            ts = annotate(s.url, 'scraper-%d' % s.id,
                          [(i.descriptor.symbol, i.value) for i in items])
        tmpls += [(t, idesc) for t in ts]
else:
    url = u'http://www.rc-chem.eu/produkty/thio-crystal'
    items = [
        (u'NAME_PROD', u'THIO', '', None),
        (u'CHEM_NAME_PROD', u'2-(methylamino)', '', None),
        (u'MIN_PRICE_PROD', u'400 CZK', '', None),
        (u'MAX_PRICE_PROD', u'25200 CZK', '', None),
Example #6
import copy
import pprint
import cStringIO
from itertools import groupby, izip, starmap

from numpy import array

from scrapely.descriptor import FieldDescriptor
from scrapely.htmlpage import HtmlPageRegion
from scrapely.extraction.similarity import (similar_region,
    longest_unique_subsequence, common_prefix)
from scrapely.extraction.pageobjects import (AnnotationTag,
    PageRegion, FragmentedHtmlPageRegion)

# identity extractor: returns the matched region unchanged
_EXTRACT_HTML = lambda x: x
# fallback used for annotations that declare no field descriptor
_DEFAULT_DESCRIPTOR = FieldDescriptor('none', None)

__all__ = ['BasicTypeExtractor',
           'TraceExtractor',
           'RepeatedDataExtractor',
           'AdjacentVariantExtractor',
           'RecordExtractor',
           'TemplatePageExtractor',
           'TextRegionDataExtractor',
           'attrs2dict',
           'labelled_element']

def labelled_element(obj):
    """
    Returns the labelled element of the object (an extractor or a labelled region)
    """