def __init__(self, name, description, field_type_processor, required=False):
    """Build a SlybotFieldDescriptor.

    The given field_type_processor supplies both the extraction function
    (wired into the base FieldDescriptor) and an ``adapt`` hook, which is
    exposed publicly on the instance.
    """
    extract_func = field_type_processor.extract
    FieldDescriptor.__init__(self, name, description, extract_func, required)
    # Make the processor's adapt method available alongside extraction.
    self.adapt = field_type_processor.adapt
def __init__(self, fields=None):
    """Create the test item descriptor.

    When no field list is supplied, a default set of product-like field
    names is used.  An 'image' field with the image_url extractor is
    always appended.
    """
    field_names = fields or ['name', 'price', 'sku', 'brand', 'category']
    attributes = [FieldDescriptor(name, name) for name in field_names]
    attributes.append(FieldDescriptor('image', 'image', extractor=image_url))
    super(MyItemDescriptor, self).__init__(
        'myitem', 'testing description', attributes)
def get_extractor(site_id):
    """Build a streaming extraction callable for all scrapers of a site.

    Collects every (template, item-descriptor) pair for the site's
    scraper descriptors and wires them into an
    InstanceBasedLearningExtractor.  Returns a callable that takes a
    response and yields extracted records, or None when the site has no
    scraper descriptors or no templates.
    """
    scraper_descs = ScraperDescriptor.objects.filter(site__id=site_id)
    if not scraper_descs.exists():
        return

    templates = []
    for scraper in scraper_descs:
        product_items = scraper.items.filter(
            descriptor__target__symbol='ProductInfo')
        fields = [
            FieldDescriptor(
                item.descriptor.symbol, item.descriptor.desc,
                extractor=types[item.descriptor.typ.symbol](item.value))
            for item in product_items
        ]
        item_descriptor = ItemDescriptor('', '', fields)
        # load_templates yields (key, template) pairs; only the template
        # is needed here.
        for _, template in load_templates(scraper.id):
            templates.append((template, item_descriptor))

    if not templates:
        return

    ibl = InstanceBasedLearningExtractor(templates)

    def extractor(response):
        # Decode the raw body with the response's declared encoding
        # before handing it to scrapely.
        page = HtmlPage(response.url, headers=response.headers,
                        body=response.body.decode(response.encoding),
                        encoding=response.encoding)
        extracted = ibl.extract(page)
        if extracted[0] is not None:
            for record in extracted[0]:
                yield record

    return extractor
def get_visual_tool_item_descriptor(fields_spec):
    """Turn a field-spec mapping into an ItemDescriptor.

    Each entry of ``fields_spec`` maps a field name to a spec dict with
    'required' and 'extractor' keys; the extractor name is resolved to a
    function via get_extractor_function.
    """
    descriptors = [
        FieldDescriptor(field_name, field_name,
                        get_extractor_function(spec['extractor']),
                        spec['required'])
        for field_name, spec in fields_spec.items()
    ]
    return ItemDescriptor("", "", descriptors)
    url = sys.argv[1]
else:
    # Default demo shop when no URL is given on the command line.
    url = 'http://www.rc-chem.eu/'  # http://mefedronprodej.webnode.cz/.
# NOTE(review): the surrounding `if` header is outside this view; the
# block below presumably runs unconditionally after the url is chosen.
group = Group.objects.get(id=1)
site = SiteData.objects.get(site__url=url, group=group)
sds = ScraperDescriptor.objects.filter(site=site)
tmpls = []
if sds.exists():
    # Build (template, item-descriptor) pairs for every scraper of the
    # site, annotating pages on the fly when no stored template exists.
    for s in sds:
        items = s.items.filter(descriptor__target__symbol='ProductInfo')
        idesc = ItemDescriptor('', '', [
            FieldDescriptor(i.descriptor.symbol, i.descriptor.desc,
                            extractor=types[i.descriptor.typ.symbol](i.value))
            for i in items])
        ts = load_templates('scraper.json', 'scraper-%d' % s.id)
        if not ts:
            # No stored templates yet: annotate the scraper's page with
            # the known (symbol, value) pairs to create them.
            ts = annotate(s.url, 'scraper-%d' % s.id,
                          [(i.descriptor.symbol, i.value) for i in items])
        tmpls += [(t, idesc) for t in ts]
else:
    # Fallback fixture data used when the site has no scraper
    # descriptors: hard-coded product annotations for a known page.
    url = u'http://www.rc-chem.eu/produkty/thio-crystal'
    items = [
        (u'NAME_PROD', u'THIO', '', None),
        (u'CHEM_NAME_PROD', u'2-(methylamino)', '', None),
        (u'MIN_PRICE_PROD', u'400 CZK', '', None),
        (u'MAX_PRICE_PROD', u'25200 CZK', '', None),
# NOTE: Python 2 module (cStringIO, itertools.izip).
import copy
import pprint
import cStringIO
from itertools import groupby, izip, starmap
from numpy import array
from scrapely.descriptor import FieldDescriptor
from scrapely.htmlpage import HtmlPageRegion
from scrapely.extraction.similarity import (similar_region,
    longest_unique_subsequence, common_prefix)
from scrapely.extraction.pageobjects import (AnnotationTag,
    PageRegion, FragmentedHtmlPageRegion)

# Identity extractor: returns the matched region unchanged.
_EXTRACT_HTML = lambda x: x
# Descriptor used when no explicit field descriptor is available
# (presumably a fallback for unannotated fields — confirm at call sites).
_DEFAULT_DESCRIPTOR = FieldDescriptor('none', None)

__all__ = ['BasicTypeExtractor',
           'TraceExtractor',
           'RepeatedDataExtractor',
           'AdjacentVariantExtractor',
           'RecordExtractor',
           'TemplatePageExtractor',
           'TextRegionDataExtractor',
           'attrs2dict',
           'labelled_element']


def labelled_element(obj):
    """
    Returns labelled element of the object (extractor or labelled region)
    """