def scrape_page2(self, page, fields_spec): if self._ex is None: self._ex = InstanceBasedLearningExtractor( ((t, get_visual_tool_item_descriptor(fields_spec)) for t in self._templates), False, True) res = self._ex.extract(page)[0] return res
def test_extractor_w_empty_string_extraction(self): schema = { "id": "test", "properties": [ ('gender', { 'description': '', 'optional': True, 'type': 'text', 'vary': False, }), ('name', { 'description': '', 'optional': False, 'type': 'text', 'vary': False, }), ], } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "_id": 2, "field_name": "gender", "regular_expression": "([0-9]+)" } } apply_extractors(descriptor, [1], extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
def test_extractor_w_empty_string_extraction(self): schema = { 'fields': { 'gender': { 'required': False, 'type': 'text', 'vary': False, }, 'name': { 'required': True, 'type': 'text', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "regular_expression": "([0-9]+)" } } apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
def scrape(self, url=None, html=None, encoding='utf-8'): ## not version from https://github.com/scrapy/scrapely/blob/master/scrapely/extraction/pageparsing.py ## may need to replace with version from inspect.getsourcelines(Scraper.scrape), as this version is page = self._get_page(url, encoding, html) ex = InstanceBasedLearningExtractor(self.templates) return ex.extract(page)[0]
def _run_extraction(self, name, templates, page, extractors, expected_output): self.trace = None template_pages = [HtmlPage(None, {}, t) for t in templates] extractor = InstanceBasedLearningExtractor(template_pages, extractors, True) actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) if not actual_output: if expected_output is None: return assert False, "failed to extract data for test '%s'" % name actual_output = actual_output[0] self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace', []) expected_names = set(expected_output.keys()) actual_names = set(actual_output.keys()) missing_in_output = filter(None, expected_names - actual_names) error = "attributes '%s' were expected but were not present in test '%s'" % \ ("', '".join(missing_in_output), name) assert len(missing_in_output) == 0, error unexpected = actual_names - expected_names error = "unexpected attributes %s in test '%s'" % \ (', '.join(unexpected), name) assert len(unexpected) == 0, error for k, v in expected_output.items(): extracted = actual_output[k] assert v == extracted, "in test '%s' for attribute '%s', " \ "expected value '%s' but got '%s'" % (name, k, v, extracted)
def test_annotate_multiple(self): tm = TemplateMaker(self.PAGE) tm.annotate('field1', best_match('text to annotate'), best_match=False) tpl = tm.get_template() ex = InstanceBasedLearningExtractor([(tpl, None)]) self.assertEqual(ex.extract(self.PAGE)[0], [{u'field1': [u'Some text to annotate here', u'Another text to annotate there']}])
def test_type_extractor(self): schema = { "fields": { 'gender': { 'required': False, 'type': 'number', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "type_extractor": "text" }, 2: { "regular_expression": "Gender\\s+(Male|Female)" } } apply_extractors(descriptor, {"gender": [1, 2]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual( ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def test_annotate_ignore_unpaired(self): tm = TemplateMaker(self.PAGE) tm.annotate('field1', best_match("and that's"), best_match=False) tpl = tm.get_template() ex = InstanceBasedLearningExtractor([(tpl, None)]) self.assertEqual(ex.extract(self.PAGE)[0], [{u'field1': [u"More text with unpaired tag <img />and that's it"]}])
def test_extraction(self, name, templates, page, descriptor, expected_output): template_pages = [HtmlPage(None, {}, t) for t in templates] extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages]) actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) self.assertEqual(expected_output, actual_output and actual_output[0])
def test_type_extractor(self): schema = { "id": "test", "properties": [('gender', { 'description': '', 'optional': True, 'type': 'number', 'vary': False, })], } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "_id": 1, "field_name": "gender", "type_extractor": "text" }, 2: { "_id": 2, "field_name": "gender", "regular_expression": "Gender\\s+(Male|Female)" } } apply_extractors(descriptor, [1, 2], extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def do_s(self, url): """s <url> - scrape url (uses encoding from templates)""" templates = self._load_templates() if assert_or_print(templates, "no templates available"): return page = get_page(url, templates[0].encoding) ex = InstanceBasedLearningExtractor(templates) pprint.pprint(ex.extract(page)[0])
def test_annotate_multiple(self): tm = TemplateMaker(self.PAGE) tm.annotate("field1", best_match("text to annotate"), best_match=False) tpl = tm.get_template() ex = InstanceBasedLearningExtractor([tpl]) self.assertEqual( ex.extract(self.PAGE)[0], [{u"field1": [u"Some text to annotate here", u"Another text to annotate there"]}] )
def do_s(self, url): """s <url> - scrape url""" templates = self._load_templates() if assert_or_print(templates, "no templates available"): return # fall back to the template encoding if none is specified page = url_to_page(url, default_encoding=templates[0].encoding) ex = InstanceBasedLearningExtractor((t, None) for t in templates) pprint.pprint(ex.extract(page)[0])
def do_scrape(self, url): """scrape <url> - scrape url (alias: s)""" templates = self._load_templates() if assert_or_print(templates, "no templates available"): return # fall back to the template encoding if none is specified page = url_to_page(url, default_encoding=templates[0].encoding) ex = InstanceBasedLearningExtractor((t, None) for t in templates) pprint.pprint(ex.extract(page)[0])
def __init__(self, name, spec, item_schemas, all_extractors, **kw): super(IblSpider, self).__init__(name, **kw) spec = deepcopy(spec) for key, val in kw.items(): if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']: val = val.splitlines() spec[key] = val self._item_template_pages = sorted(( [t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', [])] \ for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self._ipages = [page for _, page, _ in self._item_template_pages] self.html_link_extractor = HtmlLinkExtractor() self.rss_link_extractor = RssLinkExtractor() self.build_url_filter(spec) self.itemcls_info = {} for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)): page_extractors_pairs = map(operator.itemgetter(1, 2), triplets) schema = item_schemas[itemclass_name] item_cls = SlybotItem.create_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, all_extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } self.login_requests = [] self.form_requests = [] self._start_requests = [] self.generic_form = GenericForm(**kw) self._create_init_requests(spec.get("init_requests", [])) self._process_start_urls(spec) self.allowed_domains = spec.get('allowed_domains', self._get_allowed_domains(self._ipages)) if not self.allowed_domains: self.allowed_domains = None
def test_default_type_extractor(self): schema = {'fields': {}} descriptor = create_slybot_item_descriptor(schema) extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}} apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual( ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def do_s(self, line): """s <url> [--encoding ENCODING --useragent 'User-Agent'] - scrape url""" templates = self._load_templates() if assert_or_print(templates, "no templates available"): return opts, (url,) = parse_at_s(line) headers = { 'User-Agent' : opts.useragent or self.user_agent } url = urllib2.Request(url, headers=headers) # fall back to the template encoding if none is specified page = url_to_page(url, opts.encoding, templates[0].encoding) ex = InstanceBasedLearningExtractor((t, None) for t in templates) pprint.pprint(ex.extract(page)[0])
def test_default_type_extractor(self): schema = { 'fields': {} } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "regular_expression": "Gender\\s+(Male|Female)" } } apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def setup_bot(self, settings, spec, items, extractors): """ Perform any initialization needed for crawling using this plugin """ _item_template_pages = sorted(([ t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', []) ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'), key=lambda pair: pair[0]) self.itemcls_info = {} if settings.get('AUTO_PAGINATION'): self.html_link_extractor = PaginationExtractor() else: self.html_link_extractor = HtmlLinkExtractor() for itemclass_name, triplets in groupby(_item_template_pages, itemgetter(0)): page_extractors_pairs = map(itemgetter(1, 2), triplets) schema = items[itemclass_name] item_cls = SlybotItem.create_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } # generate ibl extractor for links pages _links_pages = [ dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links' ] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec)
def get_extractor(site_id): sds = ScraperDescriptor.objects.filter(site__id=site_id) if not sds.exists(): return tmpls = [] for s in sds: items = s.items.filter(descriptor__target__symbol='ProductInfo') idesc = ItemDescriptor('', '', [ FieldDescriptor(i.descriptor.symbol, i.descriptor.desc, extractor=types[i.descriptor.typ.symbol](i.value)) for i in items ]) ts = load_templates(s.id) tmpls += [(t, idesc) for _, t in ts] if tmpls: ex = InstanceBasedLearningExtractor(tmpls) def extractor(response): page = HtmlPage(response.url, headers=response.headers, body=response.body.decode(response.encoding), encoding=response.encoding) extract = ex.extract(page) if extract[0] is not None: for e in extract[0]: yield e return extractor
def setup_bot(self, settings, spec, items, extractors): """ Perform any initialization needed for crawling using this plugin """ _item_template_pages = sorted(( [t.get('scrapes'), dict_to_page(t, 'annotated_body'), t.get('extractors', []), t.get('version', '0.12.0')] for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda x: x[0]) self.item_classes = {} self.template_scrapes = {template.get('page_id'): template['scrapes'] for template in spec.get('templates')} self.html_link_extractor = HtmlLinkExtractor() for schema_name, schema in items.items(): if schema_name not in self.item_classes: if not schema.get('name'): schema['name'] = schema_name item_cls = SlybotItem.create_iblitem_class(schema) self.item_classes[schema_name] = item_cls # Create descriptors and apply additional extractors to fields page_descriptor_pairs = [] self.schema_descriptors = {} for default, template, template_extractors, v in _item_template_pages: descriptors = OrderedDict() for schema_name, schema in items.items(): item_descriptor = create_slybot_item_descriptor(schema, schema_name) apply_extractors(item_descriptor, template_extractors, extractors) descriptors[schema_name] = item_descriptor descriptor = descriptors.values() or [{}] descriptors['#default'] = descriptors.get(default, descriptor[0]) self.schema_descriptors[template.page_id] = descriptors['#default'] page_descriptor_pairs.append((template, descriptors, v)) add_extractors_to_descriptors(descriptors, extractors) grouped = itertools.groupby(sorted(page_descriptor_pairs, key=operator.itemgetter(2)), lambda x: x[2] < '0.13.0') self.extractors = [] for version, group in grouped: if version: self.extractors.append( InstanceBasedLearningExtractor( [(page, scrapes['#default']) for page, scrapes, version in group])) else: self.extractors.append(SlybotIBLExtractor(list(group))) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec)
def test_default_type_extractor(self): schema = { "id": "test", "properties": [], } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "_id": 1, "field_name": "gender", "regular_expression": "Gender\\s+(Male|Female)" } } apply_extractors(descriptor, [1], extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def test_negative_hit_w_regex(self): schema = { 'fields': { 'gender': { 'required': False, 'type': 'number', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}} apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0], None)
def __init__(self, name, spec, item_schemas, all_extractors, **kw): super(IblSpider, self).__init__(name, **kw) default_item = spec['scrapes'] self._default_schema = item_schemas[default_item] if not self._default_schema: self.log("Scraping unknown default item schema: %s" % default_item, \ log.WARNING) self._item_template_pages = sorted(( [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'), t.get('extractors', [])] \ for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'id': "_links", 'properties': ()}) self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self._ipages = [page for _, page, _ in self._item_template_pages] self._fpages = [ dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type', 'item') == 'form' ] self.start_urls = self.start_urls or spec.get('start_urls') if isinstance(self.start_urls, basestring): self.start_urls = self.start_urls.splitlines() self.link_extractor = LinkExtractor() self.allowed_domains = self._get_allowed_domains(self._ipages) self.build_url_filter(spec) default_item_cls = get_iblitem_class(self._default_schema) default_item_descriptor = create_slybot_item_descriptor(self._default_schema) self.itemcls_info = {} for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)): page_extractors_pairs = map(operator.itemgetter(1, 2), triplets) schema = item_schemas[itemclass_name] item_cls = get_iblitem_class(schema) if schema else default_item_cls page_descriptor_pairs = [] for page, extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor apply_extractors(item_descriptor, extractors, all_extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, }
def test_text_type_w_regex(self): schema = { "fields": { 'gender': { 'required': False, 'type': 'text', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = {1: { "regular_expression": "Gender\\s+(Male|Female)" }} apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def test_raw_type_w_regex(self): schema = { 'fields': { 'gender': { 'required': False, 'type': 'raw', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = {1: { "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)" }} apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'<td >Male</td>']})
def test_text_type_w_regex_and_no_groups(self): schema = { 'fields': { 'gender': { 'required': False, 'type': 'text', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = {1: {"regular_expression": "Gender"}} apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual( ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Gender']})
class ScraperXPath(Scraper): """ This extractor considers 'xpath_annotations' when training """ default_data_attrs = {} def _train_implementation(self, htmlpage, data, data_attrs=None, repeated=False): assert data, "Cannot train with empty data" if not repeated: best_match = True else: best_match = False if data_attrs is None: data_attrs = self.default_data_attrs # assume that `data` has xpathes for each field and annotate it with # 'xpath_annotation' html attributes htmlpage.body = prepare_html(htmlpage.body, data) # train using xpath annotations tm = TemplateMakerWithAttrs(htmlpage) for field, values in data.items(): if not hasattr(values, '__iter__'): values = [values] if field in data_attrs: attr = data_attrs[field] else: attr = None if len(list(values)) > 0: tm.annotate(field, best_match_xpath_annotation(field), attr=attr, best_match=best_match) template = tm.get_template() # remove xpath annotations from resulting html page, # cause they will interfere with IBL Extractor template = filter_annotation_from_template(template) self.add_template(template) def train_from_htmlpage(self, htmlpage, data, data_attrs=None): self._train_implementation(htmlpage, data, data_attrs, repeated=False) def train_from_htmlpage_repeated(self, htmlpage, data, data_attrs=None): self._train_implementation(htmlpage, data, data_attrs, repeated=True) def scrape_page2(self, page, fields_spec): if self._ex is None: self._ex = InstanceBasedLearningExtractor( ((t, get_visual_tool_item_descriptor(fields_spec)) for t in self._templates), False, True) res = self._ex.extract(page)[0] return res
def test_raw_type_w_regex(self): schema = { "id": "test", "properties": [('gender', { 'description': '', 'optional': True, 'type': 'raw', 'vary': False, })], } descriptor = create_slybot_item_descriptor(schema) extractors = {1: { "_id": 1, "field_name": "gender", "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)" }} apply_extractors(descriptor, [1], extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'<td >Male</td>']})
def _run_extraction(self, name, templates, page, descriptor, expected_output): self.trace = None template_pages = [HtmlPage(None, {}, t) for t in templates] # extracts with trace enabled in order to generate traceback extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True) actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) if actual_output is not None: actual_output = actual_output[0] self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace') # extracts again with trace disabled in order to get the pure output extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages]) actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) if actual_output is None: if expected_output is None: return assert False, "failed to extract data for test '%s'" % name else: actual_output = actual_output[0] expected_names = set(expected_output.keys()) actual_names = set(actual_output.keys()) missing_in_output = filter(None, expected_names - actual_names) error = "attributes '%s' were expected but were not present in test '%s'" % \ ("', '".join(missing_in_output), name) assert len(missing_in_output) == 0, error unexpected = actual_names - expected_names error = "unexpected attributes %s in test '%s'" % \ (', '.join(unexpected), name) assert len(unexpected) == 0, error for k, v in expected_output.items(): extracted = actual_output[k] assert v == extracted, "in test '%s' for attribute '%s', " \ "expected value '%s' but got '%s'" % (name, k, v, extracted)
def test_raw_type_w_regex(self): schema = { 'fields': { 'gender': { 'required': False, 'type': 'raw', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)" } } apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual( ibl_extractor.extract(self.target)[0][0], {u'gender': [u'<td >Male</td>']})
class Scraper(object): def __init__(self, templates=None): """Initialize an empty scraper.""" self._templates = templates or [] self._ex = None @classmethod def fromfile(cls, file): """Initialize a scraper from a file previously stored by tofile() method. """ templates = [HtmlPage(**x) for x in json.load(file)['templates']] return cls(templates) def tofile(self, file): """Store the scraper into the given file-like object""" tpls = [page_to_dict(x) for x in self._templates] json.dump({'templates': tpls}, file) def add_template(self, template): self._templates.append(template) self._ex = None def train_from_htmlpage(self, htmlpage, data): assert data, "Cannot train with empty data" tm = TemplateMaker(htmlpage) for field, values in data.items(): if (isinstance(values, (bytes, str)) or not hasattr(values, '__iter__')): values = [values] for value in values: value = str_to_unicode(value, htmlpage.encoding) tm.annotate(field, best_match(value)) self.add_template(tm.get_template()) def train(self, url, data, encoding=None): page = url_to_page(url, encoding) self.train_from_htmlpage(page, data) def scrape(self, url, encoding=None): page = url_to_page(url, encoding) return self.scrape_page(page) def scrape_page(self, page): if self._ex is None: self._ex = InstanceBasedLearningExtractor((t, None) for t in self._templates) return self._ex.extract(page)[0]
class Scraper(object): def __init__(self, templates=None): """Initialize an empty scraper.""" self._templates = templates or [] self._ex = None @classmethod def fromfile(cls, file): """Initialize a scraper from a file previously stored by tofile() method. """ templates = [HtmlPage(**x) for x in json.load(file)['templates']] return cls(templates) def tofile(self, file): """Store the scraper into the given file-like object""" tpls = [page_to_dict(x) for x in self._templates] json.dump({'templates': tpls}, file) def add_template(self, template): self._templates.append(template) self._ex = None def train_from_htmlpage(self, htmlpage, data): assert data, "Cannot train with empty data" tm = TemplateMaker(htmlpage) for field, values in data.items(): if not hasattr(values, '__iter__'): values = [values] for value in values: if isinstance(value, str): value = value.decode(htmlpage.encoding or 'utf-8') tm.annotate(field, best_match(value)) self.add_template(tm.get_template()) def train(self, url, data, encoding=None): page = url_to_page(url, encoding) self.train_from_htmlpage(page, data) def scrape(self, url, encoding=None): page = url_to_page(url, encoding) return self.scrape_page(page) def scrape_page(self, page): if self._ex is None: self._ex = InstanceBasedLearningExtractor((t, None) for t in self._templates) return self._ex.extract(page)[0]
def test_extractor_w_empty_string_extraction(self): schema = { 'fields': { 'gender': { 'required': False, 'type': 'text', 'vary': False, }, 'name': { 'required': True, 'type': 'text', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = {1: {"regular_expression": "([0-9]+)"}} apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)]) self.assertEqual( ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
def setup_bot(self, settings, spec, items, extractors): """ Perform any initialization needed for crawling using this plugin """ _item_template_pages = sorted(( [t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', [])] for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) self.itemcls_info = {} if settings.get('AUTO_PAGINATION'): self.html_link_extractor = PaginationExtractor() else: self.html_link_extractor = HtmlLinkExtractor() for itemclass_name, triplets in groupby(_item_template_pages, itemgetter(0)): page_extractors_pairs = map(itemgetter(1, 2), triplets) schema = items[itemclass_name] item_cls = SlybotItem.create_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec)
def setup_bot(self, settings, spec, items, extractors): """ Perform any initialization needed for crawling using this plugin """ _item_template_pages = sorted(( [t.get('scrapes'), dict_to_page(t, 'annotated_body'), t.get('extractors', [])] for t in spec['templates'] if t.get('page_type', 'item') == 'item' )) self.item_classes = {} self.html_link_extractor = HtmlLinkExtractor() for schema_name, schema in items.items(): if schema_name not in self.item_classes: if not schema.get('name'): schema['name'] = schema_name item_cls = SlybotItem.create_iblitem_class(schema) self.item_classes[schema_name] = item_cls # Create descriptors and apply additional extractors to fields page_descriptor_pairs = [] for default, template, template_extractors in _item_template_pages: descriptors = OrderedDict() for schema_name, schema in items.items(): item_descriptor = create_slybot_item_descriptor(schema, schema_name) apply_extractors(item_descriptor, template_extractors, extractors) descriptors[schema_name] = item_descriptor descriptor = descriptors.values() or [{}] descriptors['#default'] = descriptors.get(default, descriptor[0]) page_descriptor_pairs.append((template, descriptors)) self.extractors = SlybotIBLExtractor(page_descriptor_pairs) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec)
(u'MEASURE_PROD',u'1 g', '', None), ] items = list( map( lambda (n,s,desc): (n,s,u'%s'%(desc.name,), types[desc.typ.symbol]), ((n,s,KeyValueDescriptor.objects.get(symbol = n)) for n,s,_,_ in items if KeyValueDescriptor.objects.filter(symbol = n).exists() ) ) ) idesc = ItemDescriptor('', '', [ FieldDescriptor(n, desc, extractor=fnc(s)) for n,s,desc,fnc in items ] ) ts = load_templates('scraper.json', 'site-%d'%site.id) if not ts: ts = annotate(url, 'site-%d'%site.id, [ (n,s) for n,s,_,_ in items ]) tmpls += [ (tm, idesc) for tm in ts ] ex = InstanceBasedLearningExtractor(tmpls) urls = ( 'http://www.rc-chem.eu/doprava', # should fail 'http://www.rc-chem.eu/produkty/2-fma', 'http://www.rc-chem.eu/produkty/3-fmc', 'http://www.rc-chem.eu/produkty/3-mmc-crystal', 'http://www.rc-chem.eu/produkty/4-fa-crystal', 'http://www.rc-chem.eu/produkty/dimethylone', 'http://www.rc-chem.eu/produkty/ethylphenidate', 'http://www.rc-chem.eu/produkty/mpa', 'http://www.rc-chem.eu/produkty/neb', 'http://www.rc-chem.eu/produkty/pentedrone-velky-crystal', 'http://www.rc-chem.eu/produkty/thio-crystal', 'http://www.rc-chem.eu/produkty/thio-velky-crystal', 'http://mefedronprodej.webnode.cz/produkty-1/',
def scrape_page(self, page): if self._ex is None: self._ex = InstanceBasedLearningExtractor((t, None) for t in self._templates) return self._ex.extract(page)[0]
map( lambda (n, s, desc): (n, s, u'%s' % (desc.name, ), types[desc.typ.symbol]), ((n, s, KeyValueDescriptor.objects.get(symbol=n)) for n, s, _, _ in items if KeyValueDescriptor.objects.filter(symbol=n).exists()))) idesc = ItemDescriptor('', '', [ FieldDescriptor(n, desc, extractor=fnc(s)) for n, s, desc, fnc in items ]) ts = load_templates('scraper.json', 'site-%d' % site.id) if not ts: ts = annotate(url, 'site-%d' % site.id, [(n, s) for n, s, _, _ in items]) tmpls += [(tm, idesc) for tm in ts] ex = InstanceBasedLearningExtractor(tmpls) urls = ( 'http://www.rc-chem.eu/doprava', # should fail 'http://www.rc-chem.eu/produkty/2-fma', 'http://www.rc-chem.eu/produkty/3-fmc', 'http://www.rc-chem.eu/produkty/3-mmc-crystal', 'http://www.rc-chem.eu/produkty/4-fa-crystal', 'http://www.rc-chem.eu/produkty/dimethylone', 'http://www.rc-chem.eu/produkty/ethylphenidate', 'http://www.rc-chem.eu/produkty/mpa', 'http://www.rc-chem.eu/produkty/neb', 'http://www.rc-chem.eu/produkty/pentedrone-velky-crystal', 'http://www.rc-chem.eu/produkty/thio-crystal', 'http://www.rc-chem.eu/produkty/thio-velky-crystal', 'http://mefedronprodej.webnode.cz/produkty-1/',
class IblSpider(BaseSpider): def __init__(self, name, spec, item_schemas, all_extractors, **kw): super(IblSpider, self).__init__(name, **kw) default_item = spec['scrapes'] self._default_schema = item_schemas[default_item] if not self._default_schema: self.log("Scraping unknown default item schema: %s" % default_item, \ log.WARNING) self._item_template_pages = sorted(( [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'), t.get('extractors', [])] \ for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'id': "_links", 'properties': ()}) self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self._ipages = [page for _, page, _ in self._item_template_pages] self._fpages = [ dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type', 'item') == 'form' ] self.start_urls = self.start_urls or spec.get('start_urls') if isinstance(self.start_urls, basestring): self.start_urls = self.start_urls.splitlines() self.link_extractor = LinkExtractor() self.allowed_domains = self._get_allowed_domains(self._ipages) self.build_url_filter(spec) default_item_cls = get_iblitem_class(self._default_schema) default_item_descriptor = create_slybot_item_descriptor(self._default_schema) self.itemcls_info = {} for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)): page_extractors_pairs = map(operator.itemgetter(1, 2), triplets) schema = item_schemas[itemclass_name] item_cls = get_iblitem_class(schema) if schema else default_item_cls page_descriptor_pairs = [] for page, extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor apply_extractors(item_descriptor, extractors, all_extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } def _get_allowed_domains(self, templates): urls = [x.url for x in templates] urls += self.start_urls return [x[1] for x in iter_unique_scheme_netloc(urls)] def _get_form_requests(self, templates): reqs = [] # TODO: filter unique schema netlocs? 
for t in templates: # assume all templates are html and unicode response = HtmlResponse(t.url, encoding='utf-8', body=t.body, headers=t.headers) request = FormRequest.from_response(response, formname='SLYBOT-FORM', callback=self.parse, dont_filter=True) reqs.append(request) return reqs def _get_item_requests(self, templates): reqs = [] urls = [x.url for x in templates] for scheme, netloc in iter_unique_scheme_netloc(urls): r = Request("%s://%s/" % (scheme, netloc), callback=self.parse, \ dont_filter=True) reqs.append(r) return reqs def _requests_to_follow(self, htmlpage): requests = [] if self._links_ibl_extractor is not None: extracted = self._links_ibl_extractor.extract(htmlpage)[0] if extracted: extracted_regions = extracted[0].get('_links', []) seen = set() for region in extracted_regions: htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, region, encoding=htmlpage.encoding) for request in self._request_to_follow_from_region(htmlregion): if request.url in seen: continue seen.add(request.url) requests.append(request) else: requests = self._request_to_follow_from_region(htmlpage) return requests def _request_to_follow_from_region(self, htmlregion): requests = [] seen = set() for link in self.link_extractor.links_to_follow(htmlregion): url = link.url if self.url_filterf(link): # filter out duplicate urls, later we should handle link text if url in seen: continue seen.add(url) request = Request(url) if link.text: request.meta['link_text'] = link.text requests.append(request) return requests def start_requests(self): if self.start_urls: return [Request(r, callback=self.parse, dont_filter=True) \ for r in self.start_urls] if self._fpages: return self._get_form_requests(self._fpages) return self._get_item_requests(self._ipages) def parse(self, response): """Main handler for all downloaded responses""" if isinstance(response, HtmlResponse): return self.handle_html(response) else: content_type = response.headers.get('Content-Type') self.log("Ignoring page with content-type=%r: %s" % (content_type, \ response.url), level=log.DEBUG) def _process_link_regions(self, htmlpage, link_regions): """Process link regions if any, and generate requests""" requests_to_follow = [] if link_regions: for link_region in link_regions: htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, \ link_region, encoding=htmlpage.encoding) requests_to_follow.extend(self._requests_to_follow(htmlregion)) else: requests_to_follow = self._requests_to_follow(htmlpage) return requests_to_follow def handle_html(self, response): htmlpage = HtmlPage(response.url, response.headers, \ response.body_as_unicode(), encoding=response.encoding) items, link_regions = self.extract_items(htmlpage) requests_to_follow = self._process_link_regions(htmlpage, link_regions) return requests_to_follow + items def extract_items(self, htmlpage): """This method is also called from UI webservice to extract items""" items = [] link_regions = [] for item_cls_name, info in self.itemcls_info.iteritems(): item_descriptor = info['descriptor'] extractor = info['extractor'] extracted, _link_regions = self._do_extract_items_from( htmlpage, item_descriptor, extractor, item_cls_name, ) items.extend(extracted) link_regions.extend(_link_regions) return items, link_regions def _do_extract_items_from(self, htmlpage, item_descriptor, extractor, item_cls_name): extracted_data, template = extractor.extract(htmlpage) link_regions = [] for ddict in extracted_data or []: link_regions.extend(ddict.pop("_links", [])) processed_data = _process_extracted_data(extracted_data, 
item_descriptor, htmlpage) items = [] item_cls = self.itemcls_info[item_cls_name]['class'] for processed_attributes in processed_data: item = item_cls(processed_attributes) item['url'] = htmlpage.url item['_type'] = item_cls_name item['_template'] = template.id items.append(item) return items, link_regions def build_url_filter(self, spec): """make a filter for links""" respect_nofollow = spec.get('respect_nofollow', True) patterns = spec.get('follow_patterns') if spec.get("links_to_follow") == "none": url_filterf = lambda x: False elif patterns: pattern = patterns[0] if len(patterns) == 1 else "(?:%s)" % '|'.join(patterns) follow_pattern = re.compile(pattern) if respect_nofollow: url_filterf = lambda x: follow_pattern.search(x.url) and not x.nofollow else: url_filterf = lambda x: follow_pattern.search(x.url) elif respect_nofollow: url_filterf = lambda x: not x.nofollow else: url_filterf = bool # apply exclude patterns exclude_patterns = spec.get('exclude_patterns') if exclude_patterns: pattern = exclude_patterns[0] if len(exclude_patterns) == 1 else "(?:%s)" % '|'.join(exclude_patterns) exclude_pattern = re.compile(pattern) self.url_filterf = lambda x: not exclude_pattern.search(x.url) and url_filterf(x) else: self.url_filterf = url_filterf
class Annotations(object): """ Base Class for adding plugins to Portia Web and Slybot. """ def setup_bot(self, settings, spec, items, extractors, logger): """ Perform any initialization needed for crawling using this plugin """ self.logger = logger templates = map(self._get_annotated_template, spec['templates']) _item_template_pages = sorted(([ t.get('scrapes'), dict_to_page(t, 'annotated_body'), t.get('extractors', []), t.get('version', '0.12.0') ] for t in templates if t.get('page_type', 'item') == 'item'), key=lambda x: x[0]) self.item_classes = {} self.template_scrapes = { template.get('page_id'): template['scrapes'] for template in templates } if (settings.get('AUTO_PAGINATION') or spec.get('links_to_follow') == 'auto'): self.html_link_extractor = PaginationExtractor() else: self.html_link_extractor = HtmlLinkExtractor() for schema_name, schema in items.items(): if schema_name not in self.item_classes: if not schema.get('name'): schema['name'] = schema_name item_cls = SlybotItem.create_iblitem_class(schema) self.item_classes[schema_name] = item_cls # Create descriptors and apply additional extractors to fields page_descriptor_pairs = [] self.schema_descriptors = {} for default, template, template_extractors, v in _item_template_pages: descriptors = OrderedDict() for schema_name, schema in items.items(): item_descriptor = create_slybot_item_descriptor( schema, schema_name) apply_extractors(item_descriptor, template_extractors, extractors) descriptors[schema_name] = item_descriptor descriptor = descriptors.values() or [{}] descriptors['#default'] = descriptors.get(default, descriptor[0]) self.schema_descriptors[template.page_id] = descriptors['#default'] page_descriptor_pairs.append((template, descriptors, v)) add_extractors_to_descriptors(descriptors, extractors) grouped = itertools.groupby( sorted(page_descriptor_pairs, key=operator.itemgetter(2)), lambda x: x[2] < '0.13.0') self.extractors = [] for version, group in grouped: if version: self.extractors.append( InstanceBasedLearningExtractor([ (page, scrapes['#default']) for page, scrapes, version in group ])) else: self.extractors.append(SlybotIBLExtractor(list(group))) # generate ibl extractor for links pages _links_pages = [ dict_to_page(t, 'annotated_body') for t in templates if t.get('page_type') == 'links' ] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec) # Clustering self.template_names = [t.get('page_id') for t in spec['templates']] if settings.get('PAGE_CLUSTERING'): try: import page_clustering self.clustering = page_clustering.kmeans_from_samples( spec['templates']) self.logger.info("Clustering activated") except ImportError: self.clustering = None self.logger.warning( "Clustering could not be used because it is not installed") else: self.clustering = None def _get_annotated_template(self, template): if template.get('version', '0.12.0') >= '0.13.0': _build_sample(template) return template def handle_html(self, response, seen=None): htmlpage = htmlpage_from_response(response) items, link_regions = self.extract_items(htmlpage) htmlpage.headers['n_items'] = len(items) try: response.meta['n_items'] = len(items) except AttributeError: pass # response not tied to any request for item in items: yield item for request in self._process_link_regions(htmlpage, link_regions): yield request def extract_items(self, htmlpage): """This method is also 
called from UI webservice to extract items""" for extractor in self.extractors: items, links = self._do_extract_items_from(htmlpage, extractor) if items: return items, links return [], [] def _do_extract_items_from(self, htmlpage, extractor): # Try to predict template to use pref_template_id = None template_cluster = _CLUSTER_NA if self.clustering: self.clustering.add_page(htmlpage) if self.clustering.is_fit: clt = self.clustering.classify(htmlpage) if clt != -1: template_cluster = self.template_names[clt] pref_template_id = template_cluster else: template_cluster = _CLUSTER_OUTLIER extracted_data, template = extractor.extract(htmlpage, pref_template_id) link_regions = [] for ddict in extracted_data or []: link_regions.extend(ddict.pop("_links", [])) descriptor = None unprocessed = False if template is not None and hasattr(template, 'descriptor'): descriptor = template.descriptor() if hasattr(descriptor, 'name'): item_cls_name = descriptor.name elif hasattr(descriptor, 'get'): item_cls_name = descriptor.get('name', descriptor.get('display_name')) else: item_cls_name = '' else: unprocessed = True try: descriptor = self.schema_descriptors[template.id] item_cls_name = self.template_scrapes[template.id] except AttributeError: descriptor = sorted(self.schema_descriptors.items())[0][1] item_cls_name = sorted(self.template_scrapes.items())[0][1] item_cls = self.item_classes.get(item_cls_name) items = [] for processed_attributes in extracted_data or []: if processed_attributes.get('_type') in self.item_classes: _type = processed_attributes['_type'] item = self.item_classes[_type](processed_attributes) item['_type'] = item.display_name() elif unprocessed: item = self._process_attributes(processed_attributes, descriptor, htmlpage) if item_cls: item = item_cls(item) elif item_cls: item = item_cls(processed_attributes) else: item = dict(processed_attributes) item['url'] = htmlpage.url item['_template'] = str(template.id) item.setdefault('_type', item_cls_name) if not isinstance(item, SlybotItem): default_meta = { 'type': 'text', 'required': False, 'vary': False } item_cls = SlybotItem.create_iblitem_class( {'fields': {k: default_meta for k in item}}) item = item_cls(**item) if self.clustering: item['_template_cluster'] = template_cluster items.append(item) return items, link_regions def _process_attributes(self, item, descriptor, htmlpage): new_item = {} try: attr_map = descriptor.attribute_map except AttributeError: attr_map = {} page = getattr(htmlpage, 'htmlpage', htmlpage) for field, value in item.items(): if field.startswith('_sticky'): continue if field == 'variants': value = [ self._process_attributes(v, descriptor, page) for v in value ] elif field in attr_map: value = [attr_map[field].adapt(v, page) for v in value] new_item[field] = value return new_item def build_url_filter(self, spec): """make a filter for links""" respect_nofollow = spec.get('respect_nofollow', True) if spec.get("links_to_follow") == "none": url_filterf = lambda x: False elif spec.get("links_to_follow") == "all": if respect_nofollow: url_filterf = lambda x: x.nofollow else: url_filterf = lambda x: True else: # patterns patterns = spec.get('follow_patterns') excludes = spec.get('exclude_patterns') pattern_fn = include_exclude_filter(patterns, excludes) if respect_nofollow: url_filterf = lambda x: not x.nofollow and pattern_fn(x.url) else: url_filterf = lambda x: pattern_fn(x.url) self.url_filterf = url_filterf def _filter_link(self, link, seen): url = link.url if self.url_filterf(link): # filter out duplicate urls, later we 
should handle link text if url not in seen: seen.add(url) request = Request(url) if link.text: request.meta['link_text'] = link.text return request def _process_link_regions(self, htmlpage, link_regions): """Process link regions if any, and generate requests""" if link_regions: for link_region in link_regions: htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, link_region, encoding=htmlpage.encoding) for request in self._requests_to_follow(htmlregion): yield request else: for request in self._requests_to_follow(htmlpage): yield request def _requests_to_follow(self, htmlpage): if self._links_ibl_extractor is not None: extracted = self._links_ibl_extractor.extract(htmlpage)[0] if extracted: extracted_regions = extracted[0].get('_links', []) seen = set() for region in extracted_regions: htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, region, encoding=htmlpage.encoding) for request in self._request_to_follow_from_region( htmlregion): if request.url in seen: continue seen.add(request.url) yield request else: for request in self._request_to_follow_from_region(htmlpage): yield request def _request_to_follow_from_region(self, htmlregion): seen = set() for link in self.html_link_extractor.links_to_follow(htmlregion): request = self._filter_link(link, seen) if request is not None: yield request def handle_xml(self, response, seen): _type = XML_APPLICATION_TYPE(response.headers.get('Content-Type', '')) _type = _type.groupdict()['type'] if _type else 'xml' try: link_extractor = create_linkextractor_from_specs({ 'type': _type, 'value': '' }) except ValueError: link_extractor = SitemapLinkExtractor() for link in link_extractor.links_to_follow(response): request = self._filter_link(link, seen) if request: yield request
def setup_bot(self, settings, spec, items, extractors, logger): """ Perform any initialization needed for crawling using this plugin """ self.logger = logger templates = map(self._get_annotated_template, spec['templates']) _item_template_pages = sorted(([ t.get('scrapes'), dict_to_page(t, 'annotated_body'), t.get('extractors', []), t.get('version', '0.12.0') ] for t in templates if t.get('page_type', 'item') == 'item'), key=lambda x: x[0]) self.item_classes = {} self.template_scrapes = { template.get('page_id'): template['scrapes'] for template in templates } if (settings.get('AUTO_PAGINATION') or spec.get('links_to_follow') == 'auto'): self.html_link_extractor = PaginationExtractor() else: self.html_link_extractor = HtmlLinkExtractor() for schema_name, schema in items.items(): if schema_name not in self.item_classes: if not schema.get('name'): schema['name'] = schema_name item_cls = SlybotItem.create_iblitem_class(schema) self.item_classes[schema_name] = item_cls # Create descriptors and apply additional extractors to fields page_descriptor_pairs = [] self.schema_descriptors = {} for default, template, template_extractors, v in _item_template_pages: descriptors = OrderedDict() for schema_name, schema in items.items(): item_descriptor = create_slybot_item_descriptor( schema, schema_name) apply_extractors(item_descriptor, template_extractors, extractors) descriptors[schema_name] = item_descriptor descriptor = descriptors.values() or [{}] descriptors['#default'] = descriptors.get(default, descriptor[0]) self.schema_descriptors[template.page_id] = descriptors['#default'] page_descriptor_pairs.append((template, descriptors, v)) add_extractors_to_descriptors(descriptors, extractors) grouped = itertools.groupby( sorted(page_descriptor_pairs, key=operator.itemgetter(2)), lambda x: x[2] < '0.13.0') self.extractors = [] for version, group in grouped: if version: self.extractors.append( InstanceBasedLearningExtractor([ (page, scrapes['#default']) for page, scrapes, version in group ])) else: self.extractors.append(SlybotIBLExtractor(list(group))) # generate ibl extractor for links pages _links_pages = [ dict_to_page(t, 'annotated_body') for t in templates if t.get('page_type') == 'links' ] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec) # Clustering self.template_names = [t.get('page_id') for t in spec['templates']] if settings.get('PAGE_CLUSTERING'): try: import page_clustering self.clustering = page_clustering.kmeans_from_samples( spec['templates']) self.logger.info("Clustering activated") except ImportError: self.clustering = None self.logger.warning( "Clustering could not be used because it is not installed") else: self.clustering = None
def setup_bot(self, settings, spec, items, extractors, logger): """ Perform any initialization needed for crawling using this plugin """ self.logger = logger templates = map(self._get_annotated_template, spec['templates']) _item_template_pages = sorted(( [t.get('scrapes'), dict_to_page(t, 'annotated_body'), t.get('extractors', []), t.get('version', '0.12.0')] for t in templates if t.get('page_type', 'item') == 'item' ), key=lambda x: x[0]) self.item_classes = {} self.template_scrapes = {template.get('page_id'): template['scrapes'] for template in templates} if (settings.get('AUTO_PAGINATION') or spec.get('links_to_follow') == 'auto'): self.html_link_extractor = PaginationExtractor() else: self.html_link_extractor = HtmlLinkExtractor() for schema_name, schema in items.items(): if schema_name not in self.item_classes: if not schema.get('name'): schema['name'] = schema_name item_cls = SlybotItem.create_iblitem_class(schema) self.item_classes[schema_name] = item_cls # Create descriptors and apply additional extractors to fields page_descriptor_pairs = [] self.schema_descriptors = {} for default, template, template_extractors, v in _item_template_pages: descriptors = OrderedDict() for schema_name, schema in items.items(): item_descriptor = create_slybot_item_descriptor(schema, schema_name) apply_extractors(item_descriptor, template_extractors, extractors) descriptors[schema_name] = item_descriptor descriptor = descriptors.values() or [{}] descriptors['#default'] = descriptors.get(default, descriptor[0]) self.schema_descriptors[template.page_id] = descriptors['#default'] page_descriptor_pairs.append((template, descriptors, v)) add_extractors_to_descriptors(descriptors, extractors) grouped = itertools.groupby(sorted(page_descriptor_pairs, key=operator.itemgetter(2)), lambda x: x[2] < '0.13.0') self.extractors = [] for version, group in grouped: if version: self.extractors.append( InstanceBasedLearningExtractor( [(page, scrapes['#default']) for page, scrapes, version in group])) else: self.extractors.append(SlybotIBLExtractor(list(group))) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in templates if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec) # Clustering self.template_names = [t.get('page_id') for t in spec['templates']] if settings.get('PAGE_CLUSTERING'): try: import page_clustering self.clustering = page_clustering.kmeans_from_samples(spec['templates']) self.logger.info("Clustering activated") except ImportError: self.clustering = None self.logger.warning( "Clustering could not be used because it is not installed") else: self.clustering = None
def __init__(self, name, spec, item_schemas, all_extractors, **kw): super(IblSpider, self).__init__(name, **kw) self._item_template_pages = sorted(( [t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', [])] \ for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self._ipages = [page for _, page, _ in self._item_template_pages] self.start_urls = self.start_urls or spec.get('start_urls') if isinstance(self.start_urls, basestring): self.start_urls = self.start_urls.splitlines() self.html_link_extractor = HtmlLinkExtractor() self.rss_link_extractor = RssLinkExtractor() self.allowed_domains = spec.get('allowed_domains', self._get_allowed_domains(self._ipages)) if not self.allowed_domains: self.allowed_domains = None self.build_url_filter(spec) self.itemcls_info = {} for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)): page_extractors_pairs = map(operator.itemgetter(1, 2), triplets) schema = item_schemas[itemclass_name] item_cls = get_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, all_extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } self.login_requests = [] self.form_requests = [] for rdata in spec.get("init_requests", []): if rdata["type"] == "login": request = Request(url=rdata.pop("loginurl"), meta=rdata, callback=self.parse_login_page, dont_filter=True) self.login_requests.append(request) elif rdata["type"] == "form": self.generic_form = GenericForm(**kw) self.form_requests.append(self.get_generic_form_start_request(rdata))
class Annotations(object):
    """
    Base Class for adding plugins to Portia Web and Slybot.
    """

    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in spec.get('templates')}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))

        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)

    def handle_html(self, response, seen=None):
        htmlpage = htmlpage_from_response(response)
        items, link_regions = self.extract_items(htmlpage)
        htmlpage.headers['n_items'] = len(items)
        try:
            response.meta['n_items'] = len(items)
        except AttributeError:
            pass  # response not tied to any request
        for item in items:
            yield item
        for request in self._process_link_regions(htmlpage, link_regions):
            yield request

    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        for extractor in self.extractors:
            items, links = self._do_extract_items_from(htmlpage, extractor)
            if items:
                return items, links
        return [], []

    def _do_extract_items_from(self, htmlpage, extractor):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        descriptor = None
        unprocessed = False
        if template is not None and hasattr(template, 'descriptor'):
            descriptor = template.descriptor()
            item_cls_name = descriptor.name if descriptor is not None else ''
        else:
            unprocessed = True
            try:
                descriptor = self.schema_descriptors[template.id]
                item_cls_name = self.template_scrapes[template.id]
            except AttributeError:
                descriptor = sorted(self.schema_descriptors.items())[0][1]
                item_cls_name = sorted(self.template_scrapes.items())[0][1]
        item_cls = self.item_classes.get(item_cls_name)
        items = []
        for processed_attributes in extracted_data or []:
            if processed_attributes.get('_type') in self.item_classes:
                _type = processed_attributes['_type']
                item = self.item_classes[_type](processed_attributes)
                item['_type'] = item.display_name()
            elif unprocessed:
                item = self._process_attributes(processed_attributes,
                                                descriptor, htmlpage)
                if item_cls:
                    item = item_cls(item)
            elif item_cls:
                item = item_cls(processed_attributes)
            else:
                item = dict(processed_attributes)
            item['url'] = htmlpage.url
            item['_template'] = str(template.id)
            item.setdefault('_type', item_cls_name)
            if not isinstance(item, SlybotItem):
                default_meta = {'type': 'text', 'required': False,
                                'vary': False}
                item_cls = SlybotItem.create_iblitem_class(
                    {'fields': {k: default_meta for k in item}}
                )
                item = item_cls(**item)
            items.append(item)
        return items, link_regions

    def _process_attributes(self, item, descriptor, htmlpage):
        new_item = {}
        attr_map = descriptor.attribute_map
        for field, value in item.items():
            if field.startswith('_sticky'):
                continue
            if field == 'variants':
                value = [self._process_attributes(v, descriptor, htmlpage)
                         for v in value]
            elif field in attr_map:
                value = [attr_map[field].adapt(v, htmlpage) for v in value]
            new_item[field] = value
        return new_item

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get('respect_nofollow', True)
        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif spec.get("links_to_follow") == "all":
            if respect_nofollow:
                url_filterf = lambda x: not x.nofollow
            else:
                url_filterf = lambda x: True
        else:  # patterns
            patterns = spec.get('follow_patterns')
            excludes = spec.get('exclude_patterns')
            pattern_fn = include_exclude_filter(patterns, excludes)
            if respect_nofollow:
                url_filterf = lambda x: not x.nofollow and pattern_fn(x.url)
            else:
                url_filterf = lambda x: pattern_fn(x.url)
        self.url_filterf = url_filterf

    def _filter_link(self, link, seen):
        url = link.url
        if self.url_filterf(link):
            # filter out duplicate urls, later we should handle link text
            if url not in seen:
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta['link_text'] = link.text
                return request

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                      link_region,
                                      encoding=htmlpage.encoding)
                for request in self._requests_to_follow(htmlregion):
                    yield request
        else:
            for request in self._requests_to_follow(htmlpage):
                yield request

    def _requests_to_follow(self, htmlpage):
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get('_links', [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                          region,
                                          encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(
                            htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        yield request
        else:
            for request in self._request_to_follow_from_region(htmlpage):
                yield request

    def _request_to_follow_from_region(self, htmlregion):
        seen = set()
        for link in self.html_link_extractor.links_to_follow(htmlregion):
            request = self._filter_link(link, seen)
            if request is not None:
                yield request

    def handle_xml(self, response, seen):
        _type = XML_APPLICATION_TYPE(response.headers.get('Content-Type', ''))
        _type = _type.groupdict()['type'] if _type else 'xml'
        try:
            link_extractor = create_linkextractor_from_specs({
                'type': _type, 'value': ''
            })
        except ValueError:
            link_extractor = SitemapLinkExtractor()
        for link in link_extractor.links_to_follow(response):
            request = self._filter_link(link, seen)
            if request:
                yield request
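The Content-Type sniffing in handle_xml can be approximated standalone. XML_APPLICATION_TYPE is assumed here to be a compiled-regex matcher exposing a 'type' named group, so the exact pattern slybot uses may differ from the one below.

import re

# Approximation of the Content-Type matcher assumed by handle_xml above.
xml_application_type = re.compile(
    r'application/(?:(?P<type>[a-z]+)\+)?xml').match

for header in ('application/rss+xml; charset=utf-8',
               'application/atom+xml',
               'application/xml',
               'text/html'):
    m = xml_application_type(header)
    print("%s -> %s" % (header, m.groupdict()['type'] if m else 'xml'))
# rss and atom feeds report their subtype, plain application/xml yields None,
# and a header that does not match at all falls back to 'xml'.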
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)

    default_item = spec['scrapes']
    self._default_schema = item_schemas[default_item]
    if not self._default_schema:
        self.log("Scraping unknown default item schema: %s" % default_item,
                 log.WARNING)

    self._item_template_pages = sorted((
        [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [
        dict_to_page(t, 'annotated_body')
        for t in spec['templates'] if t.get('page_type') == 'links'
    ]
    _links_item_descriptor = create_slybot_item_descriptor({
        'id': "_links", 'properties': ()
    })
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self._ipages = [page for _, page, _ in self._item_template_pages]

    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()

    self.link_extractor = LinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    self.build_url_filter(spec)

    default_item_cls = get_iblitem_class(self._default_schema)
    default_item_descriptor = create_slybot_item_descriptor(
        self._default_schema)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema) if schema else default_item_cls

        page_descriptor_pairs = []
        for page, extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema) \
                if schema else default_item_descriptor
            apply_extractors(item_descriptor, extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))

        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self.login_requests = []
    self.form_requests = []
    for rdata in spec.get("init_requests", []):
        if rdata["type"] == "login":
            request = Request(url=rdata.pop("loginurl"), meta=rdata,
                              callback=self.parse_login_page)
            self.login_requests.append(request)
        elif rdata["type"] == "form":
            request = Request(url=rdata.pop("form_url"), meta=rdata,
                              callback=self.parse_form_page)
            self.form_requests.append(request)
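A hedged example of the init_requests entries consumed by the loop above. Only the "type", "loginurl" and "form_url" keys are taken from the code; the credential fields are illustrative.

# Illustrative only: made-up spec entries shaped like those the loop reads.
init_requests = [
    {"type": "login", "loginurl": "http://example.com/login",
     "username": "user", "password": "secret"},
    {"type": "form", "form_url": "http://example.com/search"},
]
for rdata in init_requests:
    if rdata["type"] == "login":
        url = rdata.pop("loginurl")   # remaining keys ride along as meta
    elif rdata["type"] == "form":
        url = rdata.pop("form_url")
    print("%s request to %s" % (rdata["type"], url))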
class Annotations(object):
    """
    Base Class for adding plugins to Portia Web and Slybot.
    """

    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])
        self.itemcls_info = {}
        if settings.get('AUTO_PAGINATION'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for itemclass_name, triplets in groupby(_item_template_pages,
                                                itemgetter(0)):
            page_extractors_pairs = map(itemgetter(1, 2), triplets)
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)

    def handle_html(self, response, seen=None):
        htmlpage = htmlpage_from_response(response)
        items, link_regions = self.extract_items(htmlpage)
        htmlpage.headers['n_items'] = len(items)
        try:
            response.meta['n_items'] = len(items)
        except AttributeError:
            pass  # response not tied to any request
        for item in items:
            yield item
        for request in self._process_link_regions(htmlpage, link_regions):
            yield request

    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        items = []
        link_regions = []
        for item_cls_name, info in self.itemcls_info.items():
            item_descriptor = info['descriptor']
            extractor = info['extractor']
            extracted, _link_regions = self._do_extract_items_from(
                htmlpage,
                item_descriptor,
                extractor,
                item_cls_name,
            )
            items.extend(extracted)
            link_regions.extend(_link_regions)
        return items, link_regions

    def _do_extract_items_from(self, htmlpage, item_descriptor, extractor,
                               item_cls_name):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        processed_data = _process_extracted_data(extracted_data,
                                                 item_descriptor,
                                                 htmlpage)
        items = []
        item_cls = self.itemcls_info[item_cls_name]['class']
        for processed_attributes in processed_data:
            item = item_cls(processed_attributes)
            item['url'] = htmlpage.url
            item['_type'] = item_cls_name
            item['_template'] = str(template.id)
            items.append(item)
        return items, link_regions

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get('respect_nofollow', True)
        patterns = spec.get('follow_patterns')
        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif patterns:
            pattern = patterns[0] if len(patterns) == 1 \
                else "(?:%s)" % '|'.join(patterns)
            follow_pattern = re.compile(pattern)
            if respect_nofollow:
                url_filterf = lambda x: follow_pattern.search(x.url) \
                    and not x.nofollow
            else:
                url_filterf = lambda x: follow_pattern.search(x.url)
        elif respect_nofollow:
            url_filterf = lambda x: not x.nofollow
        else:
            url_filterf = bool
        # apply exclude patterns
        excludes = spec.get('exclude_patterns')
        if excludes:
            pattern = excludes[0] if len(excludes) == 1 \
                else "(?:%s)" % '|'.join(excludes)
            exclude_pattern = re.compile(pattern)
            self.url_filterf = lambda x: not exclude_pattern.search(x.url) \
                and url_filterf(x)
        else:
            self.url_filterf = url_filterf

    def _filter_link(self, link, seen):
        url = link.url
        if self.url_filterf(link):
            # filter out duplicate urls, later we should handle link text
            if url not in seen:
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta['link_text'] = link.text
                return request

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                      link_region,
                                      encoding=htmlpage.encoding)
                for request in self._requests_to_follow(htmlregion):
                    yield request
        else:
            for request in self._requests_to_follow(htmlpage):
                yield request

    def _requests_to_follow(self, htmlpage):
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get('_links', [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                          region,
                                          encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(
                            htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        yield request
        else:
            for request in self._request_to_follow_from_region(htmlpage):
                yield request

    def _request_to_follow_from_region(self, htmlregion):
        seen = set()
        for link in self.html_link_extractor.links_to_follow(htmlregion):
            request = self._filter_link(link, seen)
            if request is not None:
                yield request

    def handle_xml(self, response, seen):
        _type = XML_APPLICATION_TYPE(response.headers.get('Content-Type', ''))
        _type = _type.groupdict()['type'] if _type else 'xml'
        try:
            link_extractor = create_linkextractor_from_specs({
                'type': _type, 'value': ''
            })
        except ValueError:
            link_extractor = SitemapLinkExtractor()
        for link in link_extractor.links_to_follow(response):
            request = self._filter_link(link, seen)
            if request:
                yield request
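A standalone sketch of how follow_patterns, exclude_patterns and respect_nofollow combine in build_url_filter, written against plain URLs rather than slybot link objects; the spec values are hypothetical.

import re

def make_url_filter(spec):
    # Mirrors the follow/exclude logic of build_url_filter for plain URLs.
    patterns = spec.get('follow_patterns') or []
    excludes = spec.get('exclude_patterns') or []
    follow = re.compile("(?:%s)" % '|'.join(patterns)) if patterns else None
    exclude = re.compile("(?:%s)" % '|'.join(excludes)) if excludes else None
    respect_nofollow = spec.get('respect_nofollow', True)

    def accept(url, nofollow=False):
        if respect_nofollow and nofollow:
            return False
        if exclude is not None and exclude.search(url):
            return False
        return bool(follow.search(url)) if follow is not None else True

    return accept

# Hypothetical spec values.
accept = make_url_filter({
    'follow_patterns': [r'/category/', r'/product/\d+'],
    'exclude_patterns': [r'\?sort='],
    'respect_nofollow': True,
})
print(accept('http://example.com/product/42'))           # True
print(accept('http://example.com/product/42?sort=asc'))  # False: excluded
print(accept('http://example.com/about'))                # False: no follow match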
def setup_bot(self, settings, spec, items, extractors):
    """
    Perform any initialization needed for crawling using this plugin
    """
    _item_template_pages = sorted((
        [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
         t.get('extractors', []), t.get('version', '0.12.0')]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda x: x[0])
    self.item_classes = {}
    self.template_scrapes = {
        template.get('page_id'): template['scrapes']
        for template in spec.get('templates')
    }
    self.html_link_extractor = HtmlLinkExtractor()
    for schema_name, schema in items.items():
        if schema_name not in self.item_classes:
            if not schema.get('name'):
                schema['name'] = schema_name
            item_cls = SlybotItem.create_iblitem_class(schema)
            self.item_classes[schema_name] = item_cls

    # Create descriptors and apply additional extractors to fields
    page_descriptor_pairs = []
    self.schema_descriptors = {}
    for default, template, template_extractors, v in _item_template_pages:
        descriptors = OrderedDict()
        for schema_name, schema in items.items():
            item_descriptor = create_slybot_item_descriptor(schema,
                                                            schema_name)
            apply_extractors(item_descriptor, template_extractors,
                             extractors)
            descriptors[schema_name] = item_descriptor
        descriptor = descriptors.values() or [{}]
        descriptors['#default'] = descriptors.get(default, descriptor[0])
        self.schema_descriptors[template.page_id] = descriptors['#default']
        page_descriptor_pairs.append((template, descriptors, v))
        add_extractors_to_descriptors(descriptors, extractors)

    grouped = itertools.groupby(
        sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
        lambda x: x[2] < '0.13.0')
    self.extractors = []
    for version, group in grouped:
        if version:
            self.extractors.append(
                InstanceBasedLearningExtractor([
                    (page, scrapes['#default'])
                    for page, scrapes, version in group
                ]))
        else:
            self.extractors.append(SlybotIBLExtractor(list(group)))

    # generate ibl extractor for links pages
    _links_pages = [
        dict_to_page(t, 'annotated_body')
        for t in spec['templates'] if t.get('page_type') == 'links'
    ]
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self.build_url_filter(spec)
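A tiny illustration (made-up template names and version strings) of the version grouping above: the (template, descriptors, version) tuples are sorted by version and split at '0.13.0', with each side handled by a different extractor class. Note the split is a lexicographic comparison on version strings.

import itertools

# Made-up template names and version strings.
pairs = [('tmpl-a', '0.12.0'), ('tmpl-b', '0.13.1'), ('tmpl-c', '0.12.5')]
for is_legacy, group in itertools.groupby(
        sorted(pairs, key=lambda x: x[1]), lambda x: x[1] < '0.13.0'):
    print("%s: %s" % ('legacy' if is_legacy else 'current',
                      [name for name, _ in group]))
# legacy: ['tmpl-a', 'tmpl-c']
# current: ['tmpl-b']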
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)

    default_item = spec['scrapes']
    self._default_schema = item_schemas[default_item]
    if not self._default_schema:
        self.log("Scraping unknown default item schema: %s" % default_item,
                 log.WARNING)

    self._item_template_pages = sorted((
        [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor(
        {'id': "_links", 'properties': ()})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self._ipages = [page for _, page, _ in self._item_template_pages]
    self._fpages = [
        dict_to_page(t, 'annotated_body')
        for t in spec['templates'] if t.get('page_type', 'item') == 'form'
    ]

    self._start_urls = spec.get('start_urls')

    self.link_extractor = LinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)

    # make a filter for links
    respect_nofollow = spec.get('respect_nofollow', True)
    patterns = spec.get('follow_patterns')
    if patterns:
        pattern = patterns[0] if len(patterns) == 1 \
            else "(?:%s)" % '|'.join(patterns)
        follow_pattern = re.compile(pattern)
        if respect_nofollow:
            url_filterf = lambda x: follow_pattern.search(x.url) \
                and not x.nofollow
        else:
            url_filterf = lambda x: follow_pattern.search(x.url)
    elif respect_nofollow:
        url_filterf = lambda x: not x.nofollow
    else:
        url_filterf = bool
    # apply exclude patterns
    exclude_patterns = spec.get('exclude_patterns')
    if exclude_patterns:
        pattern = exclude_patterns[0] if len(exclude_patterns) == 1 \
            else "(?:%s)" % '|'.join(exclude_patterns)
        exclude_pattern = re.compile(pattern)
        self.url_filterf = lambda x: not exclude_pattern.search(x.url) \
            and url_filterf(x)
    else:
        self.url_filterf = url_filterf

    default_item_cls = get_iblitem_class(self._default_schema)
    default_item_descriptor = create_slybot_item_descriptor(
        self._default_schema)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema) if schema else default_item_cls

        page_descriptor_pairs = []
        for page, extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema) \
                if schema else default_item_descriptor
            apply_extractors(item_descriptor, extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))

        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self._itemversion_cache = {}
class Annotations(object):
    """
    Base Class for adding plugins to Portia Web and Slybot.
    """

    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])
        self.itemcls_info = {}
        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        for itemclass_name, triplets in groupby(_item_template_pages,
                                                itemgetter(0)):
            page_extractors_pairs = map(itemgetter(1, 2), triplets)
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates'] if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)

    def handle_html(self, response):
        htmlpage = htmlpage_from_response(response)
        items, link_regions = self.extract_items(htmlpage)
        for item in items:
            yield item
        for request in self._process_link_regions(htmlpage, link_regions):
            yield request

    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        items = []
        link_regions = []
        for item_cls_name, info in self.itemcls_info.iteritems():
            item_descriptor = info['descriptor']
            extractor = info['extractor']
            extracted, _link_regions = self._do_extract_items_from(
                htmlpage,
                item_descriptor,
                extractor,
                item_cls_name,
            )
            items.extend(extracted)
            link_regions.extend(_link_regions)
        return items, link_regions

    def _do_extract_items_from(self, htmlpage, item_descriptor, extractor,
                               item_cls_name):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        processed_data = _process_extracted_data(extracted_data,
                                                 item_descriptor,
                                                 htmlpage)
        items = []
        item_cls = self.itemcls_info[item_cls_name]['class']
        for processed_attributes in processed_data:
            item = item_cls(processed_attributes)
            item['url'] = htmlpage.url
            item['_type'] = item_cls_name
            item['_template'] = str(template.id)
            items.append(item)
        return items, link_regions

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get('respect_nofollow', True)
        patterns = spec.get('follow_patterns')
        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif patterns:
            pattern = patterns[0] if len(patterns) == 1 \
                else "(?:%s)" % '|'.join(patterns)
            follow_pattern = re.compile(pattern)
            if respect_nofollow:
                url_filterf = lambda x: follow_pattern.search(x.url) \
                    and not x.nofollow
            else:
                url_filterf = lambda x: follow_pattern.search(x.url)
        elif respect_nofollow:
            url_filterf = lambda x: not x.nofollow
        else:
            url_filterf = bool
        # apply exclude patterns
        excludes = spec.get('exclude_patterns')
        if excludes:
            pattern = excludes[0] if len(excludes) == 1 \
                else "(?:%s)" % '|'.join(excludes)
            exclude_pattern = re.compile(pattern)
            self.url_filterf = lambda x: not exclude_pattern.search(x.url) \
                and url_filterf(x)
        else:
            self.url_filterf = url_filterf

    def _filter_link(self, link, seen):
        url = link.url
        if self.url_filterf(link):
            # filter out duplicate urls, later we should handle link text
            if url not in seen:
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta['link_text'] = link.text
                return request

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                      link_region,
                                      encoding=htmlpage.encoding)
                for request in self._requests_to_follow(htmlregion):
                    yield request
        else:
            for request in self._requests_to_follow(htmlpage):
                yield request

    def _requests_to_follow(self, htmlpage):
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get('_links', [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                          region,
                                          encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(
                            htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        yield request
        else:
            for request in self._request_to_follow_from_region(htmlpage):
                yield request

    def _request_to_follow_from_region(self, htmlregion):
        seen = set()
        for link in self.html_link_extractor.links_to_follow(htmlregion):
            request = self._filter_link(link, seen)
            if request is not None:
                yield request

    def handle_rss(self, response, seen):
        for link in self.rss_link_extractor.links_to_follow(response):
            request = self._filter_link(link, seen)
            if request:
                yield request