def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    # Spider arguments arrive as strings (e.g. from scrapyd); split the
    # list-valued settings on newlines and let them override the spec.
    for key, val in kw.items():
        if isinstance(val, basestring) and key in ['start_urls',
                'exclude_patterns', 'follow_patterns', 'allowed_domains']:
            val = val.splitlines()
        spec[key] = val

    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates'] if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = (
        InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages])
        if _links_pages else None
    )

    self._ipages = [page for _, page, _ in self._item_template_pages]

    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.build_url_filter(spec)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = SlybotItem.create_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors,
                             all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            # note: the descriptor kept here is the one built for the
            # last template page of this item class
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get('allowed_domains',
                                    self._get_allowed_domains(self._ipages))
    if not self.allowed_domains:
        self.allowed_domains = None
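A minimal sketch of the `spec` dict the constructor above consumes, inferred from the keys it reads (`templates`, `init_requests`, `allowed_domains`, `start_urls`, the per-template `scrapes`, `page_type`, `annotated_body`, and `extractors`); the concrete values are illustrative assumptions, not part of the source:

# Hypothetical spec, shaped after the keys the constructor reads.
sample_spec = {
    'start_urls': ['http://www.example.com/'],
    'allowed_domains': ['example.com'],
    'templates': [
        {
            'page_type': 'item',          # default when the key is absent
            'scrapes': 'product',         # item class this template yields
            'url': 'http://www.example.com/p/1',
            'annotated_body': '<html>annotated sample page</html>',
            'extractors': [],
        },
        {
            'page_type': 'links',         # feeds _links_ibl_extractor
            'url': 'http://www.example.com/list',
            'annotated_body': '<html>annotated listing page</html>',
        },
    ],
    'init_requests': [],
}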
def setup_bot(self, settings, spec, items, extractors):
    """
    Perform any initialization needed for crawling using this plugin
    """
    _item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])
    self.itemcls_info = {}
    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    for itemclass_name, triplets in groupby(_item_template_pages,
                                            itemgetter(0)):
        page_extractors_pairs = map(itemgetter(1, 2), triplets)
        schema = items[itemclass_name]
        item_cls = SlybotItem.create_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors, extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = (
        InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages])
        if _links_pages else None
    )
    self.build_url_filter(spec)
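The `items` argument maps each template's `scrapes` name to a schema. The only shape the code itself reveals is a dict with a 'fields' mapping (see the empty `{'fields': {}}` descriptor built for links pages); the per-field attributes below are illustrative assumptions:

# Hypothetical item schemas keyed by the name each template 'scrapes'.
# Only the 'fields' key is evidenced by the code above; the per-field
# attributes are assumptions for illustration.
sample_items = {
    'product': {
        'fields': {
            'name': {'type': 'text', 'required': True, 'vary': False},
            'price': {'type': 'price', 'required': False, 'vary': False},
        },
    },
}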
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)

    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates'] if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = (
        InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages])
        if _links_pages else None
    )

    self._ipages = [page for _, page, _ in self._item_template_pages]

    # spider arguments take precedence over the spec; a string value
    # (e.g. from scrapyd) is split into one URL per line
    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()

    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.allowed_domains = spec.get('allowed_domains',
                                    self._get_allowed_domains(self._ipages))
    if not self.allowed_domains:
        self.allowed_domains = None
    self.build_url_filter(spec)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors,
                             all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self.login_requests = []
    self.form_requests = []
    # login requests carry the remaining request data in meta for
    # parse_login_page; form requests go through GenericForm
    for rdata in spec.get("init_requests", []):
        if rdata["type"] == "login":
            request = Request(url=rdata.pop("loginurl"), meta=rdata,
                              callback=self.parse_login_page,
                              dont_filter=True)
            self.login_requests.append(request)
        elif rdata["type"] == "form":
            self.generic_form = GenericForm(**kw)
            self.form_requests.append(
                self.get_generic_form_start_request(rdata))
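The `init_requests` loop above implies entries like the following. Only 'type' and 'loginurl' are read directly by this snippet; the credential keys and the form fields are assumptions about what the downstream callbacks consume:

# Hypothetical init_requests entries matching the two branches above.
sample_init_requests = [
    {
        'type': 'login',
        'loginurl': 'http://www.example.com/login',  # popped into Request.url
        # remaining keys travel in Request.meta to parse_login_page;
        # these two names are assumptions, not read in this snippet
        'username': 'user',
        'password': 'secret',
    },
    {
        'type': 'form',
        # handed as-is to get_generic_form_start_request(rdata); its
        # expected fields are not visible in this snippet
        'form_url': 'http://www.example.com/search',
    },
]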
def test_rss(self):
    lextractor = RssLinkExtractor()
    links = list(lextractor.links_to_follow(self.response))
    self.assertEqual(len(links), 1)
    self.assertEqual(links[0].url, 'http://www.wikipedia.org/')
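The test assumes a `self.response` fixture built elsewhere. A minimal `setUp` sketch that would satisfy the assertions, assuming the extractor follows per-<item> links in an RSS feed; the feed body is invented for illustration:

from scrapy.http import XmlResponse

# One <item> link, matching the single URL the assertions expect.
RSS_BODY = """<?xml version="1.0"?>
<rss version="2.0">
  <channel>
    <title>Sample feed</title>
    <link>http://www.example.com/</link>
    <item>
      <title>An entry</title>
      <link>http://www.wikipedia.org/</link>
    </item>
  </channel>
</rss>"""

def setUp(self):
    self.response = XmlResponse(url='http://www.example.com/feed.rss',
                                body=RSS_BODY)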