def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    """Configure the spider from its project *spec*.

    Builds, in order: the default item schema, the sorted item-template
    triplets, the optional links-page IBL extractor, the form pages,
    start URLs, the link extractor / allowed domains / URL filter, and
    finally one (class, descriptor, extractor) record per item class
    in ``self.itemcls_info``.
    """
    super(IblSpider, self).__init__(name, **kw)
    default_item = spec['scrapes']
    self._default_schema = item_schemas[default_item]
    if not self._default_schema:
        self.log("Scraping unknown default item schema: %s" % default_item,
                 log.WARNING)

    # One [scrapes, annotated page, extractor ids] triplet per "item"
    # template, sorted by item-class name so groupby below sees each
    # class as a single contiguous run.
    templates = [
        [tpl.get('scrapes', default_item),
         dict_to_page(tpl, 'annotated_body'),
         tpl.get('extractors', [])]
        for tpl in spec['templates']
        if tpl.get('page_type', 'item') == 'item'
    ]
    templates.sort(key=operator.itemgetter(0))
    self._item_template_pages = templates

    # generate ibl extractor for links pages
    links_pages = [dict_to_page(tpl, 'annotated_body')
                   for tpl in spec['templates']
                   if tpl.get('page_type') == 'links']
    links_descriptor = create_slybot_item_descriptor({'id': "_links",
                                                      'properties': ()})
    if links_pages:
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(page, links_descriptor) for page in links_pages])
    else:
        self._links_ibl_extractor = None

    self._ipages = [page for _, page, _ in self._item_template_pages]
    self._fpages = [dict_to_page(tpl, 'annotated_body')
                    for tpl in spec['templates']
                    if tpl.get('page_type', 'item') == 'form']

    # Spider-argument start_urls win over the spec; a plain string is
    # treated as one URL per line.
    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()

    self.link_extractor = LinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    self.build_url_filter(spec)

    default_item_cls = get_iblitem_class(self._default_schema)
    default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

    self.itemcls_info = {}
    for itemclass_name, group in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema) if schema else default_item_cls
        page_descriptor_pairs = []
        for _, page, extractors in group:
            if schema:
                item_descriptor = create_slybot_item_descriptor(schema)
            else:
                item_descriptor = default_item_descriptor
            apply_extractors(item_descriptor, extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        # NOTE(review): 'descriptor' keeps the descriptor of the LAST
        # template page in the group — matches existing behaviour.
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': InstanceBasedLearningExtractor(page_descriptor_pairs),
        }
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    """Initialize the spider from its project *spec*.

    Sets up the default item schema, the per-item-class template pages
    and IBL extractors (``self.itemcls_info``), the optional extractor
    for "links" pages, form pages, start URLs, the link extractor and
    the URL filter.
    """
    super(IblSpider, self).__init__(name, **kw)
    default_item = spec["scrapes"]
    self._default_schema = item_schemas[default_item]
    if not self._default_schema:
        # Best-effort: warn but continue with a falsy schema; the
        # default item class/descriptor below still get built from it.
        self.log("Scraping unknown default item schema: %s" % default_item, log.WARNING)
    # [scrapes, annotated page, extractor ids] per "item" template,
    # sorted by item-class name so groupby below groups correctly.
    self._item_template_pages = sorted(
        (
            [t.get("scrapes", default_item), dict_to_page(t, "annotated_body"), t.get("extractors", [])]
            for t in spec["templates"]
            if t.get("page_type", "item") == "item"
        ),
        key=lambda pair: pair[0],
    )
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type") == "links"]
    _links_item_descriptor = create_slybot_item_descriptor({"id": "_links", "properties": ()})
    # Only build the links extractor when the spec has "links" templates.
    self._links_ibl_extractor = (
        InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) if _links_pages else None
    )
    self._ipages = [page for _, page, _ in self._item_template_pages]
    self._fpages = [dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type", "item") == "form"]
    # Spider-argument start_urls take precedence over the spec's; a
    # bare string is split into one URL per line.
    self.start_urls = self.start_urls or spec.get("start_urls")
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()
    self.link_extractor = LinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    self.build_url_filter(spec)
    default_item_cls = get_iblitem_class(self._default_schema)
    default_item_descriptor = create_slybot_item_descriptor(self._default_schema)
    self.itemcls_info = {}
    # Build one (class, descriptor, extractor) record per item class;
    # classes without their own schema fall back to the default one.
    for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema) if schema else default_item_cls
        page_descriptor_pairs = []
        for page, extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
            apply_extractors(item_descriptor, extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        # NOTE(review): "descriptor" holds the last page's descriptor
        # from the loop above — presumably intentional; confirm.
        self.itemcls_info[itemclass_name] = {
            "class": item_cls,
            "descriptor": item_descriptor,
            "extractor": extractor,
        }
    self._itemversion_cache = {}
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    """Initialize the spider from its project *spec*.

    Sets up the default item schema, the per-item-class template pages
    and IBL extractors (``self.itemcls_info``), the optional extractor
    for "links" pages, form pages, start URLs, and the URL filter
    (follow/exclude patterns plus nofollow handling).
    """
    super(IblSpider, self).__init__(name, **kw)
    default_item = spec['scrapes']
    self._default_schema = item_schemas[default_item]
    if not self._default_schema:
        # Best-effort: warn but keep going with the falsy schema.
        self.log("Scraping unknown default item schema: %s" % default_item,
                 log.WARNING)
    # [scrapes, annotated page, extractor ids] per "item" template,
    # sorted by item-class name so groupby below groups correctly.
    self._item_template_pages = sorted((
        [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])
    # generate ibl extractor for links pages
    # BUG FIX: templates are tagged page_type == 'links' (not '_links');
    # the old comparison never matched, so _links_pages was always empty
    # and the links extractor was never built.
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates'] if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'id': "_links",
                                                            'properties': ()})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self._ipages = [page for _, page, _ in self._item_template_pages]
    self._fpages = [
        dict_to_page(t, 'annotated_body')
        for t in spec['templates'] if t.get('page_type', 'item') == 'form'
    ]
    self._start_urls = spec.get('start_urls')
    self.link_extractor = LinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    # make a filter for links: follow_patterns narrow what is followed,
    # respect_nofollow (default True) additionally drops rel=nofollow links.
    respect_nofollow = spec.get('respect_nofollow', True)
    patterns = spec.get('follow_patterns')
    if patterns:
        pattern = patterns[0] if len(patterns) == 1 \
            else "(?:%s)" % '|'.join(patterns)
        follow_pattern = re.compile(pattern)
        if respect_nofollow:
            url_filterf = lambda x: follow_pattern.search(x.url) and not x.nofollow
        else:
            url_filterf = lambda x: follow_pattern.search(x.url)
    elif respect_nofollow:
        url_filterf = lambda x: not x.nofollow
    else:
        url_filterf = bool
    # apply exclude patterns on top of the follow filter
    exclude_patterns = spec.get('exclude_patterns')
    if exclude_patterns:
        pattern = exclude_patterns[0] if len(exclude_patterns) == 1 \
            else "(?:%s)" % '|'.join(exclude_patterns)
        exclude_pattern = re.compile(pattern)
        self.url_filterf = lambda x: not exclude_pattern.search(x.url) \
            and url_filterf(x)
    else:
        self.url_filterf = url_filterf
    default_item_cls = get_iblitem_class(self._default_schema)
    default_item_descriptor = create_slybot_item_descriptor(self._default_schema)
    self.itemcls_info = {}
    # One (class, descriptor, extractor) record per scraped item class;
    # classes without their own schema fall back to the default one.
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema) if schema else default_item_cls
        page_descriptor_pairs = []
        for page, extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema) \
                if schema else default_item_descriptor
            apply_extractors(item_descriptor, extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        # NOTE(review): 'descriptor' keeps the last page's descriptor
        # from the loop above — presumably intentional; confirm.
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
    self._itemversion_cache = {}