def setup_bot(self, settings, spec, items, extractors):
    """
    Perform any initialization needed for crawling using this plugin
    """
    _item_template_pages = sorted((
        [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
         t.get('extractors', []), t.get('version', '0.12.0')]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda x: x[0])
    self.item_classes = {}
    self.template_scrapes = {template.get('page_id'): template['scrapes']
                             for template in spec.get('templates')}
    self.html_link_extractor = HtmlLinkExtractor()
    for schema_name, schema in items.items():
        if schema_name not in self.item_classes:
            if not schema.get('name'):
                schema['name'] = schema_name
            item_cls = SlybotItem.create_iblitem_class(schema)
            self.item_classes[schema_name] = item_cls

    # Create descriptors and apply additional extractors to fields
    page_descriptor_pairs = []
    self.schema_descriptors = {}
    for default, template, template_extractors, v in _item_template_pages:
        descriptors = OrderedDict()
        for schema_name, schema in items.items():
            item_descriptor = create_slybot_item_descriptor(schema,
                                                            schema_name)
            apply_extractors(item_descriptor, template_extractors,
                             extractors)
            descriptors[schema_name] = item_descriptor
        descriptor = descriptors.values() or [{}]
        descriptors['#default'] = descriptors.get(default, descriptor[0])
        self.schema_descriptors[template.page_id] = descriptors['#default']
        page_descriptor_pairs.append((template, descriptors, v))
        add_extractors_to_descriptors(descriptors, extractors)

    grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                       key=operator.itemgetter(2)),
                                lambda x: x[2] < '0.13.0')
    self.extractors = []
    for version, group in grouped:
        if version:
            self.extractors.append(
                InstanceBasedLearningExtractor(
                    [(page, scrapes['#default'])
                     for page, scrapes, version in group]))
        else:
            self.extractors.append(SlybotIBLExtractor(list(group)))

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self.build_url_filter(spec)
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)

    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self._ipages = [page for _, page, _ in self._item_template_pages]

    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()

    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    self.build_url_filter(spec)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema)

        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors,
                             all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))

        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self.login_requests = []
    self.form_requests = []
    for rdata in spec.get("init_requests", []):
        if rdata["type"] == "login":
            request = Request(url=rdata.pop("loginurl"), meta=rdata,
                              callback=self.parse_login_page,
                              dont_filter=True)
            self.login_requests.append(request)
        elif rdata["type"] == "form":
            self.generic_form = GenericForm(**kw)
            self.form_requests.append(
                self.get_generic_form_start_request(rdata))
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, basestring) and key in ['start_urls',
                                                   'exclude_patterns',
                                                   'follow_patterns',
                                                   'allowed_domains']:
            val = val.splitlines()
        spec[key] = val
    self.i = time.time()
    self.getProxyList()

    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self._ipages = [page for _, page, _ in self._item_template_pages]

    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.build_url_filter(spec)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = SlybotItem.create_iblitem_class(schema)

        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors,
                             all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))

        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains', self._get_allowed_domains(self._ipages))
    if not self.allowed_domains:
        self.allowed_domains = None
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)

    default_item = spec['scrapes']
    self._default_schema = item_schemas[default_item]
    if not self._default_schema:
        self.log("Scraping unknown default item schema: %s" % default_item,
                 log.WARNING)

    self._item_template_pages = sorted((
        [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor(
        {'id': "_links", 'properties': ()})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self._ipages = [page for _, page, _ in self._item_template_pages]

    self._fpages = [
        dict_to_page(t, 'annotated_body')
        for t in spec['templates']
        if t.get('page_type', 'item') == 'form'
    ]

    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()

    self.link_extractor = LinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    self.build_url_filter(spec)

    default_item_cls = get_iblitem_class(self._default_schema)
    default_item_descriptor = create_slybot_item_descriptor(
        self._default_schema)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema) if schema else default_item_cls

        page_descriptor_pairs = []
        for page, extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema) \
                if schema else default_item_descriptor
            apply_extractors(item_descriptor, extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))

        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, basestring) and key in ['start_urls',
                                                   'exclude_patterns',
                                                   'follow_patterns',
                                                   'allowed_domains']:
            val = val.splitlines()
        spec[key] = val

    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self._ipages = [page for _, page, _ in self._item_template_pages]

    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.build_url_filter(spec)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = SlybotItem.create_iblitem_class(schema)

        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors,
                             all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))

        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains', self._get_allowed_domains(self._ipages))
    if not self.allowed_domains:
        self.allowed_domains = None
def setup_bot(self, settings, spec, items, extractors): """ Perform any initialization needed for crawling using this plugin """ _item_template_pages = sorted(([ t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', []) ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'), key=lambda pair: pair[0]) self.itemcls_info = {} if settings.get('AUTO_PAGINATION'): self.html_link_extractor = PaginationExtractor() else: self.html_link_extractor = HtmlLinkExtractor() for itemclass_name, triplets in groupby(_item_template_pages, itemgetter(0)): page_extractors_pairs = map(itemgetter(1, 2), triplets) schema = items[itemclass_name] item_cls = SlybotItem.create_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } # generate ibl extractor for links pages _links_pages = [ dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links' ] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec)
def test_extractor_w_empty_string_extraction(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "regular_expression": "([0-9]+)"
        }
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)

    ibl_extractor = SlybotIBLExtractor([
        (self.template2, {'#default': descriptor}, '0.12.0')])
    self.assertEqual(ibl_extractor.extract(self.target2)[0][0]['name'],
                     [u'Name Olivia'])
def test_type_extractor(self): schema = { "fields": { 'gender': { 'required': False, 'type': 'number', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "type_extractor": "text" }, 2: { "regular_expression": "Gender\\s+(Male|Female)" } } apply_extractors(descriptor, {"gender": [1, 2]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual( ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def test_extractor_w_empty_string_extraction(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "regular_expression": "([0-9]+)"
        }
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)

    ibl_extractor = InstanceBasedLearningExtractor([(self.template2,
                                                     descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target2)[0][0],
                     {u'name': [u'Name Olivia']})
def test_type_extractor(self): schema = { "id": "test", "properties": [('gender', { 'description': '', 'optional': True, 'type': 'number', 'vary': False, })], } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "_id": 1, "field_name": "gender", "type_extractor": "text" }, 2: { "_id": 2, "field_name": "gender", "regular_expression": "Gender\\s+(Male|Female)" } } apply_extractors(descriptor, [1, 2], extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def test_extractor_w_empty_string_extraction(self): schema = { "id": "test", "properties": [ ('gender', { 'description': '', 'optional': True, 'type': 'text', 'vary': False, }), ('name', { 'description': '', 'optional': False, 'type': 'text', 'vary': False, }), ], } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "_id": 2, "field_name": "gender", "regular_expression": "([0-9]+)" } } apply_extractors(descriptor, [1], extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
def test_type_extractor(self): schema = { "fields": { 'gender': { 'required': False, 'type': 'number', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "type_extractor": "text" }, 2: { "regular_expression": "Gender\\s+(Male|Female)" } } apply_extractors(descriptor, {"gender": [1, 2]}, extractors) ibl_extractor = SlybotIBLExtractor([(self.template, { '#default': descriptor }, '0.12.0')]) self.assertEqual( ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])
def setup_bot(self, settings, spec, items, extractors): """ Perform any initialization needed for crawling using this plugin """ _item_template_pages = sorted(( [t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', [])] for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) self.itemcls_info = {} if settings.get('AUTO_PAGINATION'): self.html_link_extractor = PaginationExtractor() else: self.html_link_extractor = HtmlLinkExtractor() for itemclass_name, triplets in groupby(_item_template_pages, itemgetter(0)): page_extractors_pairs = map(itemgetter(1, 2), triplets) schema = items[itemclass_name] item_cls = SlybotItem.create_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec)
def setup_bot(self, settings, spec, items, extractors): """ Perform any initialization needed for crawling using this plugin """ _item_template_pages = sorted( ( [t["scrapes"], dict_to_page(t, "annotated_body"), t.get("extractors", [])] for t in spec["templates"] if t.get("page_type", "item") == "item" ), key=lambda pair: pair[0], ) self.itemcls_info = {} self.html_link_extractor = HtmlLinkExtractor() self.rss_link_extractor = RssLinkExtractor() for itemclass_name, triplets in groupby(_item_template_pages, itemgetter(0)): page_extractors_pairs = map(itemgetter(1, 2), triplets) schema = items[itemclass_name] item_cls = SlybotItem.create_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { "class": item_cls, "descriptor": item_descriptor, "extractor": extractor, } # generate ibl extractor for links pages _links_pages = [dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type") == "links"] _links_item_descriptor = create_slybot_item_descriptor({"fields": {}}) self._links_ibl_extractor = ( InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) if _links_pages else None ) self.build_url_filter(spec)
def setup_bot(self, settings, spec, items, extractors): """ Perform any initialization needed for crawling using this plugin """ _item_template_pages = sorted(( [t.get('scrapes'), dict_to_page(t, 'annotated_body'), t.get('extractors', [])] for t in spec['templates'] if t.get('page_type', 'item') == 'item' )) self.item_classes = {} self.html_link_extractor = HtmlLinkExtractor() for schema_name, schema in items.items(): if schema_name not in self.item_classes: if not schema.get('name'): schema['name'] = schema_name item_cls = SlybotItem.create_iblitem_class(schema) self.item_classes[schema_name] = item_cls # Create descriptors and apply additional extractors to fields page_descriptor_pairs = [] for default, template, template_extractors in _item_template_pages: descriptors = OrderedDict() for schema_name, schema in items.items(): item_descriptor = create_slybot_item_descriptor(schema, schema_name) apply_extractors(item_descriptor, template_extractors, extractors) descriptors[schema_name] = item_descriptor descriptor = descriptors.values() or [{}] descriptors['#default'] = descriptors.get(default, descriptor[0]) page_descriptor_pairs.append((template, descriptors)) self.extractors = SlybotIBLExtractor(page_descriptor_pairs) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec)
def test_default_type_extractor(self):
    schema = {'fields': {}}
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
    apply_extractors(descriptor, {"gender": [1]}, extractors)

    ibl_extractor = InstanceBasedLearningExtractor([(self.template,
                                                     descriptor)])
    self.assertEqual(
        ibl_extractor.extract(self.target)[0][0],
        {u'gender': [u'Male']})
def test_per_annotation_extractors(self):
    schema = {
        'fields': {
            'url': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    extractors = {
        '1': {
            'type_extractor': 'url'
        },
        '2': {
            'regular_expression': '(.*)\.html'
        },
        '3': {
            'regular_expression': 'Name: (.*)'
        },
        '4': {
            'type_extractor': 'text'
        },
        '5': {
            'type_extractor': 'price'
        },
        '6': {
            'type_extractor': 'number'
        },
        '7': {
            'type_extractor': 'date'
        },
        '8': {
            'regular_expression': '(\d+)-'
        }
    }
    descriptors = {'#default': create_slybot_item_descriptor(schema)}
    add_extractors_to_descriptors(descriptors, extractors)

    ibl_extractor = SlybotIBLExtractor([
        (self.template3, descriptors, '0.13.0')])
    result = {
        u'_template': '6223d000057491040e4f411cf1f0734ea802eeb6',
        'name': [u'Olivia'],
        'url': [u'http://www.test.com/olivia'],
        'title': [u'Name: Olivia'],
        'price': [u'2016'],
        'date': [datetime(2016, 3, 17, 20, 25)]
    }
    data = ibl_extractor.extract(self.target3)[0][0]
    self.assertEqual(data, result)
def test_default_type_extractor(self):
    schema = {'fields': {}}
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
    apply_extractors(descriptor, {"gender": [1]}, extractors)

    ibl_extractor = SlybotIBLExtractor([
        (self.template, {'#default': descriptor}, '0.12.0')])
    self.assertEqual(
        ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])
def test_per_annotation_extractors(self):
    schema = {
        'fields': {
            'url': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    extractors = {
        '1': {'type_extractor': 'url'},
        '2': {'regular_expression': '(.*)\.html'},
        '3': {'regular_expression': 'Name: (.*)'},
        '4': {'type_extractor': 'text'},
        '5': {'type_extractor': 'price'},
        '6': {'type_extractor': 'number'},
        '7': {'type_extractor': 'date'},
        '8': {'regular_expression': '(\d+)-'}
    }
    descriptors = {'#default': create_slybot_item_descriptor(schema)}
    add_extractors_to_descriptors(descriptors, extractors)

    ibl_extractor = SlybotIBLExtractor([
        (self.template3, descriptors, '0.13.0')
    ])
    result = {'name': [u'Olivia'],
              'url': [u'http://www.test.com/olivia'],
              'title': [u'Name: Olivia'],
              'price': [u'2016'],
              'date': [datetime(2016, 3, 17, 20, 25)]}
    data = ibl_extractor.extract(self.target3)[0][0]
    del data['_template']
    self.assertEqual(data, result)
def test_default_type_extractor(self): schema = { 'fields': {} } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: {"regular_expression": "Gender\\s+(Male|Female)"} } apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = SlybotIBLExtractor([ (self.template, {'#default': descriptor}, '0.12.0')]) self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])
def test_default_type_extractor(self): schema = { 'fields': {} } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "regular_expression": "Gender\\s+(Male|Female)" } } apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def test_extract_single_attribute_to_multiple_fields(self):
    extractors = {'1': {'regular_expression': '(.*)\s'},
                  '2': {'regular_expression': '\s(.*)'}}
    descriptors = {'#default': create_slybot_item_descriptor({'fields': {
        'full_name': {'type': 'text', 'required': False, 'vary': False},
        'first_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': u'prénom'},
        'last_name': {'type': 'text', 'required': False, 'vary': False,
                      'name': 'nom'},
        'address': {'type': 'text', 'required': False, 'vary': False}}})}
    add_extractors_to_descriptors(descriptors, extractors)
    extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
    data = extractor.extract(page_411)[0][1]
    self.assertEqual(data['full_name'], [u'Joe Smith'])
    self.assertEqual(data[u'prénom'], [u'Joe'])
    self.assertEqual(data['nom'], [u'Smith'])
def test_extract_single_attribute_to_multiple_fields(self):
    extractors = {'1': {'regular_expression': '(.*)\s'},
                  '2': {'regular_expression': '\s(.*)'}}
    descriptors = {'#default': create_slybot_item_descriptor({'fields': {
        'full_name': {'type': 'text', 'required': False, 'vary': False},
        'first_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': u'prénom'},
        'last_name': {'type': 'text', 'required': False, 'vary': False,
                      'name': 'nom'},
        'address': {'type': 'text', 'required': False, 'vary': False}}})}
    add_extractors_to_descriptors(descriptors, extractors)
    extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
    data = extractor.extract(page_411)[0]
    self.assertEqual(data[1]['full_name'], [u'Joe Smith'])
    self.assertEqual(data[1][u'prénom'], [u'Joe'])
    self.assertEqual(data[1]['nom'], [u'Smith'])
def test_negative_hit_w_regex(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'number',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
    apply_extractors(descriptor, {"gender": [1]}, extractors)

    ibl_extractor = SlybotIBLExtractor([
        (self.template, {'#default': descriptor}, '0.12.0')])
    self.assertEqual(ibl_extractor.extract(self.target)[0], None)
def test_text_type_w_regex(self): schema = { "fields": { 'gender': { 'required': False, 'type': 'text', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}} apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = SlybotIBLExtractor([ (self.template, {'#default': descriptor}, '0.12.0')]) self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])
def test_default_type_extractor(self): schema = { "id": "test", "properties": [], } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: { "_id": 1, "field_name": "gender", "regular_expression": "Gender\\s+(Male|Female)" } } apply_extractors(descriptor, [1], extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def test_text_type_w_regex_and_no_groups(self): schema = { 'fields': { 'gender': { 'required': False, 'type': 'text', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = {1: {"regular_expression": "Gender"}} apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual( ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Gender']})
def test_text_type_w_regex(self): schema = { "fields": { 'gender': { 'required': False, 'type': 'text', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = {1: { "regular_expression": "Gender\\s+(Male|Female)" }} apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
def test_raw_type_w_regex(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'raw',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {
        "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"
    }}
    apply_extractors(descriptor, {"gender": [1]}, extractors)

    ibl_extractor = InstanceBasedLearningExtractor([(self.template,
                                                     descriptor)])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0],
                     {u'gender': [u'<td >Male</td>']})
def test_text_type_w_regex_and_no_groups(self): schema = { 'fields': { 'gender': { 'required': False, 'type': 'text', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: {"regular_expression": "Gender"} } apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = SlybotIBLExtractor([ (self.template, {'#default': descriptor}, '0.12.0')]) self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'Gender'])
def test_raw_type_w_regex(self): schema = { 'fields': { 'gender': { 'required': False, 'type': 'raw', 'vary': False, } } } descriptor = create_slybot_item_descriptor(schema) extractors = { 1: {"regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"} } apply_extractors(descriptor, {"gender": [1]}, extractors) ibl_extractor = SlybotIBLExtractor([ (self.template, {'#default': descriptor}, '0.12.0')]) self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'<td >Male</td>'])
def test_raw_type_w_regex(self): schema = { "id": "test", "properties": [('gender', { 'description': '', 'optional': True, 'type': 'raw', 'vary': False, })], } descriptor = create_slybot_item_descriptor(schema) extractors = {1: { "_id": 1, "field_name": "gender", "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)" }} apply_extractors(descriptor, [1], extractors) ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)]) self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'<td >Male</td>']})
def test_raw_type_w_regex(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'raw',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {
        1: {
            "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"
        }
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)

    ibl_extractor = InstanceBasedLearningExtractor([(self.template,
                                                     descriptor)])
    self.assertEqual(
        ibl_extractor.extract(self.target)[0][0],
        {u'gender': [u'<td >Male</td>']})
def test_extractor_w_empty_string_extraction(self):
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {"regular_expression": "([0-9]+)"}}
    apply_extractors(descriptor, {"gender": [1]}, extractors)

    ibl_extractor = InstanceBasedLearningExtractor([(self.template2,
                                                     descriptor)])
    self.assertEqual(
        ibl_extractor.extract(self.target2)[0][0],
        {u'name': [u'Name Olivia']})
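# Hedged note on the two apply_extractors() call shapes that appear in the
# tests above (both observed directly in this file, not taken from external
# documentation): the older "id"/"properties" schema style passes a plain list
# of extractor ids and relies on a "field_name" key inside each extractor
# definition, while the newer "fields"-based style passes a mapping from field
# name to the list of extractor ids applied to that field.
#
#   apply_extractors(descriptor, [1, 2], extractors)              # legacy: ids only;
#                                                                 # extractors carry "field_name"
#   apply_extractors(descriptor, {"gender": [1, 2]}, extractors)  # newer: field -> ids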
def setup_bot(self, settings, spec, items, extractors, logger): """ Perform any initialization needed for crawling using this plugin """ self.logger = logger templates = map(self._get_annotated_template, spec['templates']) _item_template_pages = sorted(([ t.get('scrapes'), dict_to_page(t, 'annotated_body'), t.get('extractors', []), t.get('version', '0.12.0') ] for t in templates if t.get('page_type', 'item') == 'item'), key=lambda x: x[0]) self.item_classes = {} self.template_scrapes = { template.get('page_id'): template['scrapes'] for template in templates } if (settings.get('AUTO_PAGINATION') or spec.get('links_to_follow') == 'auto'): self.html_link_extractor = PaginationExtractor() else: self.html_link_extractor = HtmlLinkExtractor() for schema_name, schema in items.items(): if schema_name not in self.item_classes: if not schema.get('name'): schema['name'] = schema_name item_cls = SlybotItem.create_iblitem_class(schema) self.item_classes[schema_name] = item_cls # Create descriptors and apply additional extractors to fields page_descriptor_pairs = [] self.schema_descriptors = {} for default, template, template_extractors, v in _item_template_pages: descriptors = OrderedDict() for schema_name, schema in items.items(): item_descriptor = create_slybot_item_descriptor( schema, schema_name) apply_extractors(item_descriptor, template_extractors, extractors) descriptors[schema_name] = item_descriptor descriptor = descriptors.values() or [{}] descriptors['#default'] = descriptors.get(default, descriptor[0]) self.schema_descriptors[template.page_id] = descriptors['#default'] page_descriptor_pairs.append((template, descriptors, v)) add_extractors_to_descriptors(descriptors, extractors) grouped = itertools.groupby( sorted(page_descriptor_pairs, key=operator.itemgetter(2)), lambda x: x[2] < '0.13.0') self.extractors = [] for version, group in grouped: if version: self.extractors.append( InstanceBasedLearningExtractor([ (page, scrapes['#default']) for page, scrapes, version in group ])) else: self.extractors.append(SlybotIBLExtractor(list(group))) # generate ibl extractor for links pages _links_pages = [ dict_to_page(t, 'annotated_body') for t in templates if t.get('page_type') == 'links' ] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec) # Clustering self.template_names = [t.get('page_id') for t in spec['templates']] if settings.get('PAGE_CLUSTERING'): try: import page_clustering self.clustering = page_clustering.kmeans_from_samples( spec['templates']) self.logger.info("Clustering activated") except ImportError: self.clustering = None self.logger.warning( "Clustering could not be used because it is not installed") else: self.clustering = None
        item = {k: v for k, v in item.items() if v}
        validated = validate(item, html_page)
        if not validated:
            continue
        if hasattr(validated, 'dump'):
            validated = validated.dump()
        validated['_template'] = None
        items.append(validated)
    items = list(filter(bool, items))
    return [i for i in items if '_type' in i]


_PATH = dirname(__file__)
td = TokenDict()
with open('%s/data/SampleProject/items.json' % _PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}


class FakeContainer(BaseContainerExtractor):
    def __init__(self, schema, legacy=False):
        self.schema = schema
        self.extra_requires = []
        self.legacy = legacy
        self.modifiers = {}

schema = FakeContainer(descriptors['#default'])
validate = schema._validate_and_adapt_item
_names_map = {'daft_ie': 'daft', 'patchofland': 'pol'}
ibl_extractors = {}
ibl_pages = {}
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)

    default_item = spec['scrapes']
    self._default_schema = item_schemas[default_item]
    if not self._default_schema:
        self.log("Scraping unknown default item schema: %s" % default_item,
                 log.WARNING)

    self._item_template_pages = sorted((
        [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == '_links']
    _links_item_descriptor = create_slybot_item_descriptor(
        {'id': "_links", 'properties': ()})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self._ipages = [page for _, page, _ in self._item_template_pages]

    self._fpages = [
        dict_to_page(t, 'annotated_body')
        for t in spec['templates']
        if t.get('page_type', 'item') == 'form'
    ]

    self._start_urls = spec.get('start_urls')

    self.link_extractor = LinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)

    # make a filter for links
    respect_nofollow = spec.get('respect_nofollow', True)
    patterns = spec.get('follow_patterns')
    if patterns:
        pattern = patterns[0] if len(patterns) == 1 \
            else "(?:%s)" % '|'.join(patterns)
        follow_pattern = re.compile(pattern)
        if respect_nofollow:
            url_filterf = lambda x: follow_pattern.search(x.url) and not x.nofollow
        else:
            url_filterf = lambda x: follow_pattern.search(x.url)
    elif respect_nofollow:
        url_filterf = lambda x: not x.nofollow
    else:
        url_filterf = bool
    # apply exclude patterns
    exclude_patterns = spec.get('exclude_patterns')
    if exclude_patterns:
        pattern = exclude_patterns[0] if len(exclude_patterns) == 1 \
            else "(?:%s)" % '|'.join(exclude_patterns)
        exclude_pattern = re.compile(pattern)
        self.url_filterf = lambda x: not exclude_pattern.search(x.url) and url_filterf(x)
    else:
        self.url_filterf = url_filterf

    default_item_cls = get_iblitem_class(self._default_schema)
    default_item_descriptor = create_slybot_item_descriptor(
        self._default_schema)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema) if schema else default_item_cls

        page_descriptor_pairs = []
        for page, extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema) \
                if schema else default_item_descriptor
            apply_extractors(item_descriptor, extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))

        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self._itemversion_cache = {}
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)

    default_item = spec['scrapes']
    self._default_schema = item_schemas[default_item]
    if not self._default_schema:
        self.log("Scraping unknown default item schema: %s" % default_item,
                 log.WARNING)

    self._item_template_pages = sorted((
        [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [
        dict_to_page(t, 'annotated_body')
        for t in spec['templates'] if t.get('page_type') == 'links'
    ]
    _links_item_descriptor = create_slybot_item_descriptor({
        'id': "_links",
        'properties': ()
    })
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self._ipages = [page for _, page, _ in self._item_template_pages]

    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()

    self.link_extractor = LinkExtractor()
    self.allowed_domains = self._get_allowed_domains(self._ipages)
    self.build_url_filter(spec)

    default_item_cls = get_iblitem_class(self._default_schema)
    default_item_descriptor = create_slybot_item_descriptor(
        self._default_schema)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema) if schema else default_item_cls

        page_descriptor_pairs = []
        for page, extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema) \
                if schema else default_item_descriptor
            apply_extractors(item_descriptor, extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))

        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self.login_requests = []
    self.form_requests = []
    for rdata in spec.get("init_requests", []):
        if rdata["type"] == "login":
            request = Request(url=rdata.pop("loginurl"), meta=rdata,
                              callback=self.parse_login_page)
            self.login_requests.append(request)
        elif rdata["type"] == "form":
            request = Request(url=rdata.pop("form_url"), meta=rdata,
                              callback=self.parse_form_page)
            self.form_requests.append(request)
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)

    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'),
         t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self._ipages = [page for _, page, _ in self._item_template_pages]

    self.start_urls = self.start_urls or spec.get('start_urls')
    if isinstance(self.start_urls, basestring):
        self.start_urls = self.start_urls.splitlines()

    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.allowed_domains = spec.get(
        'allowed_domains', self._get_allowed_domains(self._ipages))
    if not self.allowed_domains:
        self.allowed_domains = None
    self.build_url_filter(spec)

    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = get_iblitem_class(schema)

        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors,
                             all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))

        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }

    self.login_requests = []
    self.form_requests = []
    for rdata in spec.get("init_requests", []):
        if rdata["type"] == "login":
            request = Request(url=rdata.pop("loginurl"), meta=rdata,
                              callback=self.parse_login_page,
                              dont_filter=True)
            self.login_requests.append(request)
        elif rdata["type"] == "form":
            self.generic_form = GenericForm(**kw)
            self.form_requests.append(
                self.get_generic_form_start_request(rdata))
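# Hypothetical sketch of the "init_requests" entries consumed by the loops in
# the __init__ variants above. Only the key names ("type", "loginurl",
# "form_url") are taken from the rdata accesses in that code; the URLs are
# invented placeholders, and any remaining keys simply travel to the callback
# in request.meta.
example_init_requests = [
    {"type": "login",
     "loginurl": "http://www.example.com/login"},   # popped; rest kept in meta
    {"type": "form",
     "form_url": "http://www.example.com/search"},  # used by the older form branch
]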
def setup_bot(self, settings, spec, items, extractors, logger): """ Perform any initialization needed for crawling using this plugin """ self.logger = logger templates = map(self._get_annotated_template, spec['templates']) _item_template_pages = sorted(( [t.get('scrapes'), dict_to_page(t, 'annotated_body'), t.get('extractors', []), t.get('version', '0.12.0')] for t in templates if t.get('page_type', 'item') == 'item' ), key=lambda x: x[0]) self.item_classes = {} self.template_scrapes = {template.get('page_id'): template['scrapes'] for template in templates} if (settings.get('AUTO_PAGINATION') or spec.get('links_to_follow') == 'auto'): self.html_link_extractor = PaginationExtractor() else: self.html_link_extractor = HtmlLinkExtractor() for schema_name, schema in items.items(): if schema_name not in self.item_classes: if not schema.get('name'): schema['name'] = schema_name item_cls = SlybotItem.create_iblitem_class(schema) self.item_classes[schema_name] = item_cls # Create descriptors and apply additional extractors to fields page_descriptor_pairs = [] self.schema_descriptors = {} for default, template, template_extractors, v in _item_template_pages: descriptors = OrderedDict() for schema_name, schema in items.items(): item_descriptor = create_slybot_item_descriptor(schema, schema_name) apply_extractors(item_descriptor, template_extractors, extractors) descriptors[schema_name] = item_descriptor descriptor = descriptors.values() or [{}] descriptors['#default'] = descriptors.get(default, descriptor[0]) self.schema_descriptors[template.page_id] = descriptors['#default'] page_descriptor_pairs.append((template, descriptors, v)) add_extractors_to_descriptors(descriptors, extractors) grouped = itertools.groupby(sorted(page_descriptor_pairs, key=operator.itemgetter(2)), lambda x: x[2] < '0.13.0') self.extractors = [] for version, group in grouped: if version: self.extractors.append( InstanceBasedLearningExtractor( [(page, scrapes['#default']) for page, scrapes, version in group])) else: self.extractors.append(SlybotIBLExtractor(list(group))) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in templates if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec) # Clustering self.template_names = [t.get('page_id') for t in spec['templates']] if settings.get('PAGE_CLUSTERING'): try: import page_clustering self.clustering = page_clustering.kmeans_from_samples(spec['templates']) self.logger.info("Clustering activated") except ImportError: self.clustering = None self.logger.warning( "Clustering could not be used because it is not installed") else: self.clustering = None
def setup_bot(self, settings, spec, items, extractors): """ Perform any initialization needed for crawling using this plugin """ _item_template_pages = sorted(([ t.get('scrapes'), dict_to_page(t, 'annotated_body'), t.get('extractors', []), t.get('version', '0.12.0') ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'), key=lambda x: x[0]) self.item_classes = {} self.template_scrapes = { template.get('page_id'): template['scrapes'] for template in spec.get('templates') } self.html_link_extractor = HtmlLinkExtractor() for schema_name, schema in items.items(): if schema_name not in self.item_classes: if not schema.get('name'): schema['name'] = schema_name item_cls = SlybotItem.create_iblitem_class(schema) self.item_classes[schema_name] = item_cls # Create descriptors and apply additional extractors to fields page_descriptor_pairs = [] self.schema_descriptors = {} for default, template, template_extractors, v in _item_template_pages: descriptors = OrderedDict() for schema_name, schema in items.items(): item_descriptor = create_slybot_item_descriptor( schema, schema_name) apply_extractors(item_descriptor, template_extractors, extractors) descriptors[schema_name] = item_descriptor descriptor = descriptors.values() or [{}] descriptors['#default'] = descriptors.get(default, descriptor[0]) self.schema_descriptors[template.page_id] = descriptors['#default'] page_descriptor_pairs.append((template, descriptors, v)) add_extractors_to_descriptors(descriptors, extractors) grouped = itertools.groupby( sorted(page_descriptor_pairs, key=operator.itemgetter(2)), lambda x: x[2] < '0.13.0') self.extractors = [] for version, group in grouped: if version: self.extractors.append( InstanceBasedLearningExtractor([ (page, scrapes['#default']) for page, scrapes, version in group ])) else: self.extractors.append(SlybotIBLExtractor(list(group))) # generate ibl extractor for links pages _links_pages = [ dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links' ] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor( [(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self.build_url_filter(spec)
def __init__(self, name, spec, item_schemas, all_extractors, **kw): super(IblSpider, self).__init__(name, **kw) default_item = spec["scrapes"] self._default_schema = item_schemas[default_item] if not self._default_schema: self.log("Scraping unknown default item schema: %s" % default_item, log.WARNING) self._item_template_pages = sorted( ( [t.get("scrapes", default_item), dict_to_page(t, "annotated_body"), t.get("extractors", [])] for t in spec["templates"] if t.get("page_type", "item") == "item" ), key=lambda pair: pair[0], ) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type") == "links"] _links_item_descriptor = create_slybot_item_descriptor({"id": "_links", "properties": ()}) self._links_ibl_extractor = ( InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) if _links_pages else None ) self._ipages = [page for _, page, _ in self._item_template_pages] self._fpages = [ dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type", "item") == "form" ] self.start_urls = self.start_urls or spec.get("start_urls") if isinstance(self.start_urls, basestring): self.start_urls = self.start_urls.splitlines() self.link_extractor = LinkExtractor() self.allowed_domains = self._get_allowed_domains(self._ipages) self.build_url_filter(spec) default_item_cls = get_iblitem_class(self._default_schema) default_item_descriptor = create_slybot_item_descriptor(self._default_schema) self.itemcls_info = {} for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)): page_extractors_pairs = map(operator.itemgetter(1, 2), triplets) schema = item_schemas[itemclass_name] item_cls = get_iblitem_class(schema) if schema else default_item_cls page_descriptor_pairs = [] for page, extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor apply_extractors(item_descriptor, extractors, all_extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { "class": item_cls, "descriptor": item_descriptor, "extractor": extractor, } self._itemversion_cache = {}