def _run_extraction(self, name, templates, page, descriptor, expected_output):
    self.trace = None
    template_pages = [HtmlPage(None, {}, t) for t in templates]
    # extract with trace enabled in order to generate the traceback
    extractor = InstanceBasedLearningExtractor(
        [(t, descriptor) for t in template_pages], True)
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    if actual_output is not None:
        actual_output = actual_output[0]
        self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
    # extract again with trace disabled in order to get the pure output
    extractor = InstanceBasedLearningExtractor(
        [(t, descriptor) for t in template_pages])
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    if actual_output is None:
        if expected_output is None:
            return
        assert False, "failed to extract data for test '%s'" % name
    else:
        actual_output = actual_output[0]
    expected_names = set(expected_output.keys())
    actual_names = set(actual_output.keys())
    # use a list comprehension instead of filter() so len() works on
    # both Python 2 and 3
    missing_in_output = [attr for attr in expected_names - actual_names if attr]
    error = "attributes '%s' were expected but were not present in test '%s'" % \
            ("', '".join(missing_in_output), name)
    assert len(missing_in_output) == 0, error
    unexpected = actual_names - expected_names
    error = "unexpected attributes %s in test '%s'" % \
            (', '.join(unexpected), name)
    assert len(unexpected) == 0, error
    for k, v in expected_output.items():
        extracted = actual_output[k]
        assert v == extracted, "in test '%s' for attribute '%s', " \
            "expected value '%s' but got '%s'" % (name, k, v, extracted)
def test_annotation(self):
    html_page = HtmlPage(body=TEST_PAGE)
    template = {'original_body': html_page.body}
    data = {
        'extracts': [{
            'annotations': {'href': 'origin'},
            'id': 'test-id-123',
            'required': [],
            'tagid': 123,
            'variant': 0,
        }]
    }
    annotations = Annotations()
    annotations.save_extraction_data(data, template)
    sample = HtmlPage(body=add_tagids(template['annotated_body']))
    for element in sample.parsed_body:
        if isinstance(element, HtmlTag):
            tagid = element.attributes.get(TAGID, None)
            if tagid is not None and int(tagid) == data['extracts'][0]['tagid']:
                annotation = element.attributes.get('data-scrapy-annotate')
                self.assertTrue(annotation)
                # assertTrue on a bare string literal is always true; check
                # that the annotation payload actually carries the id
                self.assertIn('"id": "test-id-123"', annotation)
def _open_sample_and_page(name):
    sample_spec = _open_spec(name)
    annotations = sample_spec['plugins']['annotations-plugin']['extracts']
    annotated = apply_annotations(_clean_annotation_data(annotations),
                                  sample_spec['original_body'])
    url = sample_spec['url']
    return (HtmlPage(url=url, body=annotated),
            HtmlPage(url=url, body=sample_spec['original_body']))
def test_extraction(self, name, templates, page, descriptor, expected_output):
    template_pages = [HtmlPage(None, {}, t) for t in templates]
    extractor = InstanceBasedLearningExtractor(
        [(t, descriptor) for t in template_pages])
    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
    self.assertEqual(expected_output, actual_output and actual_output[0])
def parse_strings(template_html, extraction_html):
    """Create a template and extraction page from raw strings.

    This is useful for testing purposes.
    """
    t = TokenDict()
    template_page = HtmlPage(body=template_html)
    extraction_page = HtmlPage(body=extraction_html)
    return (parse_template(t, template_page),
            parse_extraction_page(t, extraction_page))
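# A minimal usage sketch for parse_strings; the markup below is hypothetical
# test data using the scrapely 'data-scrapy-annotate' template convention:
def _parse_strings_example():
    annotated = (u"<html><body><p data-scrapy-annotate="
                 u"'{\"annotations\": {\"content\": \"title\"}}'>"
                 u"Example title</p></body></html>")
    plain = u'<html><body><p>A real title</p></body></html>'
    template, page = parse_strings(annotated, plain)
    # both results are built over the same TokenDict, so their token
    # streams are directly comparable
    return template, page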
def test_copy(self):
    """Test copy/deepcopy"""
    page = HtmlPage(url='http://www.example.com', body=PAGE)
    region = page.subregion(10, 15)
    regioncopy = copy.copy(region)
    self.assertEqual(regioncopy.start_index, 10)
    self.assertEqual(regioncopy.end_index, 15)
    self.assertFalse(region is regioncopy)
    self.assertTrue(region.htmlpage is regioncopy.htmlpage)
    regiondeepcopy = copy.deepcopy(region)
    self.assertEqual(regiondeepcopy.start_index, 10)
    self.assertEqual(regiondeepcopy.end_index, 15)
    self.assertFalse(region is regiondeepcopy)
    self.assertFalse(region.htmlpage is regiondeepcopy.htmlpage)
def url_to_page_mod(self, website, encoding=None, default_encoding='utf-8'):
    """Build an HtmlPage object from a website object.

    This is a modified version of ``url_to_page``: instead of fetching a
    URL with urllib2, it takes a ``website`` object (with ``info``,
    ``browser`` and ``url`` attributes) and reads the already-loaded page
    from it.

    The `encoding` argument can be used to force the interpretation of the
    page encoding. If unspecified, `default_encoding` is used.
    """
    # the original implementation fetched the page itself:
    # fh = urllib2.urlopen(url)
    # info = fh.info()
    info = website.info
    headers_dict = dict(info.headers)
    # body_str = fh.read()
    body = website.browser.page_source
    # fall back to the default if no encoding was specified
    if encoding is None:
        encoding = default_encoding
    return HtmlPage(website.url, headers=headers_dict, body=body,
                    encoding=encoding)
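# A minimal usage sketch for url_to_page_mod; the website object is
# hypothetical and only needs .url, .info (with a .headers mapping) and
# .browser.page_source, e.g. a Selenium-backed wrapper:
#
# website = SeleniumWebsite('http://example.com/')   # hypothetical class
# page = url_to_page_mod(self, website, encoding='utf-8')
# assert isinstance(page, HtmlPage)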
def test_get_base_url(self):
    """Basic get_base_url test"""
    html = (u'<html><head><base href="http://example.com/products/" />'
            u'<body></body></html>')
    page = HtmlPage("http://example.com/products/p19.html", body=html)
    self.assertEqual(get_base_url(page), "http://example.com/products/")
def test_spider_with_link_template(self):
    name = "seedsofchange"
    spider = self.smanager.create(name)
    spec = self.smanager._specs["spiders"][name]
    t1, t2 = spec["templates"]
    target1, target2 = [HtmlPage(url=t["url"], body=t["original_body"])
                        for t in spec["templates"]]
    items, link_regions = spider.plugins['Annotations'].extract_items(target1)
    self.assertEqual(items, [])
    self.assertEqual(
        len(list(spider.plugins['Annotations']._process_link_regions(
            target1, link_regions))), 104)
    items, link_regions = spider.plugins['Annotations'].extract_items(target2)
    self.assertEqual(items[0], {
        '_template': u'4fac3b47688f920c7800000f',
        '_type': u'default',
        u'category': [u'Winter Squash'],
        u'days': [None],
        u'description': [u'1-2 lbs. (75-95 days) This early, extremely productive, compact bush variety is ideal for small gardens. Miniature pumpkin-shaped fruits have pale red-orange skin and dry, sweet, dark orange flesh. Great for stuffing, soups and pies.'],
        u'lifecycle': [u'Tender Annual'],
        u'name': [u'Gold Nugget'],
        u'price': [u'3.49'],
        u'product_id': [u'01593'],
        u'species': [u'Cucurbita maxima'],
        'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS14165',
        u'weight': [None],
    })
    self.assertEqual(link_regions, [])
    self.assertEqual(
        len(list(spider.plugins['Annotations']._process_link_regions(
            target2, link_regions))), 0)
def _requests_to_follow(self, htmlpage):
    if self._links_ibl_extractor is not None:
        # bugfix: self._links_ibl_extractor.extract() can return a series
        # of links when the template was created by repeated annotation
        extracted_list = self._links_ibl_extractor.extract(htmlpage)[0]
        if extracted_list is not None:
            seen = set()
            for extracted in extracted_list:
                # every key that doesn't start with '_' is a links attribute
                for key in extracted.keys():
                    if not str(key).startswith('_'):
                        extracted_regions = extracted.get(key, [])
                        for region in extracted_regions:
                            htmlregion = HtmlPage(
                                htmlpage.url, htmlpage.headers, region,
                                encoding=htmlpage.encoding)
                            for request in self._request_to_follow_from_region(
                                    htmlregion):
                                if request.url in seen:
                                    continue
                                seen.add(request.url)
                                yield request
    else:
        for request in self._request_to_follow_from_region(htmlpage):
            yield request
def test_spider_with_link_region_but_not_link_template(self):
    name = "seedsofchange2"
    spider = self.smanager.create(name)
    spec = self.smanager._specs["spiders"][name]
    t1, t2 = spec["templates"]
    target1, target2 = [HtmlPage(url=t["url"], body=t["original_body"])
                        for t in spec["templates"]]
    items, link_regions = spider.plugins['Annotations'].extract_items(target1)
    self.assertEqual(items[0], {
        '_template': u'4fad6a7c688f922437000014',
        '_type': u'default',
        u'category': [u'Onions'],
        u'days': [None],
        u'description': [u'(110-120 days) Midsized Italian variety. Long to intermediate day red onion that tolerates cool climates. Excellent keeper. We have grown out thousands of bulbs and re-selected this variety to be the top quality variety that it once was. 4-5" bulbs are top-shaped, uniformly colored, and have tight skins.'],
        u'lifecycle': [u'Heirloom/Rare'],
        u'name': [u'Rossa Di Milano Onion'],
        u'price': [u'3.49'],
        u'species': [u'Alium cepa'],
        u'type': [u'Heirloom/Rare'],
        'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS15978',
    })
    self.assertEqual(link_regions, [])
    items, link_regions = spider.plugins['Annotations'].extract_items(target2)
    self.assertEqual(items[0], {
        '_template': u'4fad6a7d688f922437000017',
        '_type': u'default',
        u'category': [u'Winter Squash'],
        u'days': [None],
        u'description': [u'1-2 lbs. (75-95 days) This early, extremely productive, compact bush variety is ideal for small gardens. Miniature pumpkin-shaped fruits have pale red-orange skin and dry, sweet, dark orange flesh. Great for stuffing, soups and pies.'],
        u'lifecycle': [u'Tender Annual'],
        u'name': [u'Gold Nugget'],
        u'price': [u'3.49'],
        u'species': [u'Cucurbita maxima'],
        'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS14165',
        u'weight': [None],
    })
    self.assertEqual(len(link_regions), 1)
    self.assertEqual(
        len(list(spider.plugins['Annotations']._process_link_regions(
            target1, link_regions))), 25)
def htmlpage_from_response(response, _add_tagids=False):
    body = response.body_as_unicode()
    if _add_tagids:
        body = add_tagids(body)
    return HtmlPage(response.url, response.headers, body,
                    encoding=response.encoding)
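# A minimal usage sketch inside a Scrapy spider callback (assumes the
# pre-2.0 Response.body_as_unicode() API that the helper above relies on):
def parse(self, response):
    page = htmlpage_from_response(response, _add_tagids=True)
    # with _add_tagids=True every tag carries a TAGID attribute, so
    # elements can be matched back to annotations later
    for element in page.parsed_body:
        pass  # inspect HtmlTag elements, run extractors, etc.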
def _load_templates(self):
    if not os.path.exists(self.filename):
        return []
    with open(self.filename) as f:
        templates = json.load(f)['templates']
    templates = [HtmlPage(t['url'], body=t['body'], encoding=t['encoding'])
                 for t in templates]
    return templates
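# The template store read by _load_templates is assumed to be JSON shaped
# like this (field names inferred from the loader above):
#
# {
#     "templates": [
#         {"url": "http://example.com/item",
#          "body": "<html>...</html>",
#          "encoding": "utf-8"}
#     ]
# }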
def handle_html(self, response):
    htmlpage = HtmlPage(response.url, response.headers,
                        response.body_as_unicode(),
                        encoding=response.encoding)
    items, link_regions = self.extract_items(htmlpage)
    for item in items:
        yield item
    for request in self._process_link_regions(htmlpage, link_regions):
        yield request
def test_get_base_url_empty_basehref(self):
    """Base tag exists but href is empty"""
    html = u'<html><head><base href="" /><body></body></html>'
    url = "http://example.com/products/p19.html"
    page = HtmlPage(url, body=html)
    self.assertEqual(get_base_url(page), url)
def test_not_standard_chars_in_url(self):
    body = (u'<html><meta http-equiv="Content-Type" '
            u'content="text/html; charset=iso-8859-1" /></html>')
    url = u'fotos/produtos/Mam\xe3e noel.jpg'
    htmlpage = HtmlPage(url=u"http://www.example.com/", body=body,
                        encoding='cp1252')
    processor = UrlFieldTypeProcessor()
    self.assertEqual(
        processor.adapt(url, htmlpage),
        u'http://www.example.com/fotos/produtos/Mam%C3%A3e%20noel.jpg')
def _process_link_regions(self, htmlpage, link_regions):
    """Process link regions if any, and generate requests"""
    if link_regions:
        for link_region in link_regions:
            htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                  link_region, encoding=htmlpage.encoding)
            for request in self._requests_to_follow(htmlregion):
                yield request
    else:
        for request in self._requests_to_follow(htmlpage):
            yield request
def test_extraction(self):
    samples_encoding = 'latin1'
    [(html1, data1), (html2, data2)] = list(iter_samples(
        'scraper_loadstore', html_encoding=samples_encoding))
    sc = Scraper()
    page1 = HtmlPage(body=html1, encoding=samples_encoding)
    sc.train_from_htmlpage(page1, data1)
    page2 = HtmlPage(body=html2, encoding=samples_encoding)
    extracted_data = sc.scrape_page(page2)
    self._assert_extracted(extracted_data, data2)
    # check it still works after serialize/deserialize
    f = StringIO()
    sc.tofile(f)
    f.seek(0)
    sc = Scraper.fromfile(f)
    extracted_data = sc.scrape_page(page2)
    self._assert_extracted(extracted_data, data2)
def load_templates(fn, site_id):
    try:
        with open(fn, 'r') as f:
            obj = json.load(f)
    except IOError:
        return list()
    tmpl_ids = obj['sites'].get(unicode(site_id))
    if not tmpl_ids:
        return list()
    return list(HtmlPage(**x) for x in obj['templates']
                if x['page_id'] in tmpl_ids)
def test_variants(self):
    """Ensure variants are extracted as a list of dicts"""
    name = "networkhealth.com"
    spider = self.smanager.create(name)
    spec = self.smanager._specs["spiders"][name]
    template, = spec["templates"]
    target = HtmlPage(url=template["url"], body=template["original_body"])
    items, link_regions = spider.plugins['Annotations'].extract_items(target)
    for item in items:
        for variant in item["variants"]:
            self.assertEqual(type(variant), dict)
def remove_tagids(source):
    """Remove from the given page all tagids previously added by
    add_tagids()
    """
    output = []
    if not isinstance(source, HtmlPage):
        source = HtmlPage(body=source)
    for element in source.parsed_body:
        if _must_add_tagid(element):
            element.attributes.pop(TAGID, None)
            output.append(serialize_tag(element))
        else:
            output.append(source.body[element.start:element.end])
    return ''.join(output)
def test_uri_stripped_of_whitespace_and_quote_characters_correctly(self):
    urls = [u' image.jpg ', u" '/data.jpg'", u'\n\t"file.jpg"\n\t\t']
    results = ['http://www.example.com/images/image.jpg',
               'http://www.example.com/data.jpg',
               'http://www.example.com/images/file.jpg']
    htmlpage = HtmlPage(url=u"http://www.example.com/images/",
                        body=u'<html><body></body></html>', encoding='utf-8')
    url_p = UrlFieldTypeProcessor()
    img_p = ImagesFieldTypeProcessor()
    for text, url in zip(urls, results):
        self.assertEqual(img_p.adapt(img_p.extract(text), htmlpage), url)
        self.assertEqual(url_p.adapt(url_p.extract(text), htmlpage), url)
def test_uri_with_illegal_html_entities(self):
    # the test data deliberately embeds illegal numeric character
    # references (&#09;, &#10;, &#0;), reconstructed here from the
    # garbled original
    urls = [u'&#09;&#10; image.jpg ', u" '/&#0;data.jpg'",
            u'\n\t"file.jpg"\n\t\t']
    results = ['http://www.example.com/images/image.jpg',
               'http://www.example.com/data.jpg',
               'http://www.example.com/images/file.jpg']
    htmlpage = HtmlPage(url=u"http://www.example.com/images/",
                        body=u'<html><body></body></html>', encoding='utf-8')
    url_p = UrlFieldTypeProcessor()
    img_p = ImagesFieldTypeProcessor()
    for text, url in zip(urls, results):
        self.assertEqual(img_p.adapt(img_p.extract(text), htmlpage), url)
        self.assertEqual(url_p.adapt(url_p.extract(text), htmlpage), url)
def _get_cleansing(target_html, annotations):
    """Get the relevant pieces of text affected by browser cleansing."""
    numbered_html = add_tagids(target_html)
    target = HtmlPage(body=numbered_html)
    element = target.parsed_body[0]
    all_cleansing = {}
    for annotation in annotations:
        if isinstance(annotation, list):  # partial annotation
            # search for the insert point we are interested in
            target_it = iter(target.parsed_body)
            for p in annotation:
                if isinstance(p, HtmlTag) and "insert-after" in p.attributes:
                    insert_after = p.attributes["insert-after"]
                    break
            while not (isinstance(element, HtmlTag) and
                       element.attributes.get(TAGID) == insert_after):
                element = target_it.next()
            # 1. the browser removes tags inside <option>...</option>
            # 2. the browser adds </option> if it is not present
            if element.tag == "option" and \
                    element.tag_type == HtmlTagType.OPEN_TAG:
                cached = []
                add_cached = False
                closed_option = False
                element = target_it.next()
                while not (isinstance(element, HtmlTag) and
                           element.tag in ["option", "select"]):
                    cached.append(element)
                    if hasattr(element, 'tag'):
                        add_cached = True
                    element = target_it.next()
                if (element.tag == "option" and
                        element.tag_type == HtmlTagType.OPEN_TAG) or \
                        (element.tag == "select" and
                         element.tag_type == HtmlTagType.CLOSE_TAG):
                    closed_option = True
                if add_cached or closed_option:
                    out = "".join([numbered_html[e.start:e.end]
                                   for e in cached])
                    all_cleansing[insert_after] = out
    return all_cleansing
def _requests_to_follow(self, htmlpage):
    if self._links_ibl_extractor is not None:
        extracted = self._links_ibl_extractor.extract(htmlpage)[0]
        if extracted:
            extracted_regions = extracted[0].get('_links', [])
            seen = set()
            for region in extracted_regions:
                htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                      region, encoding=htmlpage.encoding)
                for request in self._request_to_follow_from_region(htmlregion):
                    if request.url in seen:
                        continue
                    seen.add(request.url)
                    yield request
    else:
        for request in self._request_to_follow_from_region(htmlpage):
            yield request
def test_site_pages(self):
    """Test with real pages. More reliable and easier to build for more
    complicated structures.
    """
    for source, annotations in iter_samples('pageparsing'):
        template = HtmlPage(body=source)
        parser = TemplatePageParser(TokenDict())
        parser.feed(template)
        for annotation in parser.annotations:
            test_annotation = annotations.pop(0)
            for s in annotation.__slots__:
                if s == "tag_attributes":
                    for pair in getattr(annotation, s):
                        self.assertEqual(list(pair),
                                         test_annotation[s].pop(0))
                else:
                    self.assertEqual(getattr(annotation, s),
                                     test_annotation[s])
        self.assertEqual(annotations, [])
def _process_link_regions(self, htmlpage, link_regions):
    """Process link regions if any, and generate requests"""
    if link_regions:
        for link_region in link_regions:
            htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                  link_region, encoding=htmlpage.encoding)
            for request in self._requests_to_follow(htmlregion):
                yield request
    else:
        for request in self._requests_to_follow(htmlpage):
            yield request
def add_tagids(source):
    """Apply a unique numeric attribute to each tag element so that it
    can be identified later when applying annotations.
    """
    output = []
    tagcount = 0
    if not isinstance(source, HtmlPage):
        source = HtmlPage(body=source)
    for element in source.parsed_body:
        if _must_add_tagid(element):
            element.attributes[TAGID] = str(tagcount)
            tagcount += 1
            output.append(serialize_tag(element))
        else:
            output.append(source.body[element.start:element.end])
    return ''.join(output)
def _modify_tagids(source, add=True):
    """Add or remove tag ids to/from an HTML document"""
    output = []
    tagcount = 0
    if not isinstance(source, HtmlPage):
        source = HtmlPage(body=source)
    for element in source.parsed_body:
        if _must_add_tagid(element):
            if add:
                element.attributes[TAGID] = str(tagcount)
                tagcount += 1
            else:
                # remove a previously added tagid
                element.attributes.pop(TAGID, None)
            output.append(serialize_tag(element))
        else:
            output.append(source.body[element.start:element.end])
    return u''.join(output)
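# A minimal round-trip sketch for the tagid helpers above (hypothetical
# markup): adding and then removing tagids should restore the original page.
def _tagid_roundtrip_example():
    html = u'<html><body><p>one</p><p>two</p></body></html>'
    numbered = add_tagids(html)  # tags now carry TAGID attributes
    assert remove_tagids(numbered) == html
    # _modify_tagids(html, add=True) / _modify_tagids(numbered, add=False)
    # perform the same two steps
    return numbered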
def spider_opened(self, spider):
    try:
        clustering = spider.plugins['Annotations'].clustering
        assert clustering
        self.clustering_enabled = True
    except (KeyError, AttributeError, AssertionError):
        logging.warning('Persistent page clustering has not been enabled '
                        'because page clustering is not enabled for this '
                        'spider')
        return
    if not os.path.exists(self.directory):
        os.makedirs(self.directory)
    dbpath = os.path.join(self.directory, spider.name)
    flag = 'n' if self.reset else 'c'
    self.db = self.dbmodule.open(dbpath, flag=flag)
    for data in getattr(self.db, 'itervalues', self.db.values)():
        page, encoding = json.loads(data)
        clustering.add_page(HtmlPage(body=page.decode(encoding)))
def test_empty_subregion(self):
    htmlpage = HtmlPage(body=u"")
    self.assertEqual(htmlpage.subregion(), u"")