def test_xmlxpathselector(self):
    """Deprecated XmlXPathSelector still parses XML text; .css() is unsupported."""
    # record=True captures the deprecation warning so it is not printed.
    with warnings.catch_warnings(record=True):
        selector = XmlXPathSelector(text=self.text)
        markup = selector.select("//div").extract()
        self.assertEqual(
            markup,
            [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
        # CSS selection is not implemented for the XML-flavoured selector.
        self.assertRaises(RuntimeError, selector.css, 'div')
def test_xmlxpathselector(self):
    """XmlXPathSelector works with the deprecation warning explicitly silenced."""
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ScrapyDeprecationWarning)
        selector = XmlXPathSelector(text=self.text)
        expected = [u'<div><img src="a.jpg"><p>Hello</p></img></div>']
        self.assertEqual(selector.select("//div").extract(), expected)
        # The XML selector has no CSS support.
        self.assertRaises(RuntimeError, selector.css, 'div')
def commons_speech_saver(path, _writer=None):
    """Parse a Commons hansard XML file and persist every <speech> element.

    Falls back to the module-global ``_writer`` when no writer is passed
    (the pattern used when workers are initialized by a multiprocessing
    pool). Returns a ``(basename, saved_count)`` tuple.
    """
    # Consistent with the archive saver: explicit argument wins, otherwise
    # use the pool-initialized module global.
    writer = _writer or globals()['_writer']
    try:
        # ``source`` instead of ``file`` — don't shadow the builtin.
        # Decode leniently: the source files contain stray non-UTF-8 bytes.
        with open(path) as source:
            # TODO: create my own lxml wrapper with convenience methods
            hxs = XmlXPathSelector(text=unicode(source.read(), errors="ignore"))
        count = 0
        for speech in hxs.select(r'//speech'):
            writer.save({
                "id": hxs_extract(speech, r'./@id'),
                "house": "commons",
                "speakerid": hxs_extract(speech, "./@speakerid"),
                "speakername": hxs_extract(speech, "./@speakername"),
                "column": hxs_extract(speech, "./@column"),
                # The sitting date is encoded in the file name (RE_DATE
                # extracts the YYYY-MM-DD portion of ``path``).
                "date": datetime.strptime(RE_DATE.search(path).group(0),
                                          "%Y-%m-%d"),
                "time": hxs_extract(speech, "./@time"),
                "url": hxs_extract(speech, "./@url"),
                "text": speech.select("./*").extract()
            })
            count += 1
        return os.path.basename(path), count
    except KeyboardInterrupt:
        log.warning("Caught exception in %s, sending stop" % os.getpid())
        # Deliberate: StopIteration signals the consuming pool iterator
        # to halt — TODO confirm this is still the intended stop protocol.
        raise StopIteration()
def test_xmlxpathselector(self):
    """The deprecated XmlXPathSelector still selects nodes from XML text."""
    with warnings.catch_warnings():
        # Silence only the Scrapy deprecation notice for this construction.
        warnings.simplefilter('ignore', ScrapyDeprecationWarning)
        xs = XmlXPathSelector(text=self.text)
        div_markup = xs.select("//div").extract()
        self.assertEqual(
            div_markup,
            [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
        self.assertRaises(RuntimeError, xs.css, 'div')
def test_remove_attributes_namespaces(self):
    """remove_namespaces() makes namespaced attributes visible to plain XPath."""
    xml = """<?xml version="1.0" encoding="UTF-8"?>
    <feed xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
      <link atom:type="text/html">
      <link atom:type="application/atom+xml">
    </feed>
    """
    response = XmlResponse("http://example.com/feed.atom", body=xml)
    xxs = XmlXPathSelector(response)
    # The atom:type attributes are invisible to an un-prefixed query...
    self.assertEqual(len(xxs.select("//link/@type")), 0)
    xxs.remove_namespaces()
    # ...and both become selectable once namespaces are stripped.
    self.assertEqual(len(xxs.select("//link/@type")), 2)
def commons_speech_saver(path, _writer=None):
    """Parse one hansard-archive zip and save a random sample of its
    paragraphs to the backing store.

    Returns a ``(basename, saved_count)`` tuple; ``rate`` and ``_writer``
    come from module globals when not supplied (pool-worker pattern).
    """
    writer = _writer or globals()['_writer']
    rate = globals()['rate']
    try:
        count = 0
        try:
            # Some archives are truncated/corrupt; repair before opening.
            fix_bad_zipfile(path)
            with zipfile.ZipFile(path) as archive:
                # The single XML member shares the zip's base name.
                member = os.path.basename(path).replace(".zip", ".xml")
                with archive.open(member) as xml_file:
                    raw = unicode(xml_file.read(), errors="ignore")
                    selector = XmlXPathSelector(text=raw)
                    for housecommons in selector.select(r'//housecommons'):
                        # Sample sittings at the configured rate.
                        if random.random() < rate:
                            date_str = housecommons.select(r'.//date/@format').extract()[0]
                            speech_date = datetime.datetime.strptime(
                                date_str.strip(), "%Y-%m-%d")
                            for paragraph in housecommons.select(r'.//p'):
                                writer.save({
                                    "id": "hansardarchives/%s" % hxs_extract(paragraph, r'./@id'),
                                    "house": "commons",
                                    "source": "hansardarchives",
                                    "speakerid": None,
                                    "speakername": hxs_extract(paragraph, r'./member/text()'),
                                    "column": None,
                                    "date": speech_date,
                                    "time": None,
                                    "url": None,
                                    "text": [hxs_extract(paragraph, r'./membercontribution/text()')]
                                })
                                count += 1
        except zipfile.BadZipfile:
            log.debug("Bad zip file %s" % os.path.basename(path))
        return os.path.basename(path), count
    except KeyboardInterrupt:
        log.warning("Caught exception in %s, sending stop" % os.getpid())
        raise StopIteration()
def test_remove_namespaces(self):
    """Elements become selectable by local name after remove_namespaces()."""
    xml = """<?xml version="1.0" encoding="UTF-8"?>
    <feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
      <link type="text/html">
      <link type="application/atom+xml">
    </feed>
    """
    xxs = XmlXPathSelector(
        XmlResponse("http://example.com/feed.atom", body=xml))
    # The default-namespaced <link> nodes do not match a plain "//link"...
    self.assertEqual(len(xxs.select("//link")), 0)
    xxs.remove_namespaces()
    # ...until the namespaces have been removed.
    self.assertEqual(len(xxs.select("//link")), 2)
def test_xmlxpathselector(self):
    """XmlXPathSelector extracts XML nodes; CSS selection raises RuntimeError."""
    # Capture (and thereby suppress) the deprecation warning.
    with warnings.catch_warnings(record=True):
        selector = XmlXPathSelector(text=self.text)
        self.assertEqual(
            selector.select("//div").extract(),
            [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
        self.assertRaises(RuntimeError, selector.css, 'div')