Пример #1
0
 def test_xmlxpathselector(self):
     """Check XPath extraction on XmlXPathSelector and that css() raises."""
     expected_divs = [u'<div><img src="a.jpg"><p>Hello</p></img></div>']
     # record=True collects any warnings raised inside the block instead of
     # emitting them.
     with warnings.catch_warnings(record=True):
         selector = XmlXPathSelector(text=self.text)
         extracted_divs = selector.select("//div").extract()
         self.assertEqual(extracted_divs, expected_divs)
         # css() is expected to raise RuntimeError on this selector.
         self.assertRaises(RuntimeError, selector.css, 'div')
Пример #2
0
 def test_xmlxpathselector(self):
     """XmlXPathSelector extracts the div markup and raises on css()."""
     with warnings.catch_warnings():
         # Ignore Scrapy deprecation warnings while the selector is in use.
         warnings.simplefilter('ignore', ScrapyDeprecationWarning)
         selector = XmlXPathSelector(text=self.text)
         div_markup = selector.select("//div").extract()
         self.assertEqual(
             div_markup,
             [u'<div><img src="a.jpg"><p>Hello</p></img></div>'],
         )
         self.assertRaises(RuntimeError, selector.css, 'div')
Пример #3
0
def commons_speech_saver(path, _writer=None):
    """
    Parse the XML file at ``path`` and save one record per ``speech`` element.

    Records are handed to ``writer.save``, where the writer is ``_writer``
    when a truthy one is supplied and the module-level ``_writer`` global
    otherwise. The record date is taken from the part of ``path`` matched
    by ``RE_DATE``.

    Returns a ``(basename of path, number of speeches saved)`` tuple.
    A ``KeyboardInterrupt`` is logged and replaced by ``StopIteration``.
    """
    writer = _writer or globals()['_writer']

    try:
        with open(path) as source:
            # TODO: create my own lxml wrapper with convenience methods
            raw_xml = unicode(source.read(), errors="ignore")
            document = XmlXPathSelector(text=raw_xml)
            saved = 0
            for saved, speech in enumerate(document.select(r'//speech'), 1):
                record = {
                    "id": hxs_extract(speech, r'./@id'),
                    "house": "commons",
                    "speakerid": hxs_extract(speech, "./@speakerid"),
                    "speakername": hxs_extract(speech, "./@speakername"),
                    "column": hxs_extract(speech, "./@column"),
                    "date": datetime.strptime(RE_DATE.search(path).group(0), "%Y-%m-%d"),
                    "time": hxs_extract(speech, "./@time"),
                    "url": hxs_extract(speech, "./@url"),
                    "text": speech.select("./*").extract()
                }
                writer.save(record)
            return os.path.basename(path), saved
    except KeyboardInterrupt:
        log.warning("Caught exception in %s, sending stop" % os.getpid())
        raise StopIteration()
Пример #4
0
 def test_xmlxpathselector(self):
     """Verify select()/extract() output and the RuntimeError from css()."""
     expected = [u'<div><img src="a.jpg"><p>Hello</p></img></div>']
     with warnings.catch_warnings():
         # Deprecation warnings from Scrapy are ignored inside this block.
         warnings.simplefilter('ignore', ScrapyDeprecationWarning)
         xml_selector = XmlXPathSelector(text=self.text)
         divs = xml_selector.select("//div")
         self.assertEqual(divs.extract(), expected)
         self.assertRaises(RuntimeError, xml_selector.css, 'div')
Пример #5
0
    def test_remove_attributes_namespaces(self):
        xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
  <link atom:type="text/html">
  <link atom:type="application/atom+xml">
</feed>
"""
        xxs = XmlXPathSelector(XmlResponse("http://example.com/feed.atom", body=xml))
        self.assertEqual(len(xxs.select("//link/@type")), 0)
        xxs.remove_namespaces()
        self.assertEqual(len(xxs.select("//link/@type")), 2)
Пример #6
0
def commons_speech_saver(path, _writer=None):
    """
    Parse a hansard archive file and save it to the backing store.

    The zip at ``path`` is expected to hold an XML member named after the
    archive (``.zip`` replaced by ``.xml``). Each ``housecommons`` element
    is processed only when ``random.random()`` falls below the module-level
    ``rate``; for a processed element, every ``p`` descendant is saved as
    one record through ``writer.save``. The writer is ``_writer`` when a
    truthy one is given, otherwise the module-level ``_writer`` global.

    Returns a ``(basename of path, number of records saved)`` tuple. A bad
    zip file is logged at debug level and whatever was counted so far is
    returned. A ``KeyboardInterrupt`` is logged and replaced by
    ``StopIteration``.
    """
    writer = _writer or globals()['_writer']
    rate = globals()['rate']

    try:
        count = 0
        try:
            fix_bad_zipfile(path)
            with zipfile.ZipFile(path) as archive:
                member_name = os.path.basename(path).replace(".zip", ".xml")
                with archive.open(member_name) as member:
                    raw_xml = unicode(member.read(), errors="ignore")
                    document = XmlXPathSelector(text=raw_xml)
                    for sitting in document.select(r'//housecommons'):
                        # Sampling: skip this element unless the draw is below rate.
                        if not random.random() < rate:
                            continue

                        raw_date = sitting.select(r'.//date/@format').extract()[0]
                        speech_date = datetime.datetime.strptime(raw_date.strip(), "%Y-%m-%d")

                        for paragraph in sitting.select(r'.//p'):
                            record = {
                                "id": "hansardarchives/%s" % hxs_extract(paragraph, r'./@id'),
                                "house": "commons",
                                "source": "hansardarchives",
                                "speakerid": None,
                                "speakername": hxs_extract(paragraph, r'./member/text()'),
                                "column": None,
                                "date": speech_date,
                                "time": None,
                                "url": None,
                                "text": [hxs_extract(paragraph, r'./membercontribution/text()')]
                            }
                            writer.save(record)
                            count += 1
        except zipfile.BadZipfile:
            log.debug("Bad zip file %s" % os.path.basename(path))
        return os.path.basename(path), count
    except KeyboardInterrupt:
        log.warning("Caught exception in %s, sending stop" % os.getpid())
        raise StopIteration()
Пример #7
0
    def test_remove_namespaces(self):
        xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
  <link type="text/html">
  <link type="application/atom+xml">
</feed>
"""
        xxs = XmlXPathSelector(
            XmlResponse("http://example.com/feed.atom", body=xml))
        self.assertEqual(len(xxs.select("//link")), 0)
        xxs.remove_namespaces()
        self.assertEqual(len(xxs.select("//link")), 2)
Пример #8
0
 def test_xmlxpathselector(self):
     """Exercise XmlXPathSelector: XPath extraction works, css() raises."""
     # Warnings raised in the block are recorded rather than emitted.
     with warnings.catch_warnings(record=True):
         xml_selector = XmlXPathSelector(text=self.text)
         found = xml_selector.select("//div").extract()
         wanted = [u'<div><img src="a.jpg"><p>Hello</p></img></div>']
         self.assertEqual(found, wanted)
         self.assertRaises(RuntimeError, xml_selector.css, 'div')