def test_xmliter_unicode(self):
    """Iterate nodes whose tag names contain non-ASCII characters.

    The same document is wrapped twice: once as UTF-8 encoded bytes and once
    as text with an explicit ``encoding``. For each response, the ``id``
    attribute and two text queries of every ``þingflokkur`` node are
    collected and compared against the expected triples.
    """
    # example taken from https://github.com/scrapy/scrapy/issues/1665
    body = u"""<?xml version="1.0" encoding="UTF-8"?>
        <þingflokkar>
           <þingflokkur id="26">
              <heiti />
              <skammstafanir>
                 <stuttskammstöfun>-</stuttskammstöfun>
                 <löngskammstöfun />
              </skammstafanir>
              <tímabil>
                 <fyrstaþing>80</fyrstaþing>
              </tímabil>
           </þingflokkur>
           <þingflokkur id="21">
              <heiti>Alþýðubandalag</heiti>
              <skammstafanir>
                 <stuttskammstöfun>Ab</stuttskammstöfun>
                 <löngskammstöfun>Alþb.</löngskammstöfun>
              </skammstafanir>
              <tímabil>
                 <fyrstaþing>76</fyrstaþing>
                 <síðastaþing>123</síðastaþing>
              </tímabil>
           </þingflokkur>
           <þingflokkur id="27">
              <heiti>Alþýðuflokkur</heiti>
              <skammstafanir>
                 <stuttskammstöfun>A</stuttskammstöfun>
                 <löngskammstöfun>Alþfl.</löngskammstöfun>
              </skammstafanir>
              <tímabil>
                 <fyrstaþing>27</fyrstaþing>
                 <síðastaþing>120</síðastaþing>
              </tímabil>
           </þingflokkur>
        </þingflokkar>"""

    # with bytes
    bytes_response = XmlResponse(
        url="http://example.com", body=body.encode('utf-8'))
    # Unicode body needs encoding information
    text_response = XmlResponse(
        url="http://example.com", body=body, encoding='utf-8')

    expected = [
        (u'26', [u'-'], [u'80']),
        (u'21', [u'Ab'], [u'76']),
        (u'27', [u'A'], [u'27']),
    ]
    for response in (bytes_response, text_response):
        collected = [
            (
                node.attrib['id'],
                node.xpath(
                    u'./skammstafanir/stuttskammstöfun/text()').getall(),
                node.xpath(u'./tímabil/fyrstaþing/text()').getall(),
            )
            for node in self.xmliter(response, u'þingflokkur')
        ]
        self.assertEqual(collected, expected)
def parse_field(self, html, fn):
    """Load a ``LobbyistDeclaration`` item from an XML fragment.

    ``html`` is embedded inside ``<book><row>…</row></book>``. ``fn`` is
    called with the first ``entry`` node of the resulting response and its
    return value is fed to the loader via ``add_value(None, ...)``.

    Returns the loaded item converted to a plain ``dict``.
    """
    document = '<book><row>%s</row></book>' % html
    response = XmlResponse('http://localhost/test.html', body=document)

    # Both lookups take the first match and raise IndexError if none exists.
    row = response.css('row')[0]
    entry = response.css('entry')[0]

    loader = Loader(self.spider, response, LobbyistDeclaration(), row)
    loader.add_value(None, fn(entry))
    return dict(loader.load_item())
def test_parse_declaration_xml_4_columns(self):
    """Parse the 2012 declaration fixture and check the first item."""
    # this format was used for 2012 and 2013 declarations
    url = 'http://old.vtek.lt/vtek/.../deklaracija2012.doc'
    payload = fixture('lobist_veiklos_atatskaita_2012.doc.xml')
    response = XmlResponse(url, body=payload)
    response.request = scrapy.Request(response.url)
    response.request.meta['year'] = '2012'

    items = list(self.spider.parse_declaration_xml(response))

    self.assertEqual(len(items), 30)
    first = items[0]
    self.assertEqual(first['name'], 'ROMAS STUMBRYS')
    self.assertEqual(first['comments'], u'Lobistinės veiklos nevykdė')
    self.assertEqual(first['year'], '2012')
    self.assertEqual(first['source_url'], response.url)
def test_parse_declaration_xml_4_columns(self):
    """Parse the 2012 declaration fixture and check the first item."""
    # this format was used for 2012 and 2013 declarations
    url = 'http://old.vtek.lt/vtek/.../deklaracija2012.doc'
    payload = fixture('lobist_veiklos_atatskaita_2012.doc.xml')
    response = XmlResponse(url, body=payload)
    response.request = scrapy.Request(response.url)
    response.request.meta['year'] = '2012'

    items = list(self.spider.parse_declaration_xml(response))

    self.assertEqual(len(items), 30)
    first = items[0]
    self.assertEqual(first['name'], 'ROMAS STUMBRYS')
    self.assertEqual(first['comments'], u'Lobistinės veiklos nevykdė')
    self.assertEqual(first['year'], '2012')
    self.assertEqual(first['source_url'], response.url)
def test_parse_declaration_xml_5_columns(self):
    """Parse the 2014 declaration fixture and check first and last items."""
    # this format was used for 2014 declarations
    url = 'http://old.vtek.lt/vtek/.../deklaracija2014.doc'
    payload = fixture('Info_apie_lobistu_ataskaitas_2014_2015_04_08.doc.xml')
    response = XmlResponse(url, body=payload)
    response.request = scrapy.Request(response.url)
    response.request.meta['year'] = '2014'

    items = list(self.spider.parse_declaration_xml(response))

    self.assertEqual(len(items), 34)
    first = items[0]
    self.assertEqual(first['name'], 'ROMAS STUMBRYS')
    self.assertEqual(first['year'], '2014')
    self.assertEqual(first['source_url'], response.url)
    last = items[-1]
    self.assertEqual(last['name'], u'UAB INLINEN')
    self.assertEqual(last['comments'], u'Lobistinės veiklos nevykdė')
def test_xmliter_namespaces(self):
    """Query prefixed and unprefixed children of the first ``item`` node.

    After registering the ``g`` prefix on the node, ``g:``-qualified queries
    must return the namespaced values, while the same local names without a
    prefix must return nothing.
    """
    body = b"""\
        <?xml version="1.0" encoding="UTF-8"?>
        <rss version="2.0" xmlns:g="http://base.google.com/ns/1.0">
            <channel>
            <title>My Dummy Company</title>
            <link>http://www.mydummycompany.com</link>
            <description>This is a dummy company. We do nothing.</description>
            <item>
                <title>Item 1</title>
                <description>This is item 1</description>
                <link>http://www.mydummycompany.com/items/1</link>
                <g:image_link>http://www.mydummycompany.com/images/item1.jpg</g:image_link>
                <g:id>ITEM_1</g:id>
                <g:price>400</g:price>
            </item>
            </channel>
        </rss>
    """
    response = XmlResponse(url='http://mydummycompany.com', body=body)
    item = next(self.xmliter(response, 'item'))
    item.register_namespace('g', 'http://base.google.com/ns/1.0')

    # (xpath query, expected text values), checked in this order
    expectations = (
        ('title/text()', ['Item 1']),
        ('description/text()', ['This is item 1']),
        ('link/text()', ['http://www.mydummycompany.com/items/1']),
        ('g:image_link/text()',
         ['http://www.mydummycompany.com/images/item1.jpg']),
        ('g:id/text()', ['ITEM_1']),
        ('g:price/text()', ['400']),
        ('image_link/text()', []),
        ('id/text()', []),
        ('price/text()', []),
    )
    for query, expected in expectations:
        self.assertEqual(item.xpath(query).getall(), expected)
def test_xmliter_namespaces_prefix(self):
    """Iterate ``table`` nodes selected by namespace URI and prefix.

    The document holds two ``table`` elements in different namespaces; each
    call to ``self.xmliter`` passes one namespace/prefix pair and the first
    yielded node is then queried using that prefix.
    """
    body = b"""\
    <?xml version="1.0" encoding="UTF-8"?>
    <root>
        <h:table xmlns:h="http://www.w3.org/TR/html4/">
          <h:tr>
            <h:td>Apples</h:td>
            <h:td>Bananas</h:td>
          </h:tr>
        </h:table>

        <f:table xmlns:f="http://www.w3schools.com/furniture">
          <f:name>African Coffee Table</f:name>
          <f:width>80</f:width>
          <f:length>120</f:length>
        </f:table>

    </root>
    """
    response = XmlResponse(url='http://mydummycompany.com', body=body)

    html_tables = self.xmliter(
        response, 'table', 'http://www.w3.org/TR/html4/', 'h')
    html_table = next(html_tables)
    self.assertEqual(len(html_table.xpath('h:tr/h:td').getall()), 2)
    self.assertEqual(
        html_table.xpath('h:tr/h:td[1]/text()').getall(), ['Apples'])
    self.assertEqual(
        html_table.xpath('h:tr/h:td[2]/text()').getall(), ['Bananas'])

    furniture_tables = self.xmliter(
        response, 'table', 'http://www.w3schools.com/furniture', 'f')
    furniture_table = next(furniture_tables)
    self.assertEqual(
        furniture_table.xpath('f:name/text()').getall(),
        ['African Coffee Table'])
def test_selector_namespaces_multiple(self):
    """Select nodes and attributes across three registered namespaces.

    The document's default namespace is registered under the ``xmlns``
    prefix, alongside the ``p`` and ``b`` prefixes, and then used in
    element, attribute and chained ``select`` queries.
    """
    body = """<?xml version="1.0" encoding="UTF-8"?>
<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
            xmlns:b="http://somens.com"
            xmlns:p="http://www.scrapy.org/product" >
    <b:Operation>hello</b:Operation>
    <TestTag b:att="value"><Other>value</Other></TestTag>
    <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag>
</BrowseNode>
    """
    response = XmlResponse(url="http://example.com", body=body)
    x = self.xxs_cls(response)

    namespaces = (
        ("xmlns",
         "http://webservices.amazon.com/AWSECommerceService/2005-10-05"),
        ("p", "http://www.scrapy.org/product"),
        ("b", "http://somens.com"),
    )
    for prefix, uri in namespaces:
        x.register_namespace(prefix, uri)

    self.assertEqual(len(x.select("//xmlns:TestTag")), 1)

    operation = x.select("//b:Operation/text()").extract()[0]
    self.assertEqual(operation, 'hello')

    attribute = x.select("//xmlns:TestTag/@b:att").extract()[0]
    self.assertEqual(attribute, 'value')

    price = x.select("//p:SecondTestTag/xmlns:price/text()").extract()[0]
    self.assertEqual(price, '90')

    # Same price, reached through a relative query on the parent selection.
    second_tag = x.select("//p:SecondTestTag")
    chained_price = second_tag.select("./xmlns:price/text()")[0].extract()
    self.assertEqual(chained_price, '90')

    material = x.select(
        "//p:SecondTestTag/xmlns:material/text()").extract()[0]
    self.assertEqual(material, 'iron')
def test_invalid_xpath_unicode(self):
    """Test *Unicode* invalid xpath raises ValueError with the invalid xpath"""
    response = XmlResponse(url="http://example.com", body="<html></html>")
    x = self.sscls(response)
    xpath = u"//test[@foo='\u0431ar]"
    # On Python 2 the expected pattern is the escaped form of the xpath;
    # on Python 3 it is the text itself.
    encoded = xpath if six.PY3 else xpath.encode('unicode_escape')
    # ``assertRaisesRegexp`` is a deprecated alias that Python 3.12 removed;
    # ``six.assertRaisesRegex`` dispatches to whichever name the running
    # interpreter provides.
    six.assertRaisesRegex(
        self, ValueError, re.escape(encoded), x.xpath, xpath)
def test_xmliter_encoding(self):
    """Decode an ``item`` node from a body declared as ISO-8859-9.

    The body is raw bytes; the first node yielded by ``self.xmliter`` must
    serialise to text with the Turkish characters decoded.
    """
    body = b'<?xml version="1.0" encoding="ISO-8859-9"?>\n<xml>\n <item>Some Turkish Characters \xd6\xc7\xde\xdd\xd0\xdc \xfc\xf0\xfd\xfe\xe7\xf6</item>\n</xml>\n\n'
    expected = u'<item>Some Turkish Characters \xd6\xc7\u015e\u0130\u011e\xdc \xfc\u011f\u0131\u015f\xe7\xf6</item>'

    response = XmlResponse('http://www.example.com', body=body)
    first_item = next(self.xmliter(response, 'item'))
    self.assertEqual(first_item.get(), expected)
def test_parse_declaration_xml_5_columns(self):
    """Parse the 2014 declaration fixture and check first and last items."""
    # this format was used for 2014 declarations
    url = 'http://old.vtek.lt/vtek/.../deklaracija2014.doc'
    payload = fixture('Info_apie_lobistu_ataskaitas_2014_2015_04_08.doc.xml')
    response = XmlResponse(url, body=payload)
    response.request = scrapy.Request(response.url)
    response.request.meta['year'] = '2014'

    items = list(self.spider.parse_declaration_xml(response))

    self.assertEqual(len(items), 34)
    first = items[0]
    self.assertEqual(first['name'], 'ROMAS STUMBRYS')
    self.assertEqual(first['year'], '2014')
    self.assertEqual(first['source_url'], response.url)
    last = items[-1]
    self.assertEqual(last['name'], u'UAB INLINEN')
    self.assertEqual(last['comments'], u'Lobistinės veiklos nevykdė')
def test_xmliter(self):
    """Iterate ``product`` nodes and read their attribute and child text."""
    body = b"""
        <?xml version="1.0" encoding="UTF-8"?>
        <products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                  xsi:noNamespaceSchemaLocation="someschmea.xsd">
          <product id="001">
            <type>Type 1</type>
            <name>Name 1</name>
          </product>
          <product id="002">
            <type>Type 2</type>
            <name>Name 2</name>
          </product>
        </products>
    """
    response = XmlResponse(url="http://example.com", body=body)

    collected = [
        (
            product.attrib["id"],
            product.xpath("name/text()").getall(),
            product.xpath("./type/text()").getall(),
        )
        for product in self.xmliter(response, "product")
    ]

    expected = [
        ("001", ["Name 1"], ["Type 1"]),
        ("002", ["Name 2"], ["Type 2"]),
    ]
    self.assertEqual(collected, expected)
def test_xhtml(self):
    """Extract links from an XHTML body wrapped as HTML and as XML.

    The same bytes are wrapped first in an ``HtmlResponse`` and then in an
    ``XmlResponse``; a fresh extractor must return the same five links for
    both.
    """
    xhtml = b"""
    <?xml version="1.0"?>
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    <head>
        <title>XHTML document title</title>
    </head>
    <body>
        <div class='links'>
        <p><a href="/about.html">About us</a></p>
        </div>
        <div>
        <p><a href="/follow.html">Follow this link</a></p>
        </div>
        <div>
        <p><a href="/nofollow.html" rel="nofollow">Dont follow this one</a></p>
        </div>
        <div>
        <p><a href="/nofollow2.html" rel="blah">Choose to follow or not</a></p>
        </div>
        <div>
        <p><a href="http://google.com/something" rel="external nofollow">External link not to follow</a></p>
        </div>
    </body>
    </html>
    """

    expected_links = [
        Link(url='http://example.com/about.html', text='About us',
             fragment='', nofollow=False),
        Link(url='http://example.com/follow.html', text='Follow this link',
             fragment='', nofollow=False),
        Link(url='http://example.com/nofollow.html',
             text='Dont follow this one', fragment='', nofollow=True),
        Link(url='http://example.com/nofollow2.html',
             text='Choose to follow or not', fragment='', nofollow=False),
        Link(url='http://google.com/something',
             text='External link not to follow', nofollow=True),
    ]

    # HTML flavour first, then XML, each with a freshly built extractor.
    for response_cls in (HtmlResponse, XmlResponse):
        response = response_cls("http://example.com/index.xhtml", body=xhtml)
        extractor = self.extractor_cls()
        self.assertEqual(extractor.extract_links(response), expected_links)
def test_spider_parse_cities():
    """The first job parsed from the cities feed lists both raw locations."""
    feed = Path(FIXTURES_DIR / 'feed_cities.xml').read_bytes()
    response = XmlResponse('https://example.com/example/', body=feed)

    jobs = startupjobs.Spider().parse(response)
    first_job = next(jobs)

    assert first_job['locations_raw'] == ['Praha, Česko', 'Olomouc, Česko']
def test_spider_parse_job_types():
    """The first job parsed from the job-types feed lists both types."""
    feed = Path(FIXTURES_DIR / 'feed_job_types.xml').read_bytes()
    response = XmlResponse('https://example.com/example/', body=feed)

    jobs = startupjobs.Spider().parse(response)
    first_job = next(jobs)

    assert first_job['employment_types'] == ['Full-time', 'External collaboration']
def test_xmliter_iterate_namespace(self):
    """Iterate ``image_link`` nodes with and without the default namespace.

    Every element in the document lives in the default namespace, so
    iterating without a namespace must yield nothing, while passing the
    namespace URI must yield both ``image_link`` nodes in document order.
    """
    body = """\
        <?xml version="1.0" encoding="UTF-8"?>
        <rss version="2.0" xmlns="http://base.google.com/ns/1.0">
            <channel>
            <title>My Dummy Company</title>
            <link>http://www.mydummycompany.com</link>
            <description>This is a dummy company. We do nothing.</description>
            <item>
                <title>Item 1</title>
                <description>This is item 1</description>
                <link>http://www.mydummycompany.com/items/1</link>
                <image_link>http://www.mydummycompany.com/images/item1.jpg</image_link>
                <image_link>http://www.mydummycompany.com/images/item2.jpg</image_link>
            </item>
            </channel>
        </rss>
    """
    response = XmlResponse(url='http://mydummycompany.com', body=body)

    no_namespace_iter = self.xmliter(response, 'image_link')
    self.assertEqual(len(list(no_namespace_iter)), 0)

    namespace_iter = self.xmliter(
        response, 'image_link', 'http://base.google.com/ns/1.0')

    # Builtin next() works on Python 2.6+ and Python 3, whereas the
    # ``iterator.next()`` method only exists on Python 2.
    node = next(namespace_iter)
    self.assertEqual(
        node.select('text()').extract(),
        ['http://www.mydummycompany.com/images/item1.jpg'])
    node = next(namespace_iter)
    self.assertEqual(
        node.select('text()').extract(),
        ['http://www.mydummycompany.com/images/item2.jpg'])
def test_xmliter_iterate_namespace(self):
    """Iterate ``image_link`` nodes with and without the default namespace.

    Without a namespace argument no node may be yielded; with the document's
    default namespace URI both ``image_link`` nodes come back in order.
    """
    body = b"""
        <?xml version="1.0" encoding="UTF-8"?>
        <rss version="2.0" xmlns="http://base.google.com/ns/1.0">
            <channel>
            <title>My Dummy Company</title>
            <link>http://www.mydummycompany.com</link>
            <description>This is a dummy company. We do nothing.</description>
            <item>
                <title>Item 1</title>
                <description>This is item 1</description>
                <link>http://www.mydummycompany.com/items/1</link>
                <image_link>http://www.mydummycompany.com/images/item1.jpg</image_link>
                <image_link>http://www.mydummycompany.com/images/item2.jpg</image_link>
            </item>
            </channel>
        </rss>
    """
    response = XmlResponse(url="http://mydummycompany.com", body=body)

    unqualified = list(self.xmliter(response, "image_link"))
    self.assertEqual(len(unqualified), 0)

    qualified = self.xmliter(
        response, "image_link", "http://base.google.com/ns/1.0")
    expected_urls = (
        "http://www.mydummycompany.com/images/item1.jpg",
        "http://www.mydummycompany.com/images/item2.jpg",
    )
    for expected_url in expected_urls:
        node = next(qualified)
        self.assertEqual(node.xpath("text()").getall(), [expected_url])
def test_spider_parse_html_entities():
    """Title and company name of the first job contain a literal ``&``."""
    feed = Path(FIXTURES_DIR / 'feed_html_entities.xml').read_bytes()
    response = XmlResponse('https://example.com/example/', body=feed)

    jobs = startupjobs.Spider().parse(response)
    first_job = next(jobs)

    assert first_job['title'] == 'Analytik&programátor Junior'
    assert first_job['company_name'] == 'P&J Capital'
def test_selector_invalid_xpath(self):
    """An invalid XPath must raise ValueError mentioning the expression."""
    response = XmlResponse(url="http://example.com", body="<html></html>")
    x = self.hxs_cls(response)
    xpath = "//test[@foo='bar]"  # unterminated string literal
    try:
        x.select(xpath)
    # ``except X as e`` is valid on Python 2.6+ and Python 3; the old
    # ``except X, e`` spelling is a syntax error on Python 3.
    except ValueError as e:
        assert xpath in str(e), "Exception message does not contain invalid xpath"
    else:
        # Without this branch the test would pass silently when no
        # exception is raised at all.
        raise AssertionError("A invalid XPath does not raise an exception")
def test_get_sitemap_body(self):
    """Check the sitemap body expected for three response classes.

    Each case builds a response with ``self.BODY`` and hands it to
    ``self.assertSitemapBody`` together with the expected result: the body
    itself for ``XmlResponse``, ``None`` for the other two.
    """
    cases = (
        (XmlResponse, "http://www.example.com/", self.BODY),
        (HtmlResponse, "http://www.example.com/", None),
        (Response, "http://www.example.com/favicon.ico", None),
    )
    for response_cls, url, expected_body in cases:
        response = response_cls(url=url, body=self.BODY)
        self.assertSitemapBody(response, expected_body)
def test_spider_parse_cities_job_objects_are_copies():
    """Mutating the first parsed job must not affect the second one.

    The first two jobs share the same ``link``; changing the ``title`` of
    the first must leave the ``title`` of the second untouched.
    """
    feed = Path(FIXTURES_DIR / 'feed_cities.xml').read_bytes()
    response = XmlResponse('https://example.com/example/', body=feed)

    jobs = list(startupjobs.Spider().parse(response))
    first_job, second_job = jobs[0], jobs[1]
    first_job['title'] = 'Modified'

    assert first_job['link'] == second_job['link']
    assert first_job['title'] == 'Modified'
    assert second_job['title'] == 'Server / Cloud / DevOps Admin'
def test_parse_xml_report(self):
    """Parse XML 10-Q or 10-K report."""
    spider = EdgarSpider()
    spider._follow_links = True  # HACK

    # NOTE(review): the declaration and the DocumentFiscalYearFocus closing
    # tag below are not well-formed; kept exactly as-is because the parser's
    # tolerance of them may be what this test relies on — confirm.
    body = '''
        <?xml version="1.0">
        <xbrl xmlns="http://www.xbrl.org/2003/instance"
              xmlns:xbrli="http://www.xbrl.org/2003/instance"
              xmlns:dei="http://xbrl.sec.gov/dei/2011-01-31"
              xmlns:us-gaap="http://fasb.org/us-gaap/2011-01-31">

          <context id="c1">
            <startDate>2013-03-31</startDate>
            <endDate>2013-06-28</endDate>
          </context>

          <dei:AmendmentFlag contextRef="c1">false</dei:AmendmentFlag>
          <dei:DocumentType contextRef="c1">10-Q</dei:DocumentType>
          <dei:DocumentFiscalPeriodFocus contextRef="c1">Q2</dei:DocumentFiscalPeriodFocus>
          <dei:DocumentPeriodEndDate contextRef="c1">2013-06-28</dei:DocumentPeriodEndDate>
          <dei:DocumentFiscalYearFocus>2013</dei>

          <us-gaap:Revenues contextRef="c1">100</us-gaap:Revenues>
          <us-gaap:NetIncomeLoss contextRef="c1">200</us-gaap:NetIncomeLoss>
          <us-gaap:EarningsPerShareBasic contextRef="c1">0.2</us-gaap:EarningsPerShareBasic>
          <us-gaap:EarningsPerShareDiluted contextRef="c1">0.19</us-gaap:EarningsPerShareDiluted>
          <us-gaap:CommonStockDividendsPerShareDeclared contextRef="c1">0.07</us-gaap:CommonStockDividendsPerShareDeclared>

          <us-gaap:Assets contextRef="c1">1600</us-gaap:Assets>
          <us-gaap:StockholdersEquity contextRef="c1">300</us-gaap:StockholdersEquity>
          <us-gaap:CashAndCashEquivalentsAtCarryingValue contextRef="c1">150</us-gaap:CashAndCashEquivalentsAtCarryingValue>
        </xbrl>
    '''
    report_url = 'http://sec.gov/Archives/edgar/data/123/abc-20130720.xml'
    response = XmlResponse(report_url, body=body)

    item = spider.parse_10qk(response)

    expected = {
        'symbol': 'ABC',
        'amend': False,
        'doc_type': '10-Q',
        'period_focus': 'Q2',
        'fiscal_year': 2013,
        'end_date': '2013-06-28',
        'revenues': 100.0,
        'net_income': 200.0,
        'eps_basic': 0.2,
        'eps_diluted': 0.19,
        'dividend': 0.07,
        'assets': 1600.0,
        'equity': 300.0,
        'cash': 150.0,
    }
    self.assert_item(item, expected)
def test_xml_entity_expansion(self):
    """An external entity reference must be left unexpanded.

    The document declares ``xxe`` as a SYSTEM entity pointing at a local
    file; the extracted markup must still contain the literal ``&xxe;``.
    """
    malicious_xml = (
        '<?xml version="1.0" encoding="ISO-8859-1"?>'
        '<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM '
        '"file:///etc/passwd" >]><foo>&xxe;</foo>'
    )
    response = XmlResponse('http://example.com', body=malicious_xml)
    selector = self.sscls(response=response)

    self.assertEqual(selector.extract(), '<foo>&xxe;</foo>')
def test_xmliter_unusual_node(self):
    """Iterate a node name containing dots without matching its sibling.

    Only ``matchme...`` may be yielded; ``matchmenot`` shares the prefix but
    must not be returned.
    """
    body = b"""<?xml version="1.0" encoding="UTF-8"?>
        <root>
            <matchme...></matchme...>
            <matchmenot></matchmenot>
        </root>
    """
    response = XmlResponse(url="http://example.com", body=body)

    matches = self.xmliter(response, 'matchme...')
    nodenames = [node.xpath('name()').getall() for node in matches]

    self.assertEqual(nodenames, [['matchme...']])
def test_flavor_detection(self):
    """The selector type follows the response class it is built from.

    The same malformed markup is wrapped as XML and as HTML; the selector
    must report the matching ``type`` and serialise ``//div`` accordingly.
    """
    text = b'<div><img src="a.jpg"><p>Hello</div>'

    # (response class, expected selector type, expected //div serialisation)
    cases = (
        (XmlResponse, 'xml',
         [u'<div><img src="a.jpg"><p>Hello</p></img></div>']),
        (HtmlResponse, 'html',
         [u'<div><img src="a.jpg"><p>Hello</p></div>']),
    )
    for response_cls, expected_type, expected_divs in cases:
        response = response_cls(
            'http://example.com', body=text, encoding='utf-8')
        sel = Selector(response)
        self.assertEqual(sel.type, expected_type)
        self.assertEqual(sel.xpath("//div").getall(), expected_divs)
def parse_single(self, response):
    """Fetch one record through Sickle and parse it.

    The identifier is read from ``response.meta['identifier']`` and the
    metadata prefix from ``self.format``. The fetched record is stored in
    ``self._crawled_records`` under its identifier, then its raw payload is
    wrapped in an XML selector and passed to ``self.parse_record``.

    Returns whatever ``self.parse_record`` returns.
    """
    client = Sickle(self.url)
    metadata_prefix = self.format
    identifier = response.meta['identifier']

    record = client.GetRecord(
        metadataPrefix=metadata_prefix,
        identifier=identifier,
    )
    self._crawled_records[identifier] = record

    # Build a response from the raw record so it can be queried as XML.
    record_response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
    return self.parse_record(Selector(record_response, type='xml'))
def test_remove_attributes_namespaces(self):
    """Namespaced attributes become reachable after ``remove_namespaces``.

    ``//link/@type`` matches nothing while the attributes carry the ``atom``
    prefix, and matches both once namespaces are removed.
    """
    xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
  <link atom:type="text/html">
  <link atom:type="application/atom+xml">
</feed>
"""
    response = XmlResponse("http://example.com/feed.atom", body=xml)
    sel = self.sscls(response)
    query = "//link/@type"

    self.assertEqual(len(sel.xpath(query)), 0)
    sel.remove_namespaces()
    self.assertEqual(len(sel.xpath(query)), 2)
def test_flavor_detection(self):
    """The selector type follows the response class it is built from.

    The same malformed markup is wrapped as XML and as HTML; the selector
    must report the matching ``type`` and serialise ``//div`` accordingly.
    """
    text = '<div><img src="a.jpg"><p>Hello</div>'

    # (response class, expected selector type, expected //div serialisation)
    cases = (
        (XmlResponse, 'xml',
         [u'<div><img src="a.jpg"><p>Hello</p></img></div>']),
        (HtmlResponse, 'html',
         [u'<div><img src="a.jpg"><p>Hello</p></div>']),
    )
    for response_cls, expected_type, expected_divs in cases:
        sel = self.sscls(response_cls('http://example.com', body=text))
        self.assertEqual(sel.type, expected_type)
        self.assertEqual(sel.xpath("//div").extract(), expected_divs)
def test_remove_namespaces(self):
    """Elements in a default namespace match only after it is removed.

    ``//link`` finds nothing while the feed's default namespace is in
    effect, and finds both ``link`` elements after ``remove_namespaces``.
    """
    xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
  <link type="text/html">
  <link type="application/atom+xml">
</feed>
"""
    response = XmlResponse("http://example.com/feed.atom", body=xml)
    xxs = XmlXPathSelector(response)
    query = "//link"

    self.assertEqual(len(xxs.select(query)), 0)
    xxs.remove_namespaces()
    self.assertEqual(len(xxs.select(query)), 2)
def test_invalid_xpath(self):
    """An invalid XPath must raise ValueError mentioning the expression.

    Any other exception type, or no exception at all, fails the test.
    """
    response = XmlResponse(url="http://example.com", body="<html></html>")
    selector = self.sscls(response)
    bad_xpath = "//test[@foo='bar]"

    caught = None
    try:
        selector.xpath(bad_xpath)
    except ValueError as exc:
        # Keep a reference: the ``as`` name is unbound once the handler exits.
        caught = exc
    except Exception:
        raise AssertionError("A invalid XPath does not raise ValueError")

    if caught is None:
        raise AssertionError("A invalid XPath does not raise an exception")
    assert bad_xpath in str(caught), "Exception message does not contain invalid xpath"
def test_selector_namespaces_simple(self):
    """Select only the prefixed ``a`` element after registering its prefix."""
    # NOTE(review): the first <somens:a> is closed with a plain </a>; kept
    # as-is since the test may rely on the parser tolerating it — confirm.
    body = """
    <test xmlns:somens="http://scrapy.org">
       <somens:a id="foo">take this</a>
       <a id="bar">found</a>
    </test>
    """
    response = XmlResponse(url="http://example.com", body=body)
    selector = self.xxs_cls(response)
    selector.register_namespace("somens", "http://scrapy.org")

    texts = selector.select("//somens:a/text()").extract()
    self.assertEqual(texts, [u'take this'])
def test_links_from_sitemap(self):
    """Parse the sample sitemap and check the URLs of the yielded requests.

    The fixture is read from ``data/sitemap_sample.xml`` under ``_PATH`` and
    served as an XML response; the ``sitemaps`` spider must yield exactly the
    three post-sitemap URLs, in any order.
    """
    # Use a context manager so the fixture's file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(join(_PATH, "data", "sitemap_sample.xml")) as sitemap_file:
        body = sitemap_file.read()

    response = XmlResponse(
        url="http://example.com/sample.xml", body=body,
        headers={'Content-Type': "text/xml; charset=UTF-8"})

    name = "sitemaps"
    spider = self.smanager.create(name)

    urls = [r.url for r in spider.parse(response)]

    self.assertEqual(len(urls), 3)
    self.assertEqual(set(urls), set([
        "https://www.siliconrepublic.com/post-sitemap1.xml",
        "https://www.siliconrepublic.com/post-sitemap2.xml",
        "https://www.siliconrepublic.com/post-sitemap3.xml"]))
def parse_law_projects(self, html, fn):
    """Apply ``fn`` to the first ``entry`` node of a wrapped XML fragment.

    ``html`` is embedded inside ``<book>…</book>`` and served as an XML
    response. Raises ``IndexError`` if the fragment has no ``entry`` node.

    Returns whatever ``fn`` returns.
    """
    document = '<book>%s</book>' % html
    response = XmlResponse('http://localhost/test.html', body=document)
    first_entry = response.css('entry')[0]
    return fn(first_entry)