def test_w3c_7_1(self):
    """Strict microdata extraction of W3C spec example 7.1 with a base URL."""
    html = get_testdata("w3c", "microdata.7.1.html")
    want = json.loads(get_testdata("w3c", "microdata.7.1.json").decode("UTF-8"))
    extractor = MicrodataExtractor(strict=True)
    got = extractor.extract(html, "http://blog.example.com/progress-report")
    self.assertDictEqual(got, want)
def test_w3c_5_2(self):
    """Strict, flattened (nested=False) extraction of W3C microdata example 5.2."""
    html = get_testdata('w3c', 'microdata.5.2.html')
    want = json.loads(
        get_testdata('w3c', 'microdata.5.2.flat.json').decode('UTF-8'))
    extractor = MicrodataExtractor(nested=False, strict=True)
    self.assertDictEqual(extractor.extract(html), want)
def test_w3c_object_element(self):
    """Strict microdata extraction from a page using an <object> element."""
    html = get_testdata('w3c', 'microdata.object.html')
    want = json.loads(
        get_testdata('w3c', 'microdata.object.json').decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    got = extractor.extract(html, 'http://www.example.com/microdata/test')
    self.assertDictEqual(got, want)
def test_w3c_data_element(self):
    """Strict microdata extraction from a page using a <data> element (spec 4.2)."""
    html = get_testdata('w3c', 'microdata.4.2.data.html')
    want = json.loads(
        get_testdata('w3c', 'microdata.4.2.data.json').decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    self.assertDictEqual(extractor.extract(html), want)
def test_extraction_encoding(self):
    """Link text/URLs must follow the response encoding (header-declared,
    default, and latin1-detected responses).

    Fix: the original used ``'…'.decode('utf-8')`` on str literals, which is
    Python-2-only syntax (``str`` has no ``decode`` in Python 3).  The decoded
    results are spelled as equivalent unicode literals instead, which behave
    identically on Python 2 and are valid on Python 3.
    """
    body = get_testdata("link_extractor", "linkextractor_noenc.html")
    response_utf8 = HtmlResponse(
        url="http://example.com/utf8", body=body,
        headers={"Content-Type": ["text/html; charset=utf-8"]}
    )
    response_noenc = HtmlResponse(url="http://example.com/noenc", body=body)
    body = get_testdata("link_extractor", "linkextractor_latin1.html")
    response_latin1 = HtmlResponse(url="http://example.com/latin1", body=body)
    lx = BaseSgmlLinkExtractor()
    # u'\u20ac' (euro sign) == b'\xe2\x82\xac'.decode('utf-8')
    self.assertEqual(
        lx.extract_links(response_utf8),
        [
            Link(url="http://example.com/sample_%C3%B1.html", text=""),
            Link(url="http://example.com/sample_%E2%82%AC.html",
                 text=u"sample \u20ac text"),
        ],
    )
    self.assertEqual(
        lx.extract_links(response_noenc),
        [
            Link(url="http://example.com/sample_%C3%B1.html", text=""),
            Link(url="http://example.com/sample_%E2%82%AC.html",
                 text=u"sample \u20ac text"),
        ],
    )
    # u'\xe1' == b'\xe1'.decode('latin1')
    self.assertEqual(
        lx.extract_links(response_latin1),
        [
            Link(url="http://example.com/sample_%F1.html", text=""),
            Link(url="http://example.com/sample_%E1.html",
                 text=u"sample \xe1 text"),
        ],
    )
def test_w3c_5_2(self):
    """Microdata example 5.2 extracted with text content included."""
    html = get_testdata('w3c', 'microdata.5.2.html')
    want = json.loads(
        get_testdata('w3c', 'microdata.5.2.withtext.json').decode('UTF-8'))
    extractor = MicrodataExtractor(add_text_content=True)
    self.assertDictEqual(extractor.extract(html), want)
def test_w3c_7_1(self):
    """Strict microdata extraction of W3C spec example 7.1 with a base URL."""
    html = get_testdata('w3c', 'microdata.7.1.html')
    want = json.loads(get_testdata('w3c', 'microdata.7.1.json').decode('UTF-8'))
    extractor = MicrodataExtractor(strict=True)
    got = extractor.extract(html, 'http://blog.example.com/progress-report')
    self.assertDictEqual(got, want)
def test_extraction_encoding(self):
    """Link extraction honors the document encoding for link text, while the
    URL path component is always UTF-8 percent-encoded (only the query part
    follows the document encoding).

    Fix: replaced Python-2-only ``'…'.decode(...)`` calls on str literals with
    the equivalent unicode literals so the test is valid on Python 3 too.
    """
    body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8', body=body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
    body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)
    lx = BaseSgmlLinkExtractor()
    # u'\u20ac' (euro sign) == b'\xe2\x82\xac'.decode('utf-8')
    self.assertEqual(lx.extract_links(response_utf8), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text=u'sample \u20ac text'),
    ])
    self.assertEqual(lx.extract_links(response_noenc), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text=u'sample \u20ac text'),
    ])
    # document encoding does not affect URL path component, only query part
    # >>> u'sample_ñ.html'.encode('utf8')
    # b'sample_\xc3\xb1.html'
    # >>> u"sample_á.html".encode('utf8')
    # b'sample_\xc3\xa1.html'
    # >>> u"sample_ö.html".encode('utf8')
    # b'sample_\xc3\xb6.html'
    # >>> u"£32".encode('latin1')
    # b'\xa332'
    # >>> u"µ".encode('latin1')
    # b'\xb5'
    self.assertEqual(lx.extract_links(response_latin1), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%C3%A1.html',
             text=u'sample \xe1 text'),
        Link(url='http://example.com/sample_%C3%B6.html?price=%A332&%B5=unit',
             text=''),
    ])
def test_w3c_5_5(self):
    """Strict microdata extraction of W3C spec example 5.5."""
    html = get_testdata("w3c", "microdata.5.5.html")
    want = json.loads(get_testdata("w3c", "microdata.5.5.json").decode("UTF-8"))
    extractor = MicrodataExtractor(strict=True)
    self.assertDictEqual(extractor.extract(html), want)
def _test_data(formats):
    """Return the uncompressed sample feed plus a Response per compressed format."""
    uncompressed_body = get_testdata('compressed', 'feed-sample1.xml')
    # 'fmt' avoids shadowing the builtin 'format'
    test_responses = {
        fmt: Response('http://foo.com/bar',
                      body=get_testdata('compressed', 'feed-sample1.' + fmt))
        for fmt in formats
    }
    return uncompressed_body, test_responses
def test_schemaorg_MusicRecording(self):
    """Microdata extraction of schema.org MusicRecording sample pages."""
    for idx in [1]:
        html = get_testdata('schema.org', 'MusicRecording.{:03d}.html'.format(idx))
        want = json.loads(
            get_testdata('schema.org',
                         'MusicRecording.{:03d}.json'.format(idx)).decode('UTF-8'))
        extractor = MicrodataExtractor()
        self.assertDictEqual(extractor.extract(html), want)
def test_schemaorg_CreativeWork(self):
    """JSON-LD extraction of schema.org CreativeWork sample pages."""
    for idx in [1]:
        html = get_testdata('schema.org', 'CreativeWork.{:03d}.html'.format(idx))
        want = json.loads(
            get_testdata('schema.org',
                         'CreativeWork.{:03d}.jsonld'.format(idx)).decode('UTF-8'))
        extractor = JsonLdExtractor()
        self.assertDictEqual(extractor.extract(html), want)
def test_schemaorg_Event(self):
    """Microdata extraction of several schema.org Event sample pages."""
    for idx in [1, 2, 3, 4, 8]:
        html = get_testdata("schema.org", "Event.{:03d}.html".format(idx))
        want = json.loads(
            get_testdata("schema.org",
                         "Event.{:03d}.json".format(idx)).decode("UTF-8"))
        extractor = MicrodataExtractor()
        self.assertDictEqual(extractor.extract(html), want)
def test_wikipedia_xhtml_rdfa(self):
    """RDFa extraction from the Wikipedia XHTML+RDFa sample page."""
    fileprefix = 'xhtml+rdfa'
    html = get_testdata('wikipedia', fileprefix + '.html').decode('UTF-8')
    want = json.loads(
        get_testdata('wikipedia', fileprefix + '.expanded.json').decode('UTF-8'))
    extractor = RDFaExtractor()
    got = extractor.extract(html, url='http://www.exaple.com/index.html')
    self.assertJsonLDEqual(got, want)
def test_w3c_rdf11primer(self):
    """RDFa extraction of W3C RDF 1.1 Primer examples against expanded JSON-LD."""
    for idx in [14]:
        fileprefix = 'w3c.rdf11primer.example{:03d}'.format(idx)
        html = get_testdata('w3crdfa', fileprefix + '.html').decode('UTF-8')
        want = json.loads(
            get_testdata('w3crdfa',
                         fileprefix + '.expanded.json').decode('UTF-8'))
        extractor = RDFaExtractor()
        got = extractor.extract(html, url='http://www.exaple.com/index.html')
        self.assertJsonLDEqual(got, want)
def test_songkick(self):
    """JSON-LD extraction from saved Songkick artist/event pages."""
    pages = [
        "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
        # disabled fixtures, kept for reference:
        #"Maxïmo Park Gigography, Tour History & Past Concerts",
        #"Years & Years Tickets, Tour Dates 2015 & Concerts",
    ]
    for page in pages:
        html = get_testdata('songkick', '{}.html'.format(page))
        want = json.loads(
            get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))
        extractor = JsonLdExtractor()
        self.assertDictEqual(extractor.extract(html), want)
def test_csviter_encoding(self):
    """csviter decodes rows using the response's declared encoding."""
    latin1_body = get_testdata('feeds', 'feed-sample4.csv')
    cp852_body = get_testdata('feeds', 'feed-sample5.csv')

    response = TextResponse(url="http://example.com/", body=latin1_body,
                            encoding='latin1')
    self.assertEqual(
        list(csviter(response)),
        [{u'id': u'1', u'name': u'latin1', u'value': u'test'},
         {u'id': u'2', u'name': u'something', u'value': u'\xf1\xe1\xe9\xf3'}])

    response = TextResponse(url="http://example.com/", body=cp852_body,
                            encoding='cp852')
    self.assertEqual(
        list(csviter(response)),
        [{u'id': u'1', u'name': u'cp852', u'value': u'test'},
         {u'id': u'2', u'name': u'something',
          u'value': u'\u255a\u2569\u2569\u2569\u2550\u2550\u2557'}])
def test_csviter_falserow(self):
    """Appended rows with mismatched column counts do not appear in the output."""
    base = get_testdata('feeds', 'feed-sample3.csv')
    body = b'\n'.join((base, b'a,b', b'a,b,c,d'))
    response = TextResponse(url="http://example.com/", body=body)
    expected = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]
    self.assertEqual(list(csviter(response)), expected)
def test_csviter_headers(self):
    """Explicit headers are used instead of the first CSV row.

    Fix: ``get_testdata`` returns bytes, so ``splitlines()`` yields bytes
    rows; the original split/joined them with str arguments (``','`` and
    ``'\\n'``), which raises TypeError on Python 3.  Use bytes separators and
    decode the headers to text, matching the sibling bytes-based variant of
    this test.
    """
    sample = get_testdata('feeds', 'feed-sample3.csv').splitlines()
    headers, body = sample[0].split(b','), b'\n'.join(sample[1:])
    response = TextResponse(url="http://example.com/", body=body)
    csv = csviter(response, headers=[h.decode('utf-8') for h in headers])
    self.assertEqual([row for row in csv], [{
        u'id': u'1',
        u'name': u'alpha',
        u'value': u'foobar'
    }, {
        u'id': u'2',
        u'name': u'unicode',
        u'value': u'\xfan\xedc\xf3d\xe9\u203d'
    }, {
        u'id': u'3',
        u'name': u'multi',
        u'value': u'foo\nbar'
    }, {
        u'id': u'4',
        u'name': u'empty',
        u'value': u''
    }])
def test_umicrodata(self):
    """Uniform (uniform=True) microdata extraction of a product page yields
    schema.org-style JSON-LD with an @context/@type wrapper."""
    # Expected uniform output: microdata itemprops mapped onto a JSON-LD-like
    # structure with nested AggregateRating and Offer items.
    expected = [{
        "@context": "http://schema.org",
        "@type": "Product",
        "brand": "ACME",
        "name": "Executive Anvil",
        "image": "anvil_executive.jpg",
        "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
        "mpn": "925872",
        "aggregateRating": {
            "@type": "AggregateRating",
            "ratingValue": "4.4",
            "reviewCount": "89"
        },
        "offers": {
            "@type": "Offer",
            "priceCurrency": "USD",
            "price": "119.99",
            "priceValidUntil": "2020-11-05",
            "seller": {
                "@type": "Organization",
                "name": "Executive Objects"
            },
            "itemCondition": "http://schema.org/UsedCondition",
            "availability": "http://schema.org/InStock"
        }
    }]
    body = get_testdata('misc', 'product_microdata.html')
    # Restrict extraction to the microdata syntax only.
    data = extruct.extract(body, syntaxes=['microdata'], uniform=True)
    self.assertEqual(data['microdata'], expected)
def test_uopengraph(self):
    """Uniform (uniform=True) Open Graph extraction keeps prefixed og:/fb:
    property names and collects namespace prefixes into @context."""
    expected = [{
        "@context": {
            "og": "http://ogp.me/ns#",
            "fb": "http://www.facebook.com/2008/fbml",
            "concerts": "http://ogp.me/ns/fb/songkick-concerts#"
        },
        "fb:app_id": "308540029359",
        "og:site_name": "Songkick",
        "@type": "songkick-concerts:artist",
        "og:title": "Elysian Fields",
        "og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.",
        "og:url": "http://www.songkick.com/artists/236156-elysian-fields",
        "og:image": "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg",
    }]
    body = get_testdata('songkick', 'elysianfields.html')
    # Restrict extraction to the opengraph syntax only.
    data = extruct.extract(body, syntaxes=['opengraph'], uniform=True)
    self.assertEqual(data['opengraph'], expected)
def test_csviter_defaults(self):
    """Default csviter settings parse the sample feed; keys/values are text."""
    body = get_testdata('feeds', 'feed-sample3.csv')
    response = TextResponse(url="http://example.com/", body=body)
    result = list(csviter(response))
    expected = [
        {'id': '1', 'name': 'alpha', 'value': 'foobar'},
        {'id': '2', 'name': 'unicode', 'value': '\xfan\xedc\xf3d\xe9\u203d'},
        {'id': '3', 'name': 'multi', 'value': "foo\nbar"},
        {'id': '4', 'name': 'empty', 'value': ''},
    ]
    self.assertEqual(result, expected)
    # explicit type check cuz' we no like stinkin' autocasting! yarrr
    for row in result:
        for key, value in row.items():
            self.assertTrue(isinstance(key, str))
            self.assertTrue(isinstance(value, str))
def test_csviter_headers(self):
    """Explicitly supplied headers replace the first CSV row."""
    sample = get_testdata('feeds', 'feed-sample3.csv').splitlines()
    header_cells = sample[0].split(b',')
    body = b'\n'.join(sample[1:])
    response = TextResponse(url="http://example.com/", body=body)
    rows = csviter(response,
                   headers=[cell.decode('utf-8') for cell in header_cells])
    expected = [
        {'id': '1', 'name': 'alpha', 'value': 'foobar'},
        {'id': '2', 'name': 'unicode', 'value': '\xfan\xedc\xf3d\xe9\u203d'},
        {'id': '3', 'name': 'multi', 'value': 'foo\nbar'},
        {'id': '4', 'name': 'empty', 'value': ''},
    ]
    self.assertEqual(list(rows), expected)
def test_w3c_rdfaprimer(self):
    """RDFa extraction of W3C RDFa Primer examples against expanded JSON-LD."""
    for idx in [5, 6, 7, 8, 9, 10, 11, 15]:
        fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(idx)
        print(fileprefix)
        html = get_testdata('w3crdfa', fileprefix + '.html').decode('UTF-8')
        want = json.loads(
            get_testdata('w3crdfa',
                         fileprefix + '.expanded.json').decode('UTF-8'))
        extractor = RDFaExtractor()
        got = extractor.extract(html, url='http://www.example.com/index.html')
        # diagnostic dumps for debugging mismatches
        print("extracted:\n%s" % pformat(tupleize(got)))
        print("expected:\n%s" % pformat(tupleize(want)))
        print("extracted:\n%s" % self.prettify(got))
        print("expected:\n%s" % self.prettify(want))
        self.assertJsonLDEqual(got, want)
def test_csviter_quotechar(self):
    """A custom quotechar works with both default and custom delimiters."""
    expected = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]

    body1 = get_testdata('feeds', 'feed-sample6.csv')
    response1 = TextResponse(url="http://example.com/", body=body1)
    self.assertEqual(list(csviter(response1, quotechar="'")), expected)

    # same data, but pipe-delimited
    body2 = get_testdata('feeds', 'feed-sample6.csv').replace(b',', b'|')
    response2 = TextResponse(url="http://example.com/", body=body2)
    self.assertEqual(
        list(csviter(response2, delimiter="|", quotechar="'")), expected)
def test_w3c_rdfaprimer(self):
    """RDFa extraction of W3C RDFa Primer examples against expanded JSON-LD."""
    for idx in [5, 6, 7, 8, 9, 10, 11, 15]:
        prefix = 'w3c.rdfaprimer.example{:03d}'.format(idx)
        print(prefix)
        html = get_testdata('w3crdfa', prefix + '.html').decode('UTF-8')
        want = json.loads(
            get_testdata('w3crdfa', prefix + '.expanded.json').decode('UTF-8'))
        extractor = RDFaExtractor()
        got = extractor.extract(html, url='http://www.example.com/index.html')
        # diagnostic dumps for debugging mismatches
        print("extracted:\n%s" % pformat(tupleize(got)))
        print("expected:\n%s" % pformat(tupleize(want)))
        print("extracted:\n%s" % self.prettify(got))
        print("expected:\n%s" % self.prettify(want))
        self.assertJsonLDEqual(got, want)
def test_jsonld_with_comments(self):
    """JSON-LD extraction tolerates comments in otherwise-invalid markup,
    for both the schema.org and the custom invalid fixtures."""
    for directory in ['schema.org.invalid', 'custom.invalid']:
        for prefix in ['JoinAction.001', 'AllocateAction.001']:
            html = get_testdata(directory, '{}.html'.format(prefix))
            want = json.loads(
                get_testdata(directory,
                             '{}.jsonld'.format(prefix)).decode('UTF-8'))
            extractor = JsonLdExtractor()
            self.assertEqual(extractor.extract(html), want)
def test_udublincore(self):
    """Uniform (uniform=True) Dublin Core extraction groups <meta>/<link>
    metadata into 'elements' (DC 1.1) and 'terms' (DCTERMS) lists with a
    namespace @context."""
    expected = [{
        # DC 1.1 element set from <meta name="DC.*"> tags
        'elements': [{
            'name': 'DC.title',
            'lang': 'en',
            'content': 'Expressing Dublin Core\nin HTML/XHTML meta and link elements',
            'URI': 'http://purl.org/dc/elements/1.1/title'
        }, {
            'name': 'DC.creator',
            'content': 'Andy Powell, UKOLN, University of Bath',
            'URI': 'http://purl.org/dc/elements/1.1/creator'
        }, {
            'name': 'DC.identifier',
            'scheme': 'DCTERMS.URI',
            'content': 'http://dublincore.org/documents/dcq-html/',
            'URI': 'http://purl.org/dc/elements/1.1/identifier'
        }, {
            'name': 'DC.format',
            'scheme': 'DCTERMS.IMT',
            'content': 'text/html',
            'URI': 'http://purl.org/dc/elements/1.1/format'
        }],
        # DCTERMS refinements; the last entry comes from a <link rel="..."> tag
        'terms': [{
            'name': 'DCTERMS.issued',
            'scheme': 'DCTERMS.W3CDTF',
            'content': '2003-11-01',
            'URI': 'http://purl.org/dc/terms/issued'
        }, {
            'name': 'DCTERMS.abstract',
            'content': 'This document describes how\nqualified Dublin Core metadata can be encoded\nin HTML/XHTML <meta> elements',
            'URI': 'http://purl.org/dc/terms/abstract'
        }, {
            'name': 'DC.Date.modified',
            'content': '2001-07-18',
            'URI': 'http://purl.org/dc/terms/modified'
        }, {
            'name': 'DCTERMS.modified',
            'content': '2001-07-18',
            'URI': 'http://purl.org/dc/terms/modified'
        }, {
            'rel': 'DCTERMS.replaces',
            'hreflang': 'en',
            'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/',
            'URI': 'http://purl.org/dc/terms/replaces'
        }],
        '@context': {
            'DC': 'http://purl.org/dc/elements/1.1/',
            'DCTERMS': 'http://purl.org/dc/terms/'
        },
        '@type': 'Text'
    }]
    body = get_testdata('misc', 'dublincore_test.html')
    # Restrict extraction to the dublincore syntax only.
    data = extruct.extract(body, syntaxes=['dublincore'], uniform=True)
    self.assertEqual(data['dublincore'], expected)
def test_extraction_encoding(self):
    """Link extraction honors the document encoding for link text, while the
    URL path component is always UTF-8 percent-encoded (only the query part
    follows the document encoding).

    Fix: replaced Python-2-only ``'…'.decode(...)`` calls on str literals with
    the equivalent unicode literals so the test is valid on Python 3 too.
    """
    body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8', body=body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
    body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)
    lx = BaseSgmlLinkExtractor()
    # u'\u20ac' (euro sign) == b'\xe2\x82\xac'.decode('utf-8')
    self.assertEqual(lx.extract_links(response_utf8), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text=u'sample \u20ac text'),
    ])
    self.assertEqual(lx.extract_links(response_noenc), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text=u'sample \u20ac text'),
    ])
    # document encoding does not affect URL path component, only query part
    # >>> u'sample_ñ.html'.encode('utf8')
    # b'sample_\xc3\xb1.html'
    # >>> u"sample_á.html".encode('utf8')
    # b'sample_\xc3\xa1.html'
    # >>> u"sample_ö.html".encode('utf8')
    # b'sample_\xc3\xb6.html'
    # >>> u"£32".encode('latin1')
    # b'\xa332'
    # >>> u"µ".encode('latin1')
    # b'\xb5'
    self.assertEqual(lx.extract_links(response_latin1), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%C3%A1.html',
             text=u'sample \xe1 text'),
        Link(url='http://example.com/sample_%C3%B6.html?price=%A332&%B5=unit',
             text=''),
    ])
def test_csviter_delimiter(self):
    """Tab-separated input parses correctly with an explicit delimiter."""
    body = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t')
    response = TextResponse(url="http://example.com/", body=body)
    rows = csviter(response, delimiter='\t')
    expected = [
        {'id': '1', 'name': 'alpha', 'value': 'foobar'},
        {'id': '2', 'name': 'unicode', 'value': '\xfan\xedc\xf3d\xe9\u203d'},
        {'id': '3', 'name': 'multi', 'value': "foo\nbar"},
        {'id': '4', 'name': 'empty', 'value': ''},
    ]
    self.assertEqual(list(rows), expected)
def test_csviter_encoding(self):
    """csviter decodes rows using the response's declared encoding."""
    latin1_body = get_testdata("feeds", "feed-sample4.csv")
    cp852_body = get_testdata("feeds", "feed-sample5.csv")

    response = TextResponse(url="http://example.com/", body=latin1_body,
                            encoding="latin1")
    self.assertEqual(
        list(csviter(response)),
        [{"id": "1", "name": "latin1", "value": "test"},
         {"id": "2", "name": "something", "value": "\xf1\xe1\xe9\xf3"}])

    response = TextResponse(url="http://example.com/", body=cp852_body,
                            encoding="cp852")
    self.assertEqual(
        list(csviter(response)),
        [{"id": "1", "name": "cp852", "value": "test"},
         {"id": "2", "name": "something",
          "value": "\u255a\u2569\u2569\u2569\u2550\u2550\u2557"}])
def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
    """A plain (non-text) Response body is decoded as UTF-8 by default."""
    body = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t')
    response = Response(url="http://example.com/", body=body)
    rows = csviter(response, delimiter='\t')
    expected = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]
    self.assertEqual(list(rows), expected)
def test_csviter_wrong_quotechar(self):
    """Without quotechar="'" the single quotes are kept as literal data."""
    body = get_testdata('feeds', 'feed-sample6.csv')
    response = TextResponse(url="http://example.com/", body=body)
    expected = [
        {u"'id'": u"1", u"'name'": u"'alpha'", u"'value'": u"'foobar'"},
        {u"'id'": u"2", u"'name'": u"'unicode'",
         u"'value'": u"'\xfan\xedc\xf3d\xe9\u203d'"},
        # the quoted multi-line value is split at the embedded newline
        {u"'id'": u"'3'", u"'name'": u"'multi'", u"'value'": u"'foo"},
        {u"'id'": u"4", u"'name'": u"'empty'", u"'value'": u""},
    ]
    self.assertEqual(list(csviter(response)), expected)
def test_csviter_exception(self):
    """The iterator raises StopIteration once the four data rows are consumed."""
    body = get_testdata("feeds", "feed-sample3.csv")
    response = TextResponse(url="http://example.com/", body=body)
    rows = csviter(response)  # renamed from 'iter' to avoid shadowing the builtin
    for _ in range(4):
        next(rows)
    self.assertRaises(StopIteration, next, rows)
def test_csviter_exception(self):
    """The iterator raises StopIteration once the four data rows are consumed."""
    body = get_testdata('feeds', 'feed-sample3.csv')
    response = TextResponse(url="http://example.com/", body=body)
    rows = csviter(response)  # renamed from 'iter' to avoid shadowing the builtin
    for _ in range(4):
        next(rows)
    self.assertRaises(StopIteration, next, rows)
def _assert_bytes_received(self):
    """Check the payload collected per request by the bytes_received signal:
    exactly nine requests were tracked, and for each known path the joined
    chunks reconstruct the expected response body."""
    self.assertEqual(9, len(self.run.bytes))
    # self.run.bytes maps each Request to the list of byte chunks received.
    for request, data in self.run.bytes.items():
        joined_data = b"".join(data)
        if self.run.getpath(request.url) == "/":
            self.assertEqual(joined_data, get_testdata("test_site", "index.html"))
        elif self.run.getpath(request.url) == "/item1.html":
            self.assertEqual(joined_data, get_testdata("test_site", "item1.html"))
        elif self.run.getpath(request.url) == "/item2.html":
            self.assertEqual(joined_data, get_testdata("test_site", "item2.html"))
        elif self.run.getpath(request.url) == "/redirected":
            self.assertEqual(joined_data, b"Redirected here")
        elif self.run.getpath(request.url) == '/redirect':
            # body of the meta-refresh redirect page itself
            self.assertEqual(
                joined_data,
                b"\n<html>\n"
                b"  <head>\n"
                b"    <meta http-equiv=\"refresh\" content=\"0;URL=/redirected\">\n"
                b"  </head>\n"
                b"  <body bgcolor=\"#FFFFFF\" text=\"#000000\">\n"
                b"    <a href=\"/redirected\">click here</a>\n"
                b"  </body>\n"
                b"</html>\n"
            )
        elif self.run.getpath(request.url) == "/tem999.html":
            # 404 page served for the missing resource
            self.assertEqual(
                joined_data,
                b"\n<html>\n"
                b"  <head><title>404 - No Such Resource</title></head>\n"
                b"  <body>\n"
                b"    <h1>No Such Resource</h1>\n"
                b"    <p>File not found.</p>\n"
                b"  </body>\n"
                b"</html>\n"
            )
        elif self.run.getpath(request.url) == "/numbers":
            # signal was fired multiple times
            self.assertTrue(len(data) > 1)
            # bytes were received in order
            numbers = [str(x).encode("utf8") for x in range(2**14)]
            self.assertEqual(joined_data, b"".join(numbers))
def test_metadata_from_url_all_types(self, mock_get): expected = self.expected expected['url'] = self.url expected['status'] = '200 OK' mock_response = build_mock_response( url=self.url, content=get_testdata('songkick', 'tovestyrke.html'), ) mock_get.return_value = mock_response data = metadata_from_url(self.url) self.assertEqual(data, expected)
def test_w3c_rdfaprimer(self):
    """RDFa Primer examples extract correctly, even when _fix_order fails."""
    for idx in [5, 6, 7, 8, 9, 10, 11, 15]:
        prefix = 'w3c.rdfaprimer.example{:03d}'.format(idx)
        print(prefix)
        html = get_testdata('w3crdfa', prefix + '.html')
        want = json.loads(
            get_testdata('w3crdfa', prefix + '.expanded.json').decode('UTF-8'))
        extractor = RDFaExtractor()
        got = extractor.extract(html,
                                base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(got, want)

        # This is for testing that the fix to issue 116 does not affect
        # severely rdfa output even in a presence of a bug in the code
        def broken_fix_order(x, y, z):
            raise Exception()

        extractor._fix_order = broken_fix_order
        got = extractor.extract(html,
                                base_url='http://www.example.com/index.html')
        self.assertJsonLDEqual(got, want)
def test_csviter_falserow(self):
    """Appended rows with mismatched column counts do not appear in the output."""
    base = get_testdata('feeds', 'feed-sample3.csv')
    response = TextResponse(url="http://example.com/",
                            body=b'\n'.join((base, b'a,b', b'a,b,c,d')))
    expected = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]
    self.assertEqual(list(csviter(response)), expected)
def test_csviter_headers(self):
    """Explicitly supplied headers replace the first CSV row."""
    sample = get_testdata('feeds', 'feed-sample3.csv').splitlines()
    header_cells = sample[0].split(b',')
    body = b'\n'.join(sample[1:])
    response = TextResponse(url="http://example.com/", body=body)
    rows = csviter(response,
                   headers=[cell.decode('utf-8') for cell in header_cells])
    expected = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]
    self.assertEqual(list(rows), expected)
def test_csviter_quotechar(self):
    """A custom quotechar works with both default and custom delimiters."""
    expected = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]

    body1 = get_testdata('feeds', 'feed-sample6.csv')
    response1 = TextResponse(url="http://example.com/", body=body1)
    self.assertEqual(list(csviter(response1, quotechar="'")), expected)

    # same data, but pipe-delimited
    body2 = get_testdata('feeds', 'feed-sample6.csv').replace(b',', b'|')
    response2 = TextResponse(url="http://example.com/", body=body2)
    self.assertEqual(
        list(csviter(response2, delimiter="|", quotechar="'")), expected)
def test_extraction_encoding(self):
    """Link text/URLs must honor the response encoding (header-declared,
    default, and latin1-detected responses).

    Fix: the original used ``'…'.decode(...)`` on str literals, which is
    Python-2-only (``str`` has no ``decode`` in Python 3).  The decoded
    results are spelled as equivalent unicode literals instead.
    """
    body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8', body=body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
    body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)
    lx = BaseSgmlLinkExtractor()
    # u'\u20ac' (euro sign) == b'\xe2\x82\xac'.decode('utf-8')
    self.assertEqual(lx.extract_links(response_utf8), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text=u'sample \u20ac text'),
    ])
    self.assertEqual(lx.extract_links(response_noenc), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text=u'sample \u20ac text'),
    ])
    # u'\xe1' == b'\xe1'.decode('latin1')
    self.assertEqual(lx.extract_links(response_latin1), [
        Link(url='http://example.com/sample_%F1.html', text=''),
        Link(url='http://example.com/sample_%E1.html',
             text=u'sample \xe1 text'),
    ])
def test_main_all(self, mock_get): expected = self.expected expected['url'] = self.url expected['status'] = '200 OK' expected = json.dumps(expected, indent=2, sort_keys=True) mock_response = build_mock_response( url=self.url, content=get_testdata('songkick', 'tovestyrke.html'), ) mock_get.return_value = mock_response data = main([self.url]) self.assertEqual(data, expected)
def test_csviter_encoding(self):
    """csviter decodes rows using the response's declared encoding."""
    latin1_body = get_testdata('feeds', 'feed-sample4.csv')
    cp852_body = get_testdata('feeds', 'feed-sample5.csv')

    response = TextResponse(url="http://example.com/", body=latin1_body,
                            encoding='latin1')
    self.assertEqual(
        list(csviter(response)),
        [{'id': '1', 'name': 'latin1', 'value': 'test'},
         {'id': '2', 'name': 'something', 'value': '\xf1\xe1\xe9\xf3'}])

    response = TextResponse(url="http://example.com/", body=cp852_body,
                            encoding='cp852')
    self.assertEqual(
        list(csviter(response)),
        [{'id': '1', 'name': 'cp852', 'value': 'test'},
         {'id': '2', 'name': 'something',
          'value': '\u255a\u2569\u2569\u2569\u2550\u2550\u2557'}])
def test_metadata_from_url_opengraph_only(self, mock_get): expected = { 'opengraph': self.expected['opengraph'], 'url': self.url, 'status': '200 OK', } mock_response = build_mock_response( url=self.url, content=get_testdata('songkick', 'tovestyrke.html'), ) mock_get.return_value = mock_response data = metadata_from_url(self.url, syntaxes=['opengraph']) self.assertEqual(jsonize_dict(data), expected)
def test_main_single_syntax(self, mock_get): expected = { 'opengraph': self.expected['opengraph'], 'url': self.url, 'status': '200 OK', } expected = json.dumps(expected, indent=2, sort_keys=True) mock_response = build_mock_response( url=self.url, content=get_testdata('songkick', 'tovestyrke.html'), ) mock_get.return_value = mock_response data = main([self.url, '--syntax', 'opengraph']) self.assertEqual(data, expected)
def test_csviter_defaults(self):
    """Default csviter settings parse the sample feed; keys/values are text."""
    body = get_testdata('feeds', 'feed-sample3.csv')
    response = TextResponse(url="http://example.com/", body=body)
    result = list(csviter(response))
    expected = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode',
         u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]
    self.assertEqual(result, expected)
    # explicit type check cuz' we no like stinkin' autocasting! yarrr
    for row in result:
        for key, value in row.items():
            self.assertTrue(isinstance(key, six.text_type))
            self.assertTrue(isinstance(value, six.text_type))
def test_parse_rows(self):
    """parse_rows yields one dict per CSV row using the spider's dialect."""
    response = Response(
        "http://example.org/dummy.csv",
        body=get_testdata('feeds', 'feed-sample6.csv'),
    )

    class _CrawlSpider(self.spider_class):
        name = "test"
        delimiter = ","
        quotechar = "'"

        def parse_row(self, response, row):
            # identity: surface each parsed row unchanged
            return row

    rows = list(_CrawlSpider().parse_rows(response))
    assert rows[0] == {'id': '1', 'name': 'alpha', 'value': 'foobar'}
    assert len(rows) == 4
def test_metadata_from_url_rdfa_only(self, mock_get): expected = { 'rdfa': self.expected['rdfa'], 'url': self.url, 'status': '200 OK', } mock_response = build_mock_response( url=self.url, content=get_testdata('songkick', 'tovestyrke.html'), ) mock_get.return_value = mock_response data = metadata_from_url(self.url, microdata=False, jsonld=False, rdfa=True) self.assertEqual(data, expected)
def test_umicroformat(self):
    """Uniform (uniform=True) microformats2 extraction adds an @context and
    renames 'type'/'properties' into @type and flattened property lists."""
    expected = [{
        '@context': 'http://microformats.org/wiki/',
        '@type': ['h-hidden-phone', 'h-hidden-tablet'],
        'name': ['']
    }, {
        '@context': 'http://microformats.org/wiki/',
        '@type': ['h-hidden-phone'],
        # nested microformat items appear under 'children'
        'children': [{
            '@type': ['h-hidden-phone', 'h-hidden-tablet'],
            'name': ['']
        }, {
            '@type': ['h-hidden-phone'],
            'name': [
                'aJ Styles FastLane 2018 15 x '
                '17 Framed Plaque w/ Ring '
                'Canvas'
            ],
            'photo': [
                '/on/demandware.static/-/Sites-main/default/dwa3227ee6/images/small/CN1148.jpg'
            ]
        }],
    }, {
        '@context': 'http://microformats.org/wiki/',
        '@type': ['h-entry'],
        'author': [{
            '@type': ['h-card'],
            'name': ['W. Developer'],
            'url': ['http://example.com'],
            'value': 'W. Developer'
        }],
        'content': [{
            'html': '<p>Blah blah blah</p>',
            'value': 'Blah blah blah'
        }],
        'name': ['Microformats are amazing'],
        'published': ['2013-06-13 12:00:00'],
        'summary': ['In which I extoll the virtues of using '
                    'microformats.']
    }]
    body = get_testdata('misc', 'microformat_test.html')
    # Restrict extraction to the microformat syntax only.
    data = extruct.extract(body, syntaxes=['microformat'], uniform=True)
    self.assertEqual(data['microformat'], expected)
def test_csviter_defaults(self):
    """Default csviter settings parse the sample feed; keys/values are text."""
    body = get_testdata("feeds", "feed-sample3.csv")
    response = TextResponse(url="http://example.com/", body=body)
    result = list(csviter(response))
    expected = [
        {"id": "1", "name": "alpha", "value": "foobar"},
        {"id": "2", "name": "unicode", "value": "\xfan\xedc\xf3d\xe9\u203d"},
        {"id": "3", "name": "multi", "value": FOOBAR_NL},
        {"id": "4", "name": "empty", "value": ""},
    ]
    self.assertEqual(result, expected)
    # explicit type check cuz' we no like stinkin' autocasting! yarrr
    for row in result:
        for key, value in row.items():
            self.assertTrue(isinstance(key, str))
            self.assertTrue(isinstance(value, str))
def _links_response_no_href(self):
    """Build a response fixture from the page whose anchors lack href."""
    data = get_testdata('link_extractor', 'linkextractor_no_href.html')
    return self.response_class('http://example.com/index', body=data)
def setUp(self):
    """Load the SGML link-extractor fixture page into self.response."""
    data = get_testdata('link_extractor', 'sgml_linkextractor.html')
    self.response = HtmlResponse(url='http://example.com/index', body=data)
def _links_response(self):
    """Build a response fixture from the SGML link-extractor page."""
    data = get_testdata('link_extractor', 'sgml_linkextractor.html')
    return self.response_class('http://example.com/index', body=data)
def setUp(self):
    """Load the SGML link-extractor fixture page into self.response."""
    data = get_testdata("link_extractor", "sgml_linkextractor.html")
    self.response = HtmlResponse(url="http://example.com/index", body=data)