示例#1
0
    def test_w3c_7_1(self):
        """Compare microdata extracted from ``microdata.7.1.html`` with its JSON fixture.

        The extractor is built with ``strict=True`` and receives the page URL
        as the second argument to ``extract``.
        """
        html = get_testdata("w3c", "microdata.7.1.html")
        reference_text = get_testdata("w3c", "microdata.7.1.json").decode("UTF-8")
        reference = json.loads(reference_text)

        extractor = MicrodataExtractor(strict=True)
        extracted = extractor.extract(html, "http://blog.example.com/progress-report")
        self.assertDictEqual(extracted, reference)
示例#2
0
    def test_w3c_5_2(self):
        """Compare extraction of ``microdata.5.2.html`` with the ``.flat.json`` fixture.

        The extractor is built with ``nested=False`` and ``strict=True``.
        """
        html = get_testdata('w3c', 'microdata.5.2.html')
        reference_text = get_testdata('w3c', 'microdata.5.2.flat.json').decode('UTF-8')
        reference = json.loads(reference_text)

        extractor = MicrodataExtractor(nested=False, strict=True)
        extracted = extractor.extract(html)
        self.assertDictEqual(extracted, reference)
示例#3
0
    def test_w3c_object_element(self):
        """Compare extraction of ``microdata.object.html`` with its JSON fixture.

        The extractor is built with ``strict=True`` and receives the page URL
        as the second argument to ``extract``.
        """
        html = get_testdata('w3c', 'microdata.object.html')
        reference_text = get_testdata('w3c', 'microdata.object.json').decode('UTF-8')
        reference = json.loads(reference_text)

        extractor = MicrodataExtractor(strict=True)
        extracted = extractor.extract(html, 'http://www.example.com/microdata/test')
        self.assertDictEqual(extracted, reference)
示例#4
0
    def test_w3c_data_element(self):
        """Compare extraction of ``microdata.4.2.data.html`` with its JSON fixture.

        The extractor is built with ``strict=True``; no URL is passed.
        """
        html = get_testdata('w3c', 'microdata.4.2.data.html')
        reference_text = get_testdata('w3c', 'microdata.4.2.data.json').decode('UTF-8')
        reference = json.loads(reference_text)

        extractor = MicrodataExtractor(strict=True)
        extracted = extractor.extract(html)
        self.assertDictEqual(extracted, reference)
    def test_extraction_encoding(self):
        """Compare links from ``BaseSgmlLinkExtractor`` for three differently built responses.

        The ``linkextractor_noenc.html`` body is wrapped twice, once with a
        ``charset=utf-8`` Content-Type header and once without headers; the
        ``linkextractor_latin1.html`` body is wrapped without headers.
        """
        noenc_body = get_testdata("link_extractor", "linkextractor_noenc.html")
        response_utf8 = HtmlResponse(
            url="http://example.com/utf8",
            body=noenc_body,
            headers={"Content-Type": ["text/html; charset=utf-8"]},
        )
        response_noenc = HtmlResponse(url="http://example.com/noenc", body=noenc_body)
        latin1_body = get_testdata("link_extractor", "linkextractor_latin1.html")
        response_latin1 = HtmlResponse(url="http://example.com/latin1", body=latin1_body)

        extractor = BaseSgmlLinkExtractor()

        # Same body with an explicit UTF-8 header ...
        utf8_links = extractor.extract_links(response_utf8)
        self.assertEqual(
            utf8_links,
            [
                Link(url="http://example.com/sample_%C3%B1.html", text=""),
                Link(url="http://example.com/sample_%E2%82%AC.html", text="sample \xe2\x82\xac text".decode("utf-8")),
            ],
        )

        # ... and without any header: the expected links are the same.
        noenc_links = extractor.extract_links(response_noenc)
        self.assertEqual(
            noenc_links,
            [
                Link(url="http://example.com/sample_%C3%B1.html", text=""),
                Link(url="http://example.com/sample_%E2%82%AC.html", text="sample \xe2\x82\xac text".decode("utf-8")),
            ],
        )

        latin1_links = extractor.extract_links(response_latin1)
        self.assertEqual(
            latin1_links,
            [
                Link(url="http://example.com/sample_%F1.html", text=""),
                Link(url="http://example.com/sample_%E1.html", text="sample \xe1 text".decode("latin1")),
            ],
        )
示例#6
0
    def test_w3c_5_2(self):
        """Compare extraction of ``microdata.5.2.html`` with the ``.withtext.json`` fixture.

        The extractor is built with ``add_text_content=True``.
        """
        html = get_testdata('w3c', 'microdata.5.2.html')
        reference_text = get_testdata('w3c', 'microdata.5.2.withtext.json').decode('UTF-8')
        reference = json.loads(reference_text)

        extractor = MicrodataExtractor(add_text_content=True)
        extracted = extractor.extract(html)
        self.assertDictEqual(extracted, reference)
示例#7
0
    def test_w3c_7_1(self):
        """Compare microdata extracted from ``microdata.7.1.html`` with its JSON fixture.

        The extractor is built with ``strict=True`` and receives the page URL
        as the second argument to ``extract``.
        """
        html = get_testdata('w3c', 'microdata.7.1.html')
        reference_text = get_testdata('w3c', 'microdata.7.1.json').decode('UTF-8')
        reference = json.loads(reference_text)

        extractor = MicrodataExtractor(strict=True)
        extracted = extractor.extract(html, 'http://blog.example.com/progress-report')
        self.assertDictEqual(extracted, reference)
    def test_extraction_encoding(self):
        """Compare links from ``BaseSgmlLinkExtractor`` for three differently built responses.

        The ``linkextractor_noenc.html`` body is wrapped twice, once with a
        ``charset=utf-8`` Content-Type header and once without headers; the
        ``linkextractor_latin1.html`` body is wrapped without headers.
        """
        noenc_body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        utf8_headers = {'Content-Type': ['text/html; charset=utf-8']}
        response_utf8 = HtmlResponse(url='http://example.com/utf8', body=noenc_body, headers=utf8_headers)
        response_noenc = HtmlResponse(url='http://example.com/noenc', body=noenc_body)
        latin1_body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1', body=latin1_body)

        extractor = BaseSgmlLinkExtractor()

        utf8_links = extractor.extract_links(response_utf8)
        self.assertEqual(utf8_links, [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        noenc_links = extractor.extract_links(response_noenc)
        self.assertEqual(noenc_links, [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        # document encoding does not affect URL path component, only query part
        # >>> u'sample_ñ.html'.encode('utf8')
        # b'sample_\xc3\xb1.html'
        # >>> u"sample_á.html".encode('utf8')
        # b'sample_\xc3\xa1.html'
        # >>> u"sample_ö.html".encode('utf8')
        # b'sample_\xc3\xb6.html'
        # >>> u"£32".encode('latin1')
        # b'\xa332'
        # >>> u"µ".encode('latin1')
        # b'\xb5'
        latin1_links = extractor.extract_links(response_latin1)
        self.assertEqual(latin1_links, [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%C3%A1.html', text='sample \xe1 text'.decode('latin1')),
            Link(url='http://example.com/sample_%C3%B6.html?price=%A332&%B5=unit', text=''),
        ])
示例#9
0
    def test_w3c_5_5(self):
        """Compare extraction of ``microdata.5.5.html`` with its JSON fixture.

        The extractor is built with ``strict=True``; no URL is passed.
        """
        html = get_testdata("w3c", "microdata.5.5.html")
        reference_text = get_testdata("w3c", "microdata.5.5.json").decode("UTF-8")
        reference = json.loads(reference_text)

        extractor = MicrodataExtractor(strict=True)
        extracted = extractor.extract(html)
        self.assertDictEqual(extracted, reference)
def _test_data(formats):
    """Load the uncompressed sample feed plus one response per requested format.

    ``formats`` is an iterable of strings; each one is appended to
    ``'feed-sample1.'`` to pick a fixture file in the ``compressed`` folder.

    Returns a ``(uncompressed_body, test_responses)`` tuple, where
    ``uncompressed_body`` is the content of ``feed-sample1.xml`` and
    ``test_responses`` maps each format to a ``Response`` for
    ``http://foo.com/bar`` whose body is the matching fixture.
    """
    uncompressed_body = get_testdata('compressed', 'feed-sample1.xml')
    # The loop variable is ``fmt`` so the builtin ``format`` is not shadowed.
    test_responses = {
        fmt: Response(
            'http://foo.com/bar',
            body=get_testdata('compressed', 'feed-sample1.' + fmt),
        )
        for fmt in formats
    }
    return uncompressed_body, test_responses
示例#11
0
    def test_schemaorg_MusicRecording(self):
        """Compare microdata extraction with the JSON fixture for each listed ``MusicRecording`` sample."""
        for index in (1,):
            html_name = 'MusicRecording.{:03d}.html'.format(index)
            json_name = 'MusicRecording.{:03d}.json'.format(index)
            html = get_testdata('schema.org', html_name)
            reference = json.loads(get_testdata('schema.org', json_name).decode('UTF-8'))

            extracted = MicrodataExtractor().extract(html)
            self.assertDictEqual(extracted, reference)
示例#12
0
    def test_schemaorg_CreativeWork(self):
        """Compare JSON-LD extraction with the ``.jsonld`` fixture for each listed ``CreativeWork`` sample."""
        for index in (1,):
            html_name = 'CreativeWork.{:03d}.html'.format(index)
            jsonld_name = 'CreativeWork.{:03d}.jsonld'.format(index)
            html = get_testdata('schema.org', html_name)
            reference = json.loads(get_testdata('schema.org', jsonld_name).decode('UTF-8'))

            extracted = JsonLdExtractor().extract(html)
            self.assertDictEqual(extracted, reference)
示例#13
0
    def test_schemaorg_Event(self):
        """Compare microdata extraction with the JSON fixture for each listed ``Event`` sample."""
        for index in (1, 2, 3, 4, 8):
            html_name = "Event.{:03d}.html".format(index)
            json_name = "Event.{:03d}.json".format(index)
            html = get_testdata("schema.org", html_name)
            reference = json.loads(get_testdata("schema.org", json_name).decode("UTF-8"))

            extracted = MicrodataExtractor().extract(html)
            self.assertDictEqual(extracted, reference)
示例#14
0
    def test_wikipedia_xhtml_rdfa(self):
        """Compare RDFa extracted from the wikipedia ``xhtml+rdfa`` page with its expanded JSON fixture."""
        fileprefix = 'xhtml+rdfa'
        html = get_testdata('wikipedia', fileprefix + '.html').decode('UTF-8')
        reference_text = get_testdata('wikipedia', fileprefix + '.expanded.json').decode('UTF-8')
        reference = json.loads(reference_text)

        extractor = RDFaExtractor()
        # NOTE(review): 'exaple.com' looks like a typo for 'example.com'; left
        # unchanged because the expected fixture may depend on this URL.
        extracted = extractor.extract(html, url='http://www.exaple.com/index.html')

        self.assertJsonLDEqual(extracted, reference)
示例#15
0
    def test_w3c_rdf11primer(self):
        """Compare RDFa extraction with the expanded JSON fixture for each listed RDF 1.1 primer example."""
        for index in (14,):
            fileprefix = 'w3c.rdf11primer.example{:03d}'.format(index)
            html = get_testdata('w3crdfa', fileprefix + '.html').decode('UTF-8')
            reference_text = get_testdata('w3crdfa', fileprefix + '.expanded.json').decode('UTF-8')
            reference = json.loads(reference_text)

            extractor = RDFaExtractor()
            # NOTE(review): 'exaple.com' looks like a typo for 'example.com'; left
            # unchanged because the expected fixture may depend on this URL.
            extracted = extractor.extract(html, url='http://www.exaple.com/index.html')
            self.assertJsonLDEqual(extracted, reference)
示例#16
0
    def test_songkick(self):
        """Compare JSON-LD extracted from each listed songkick page with its ``.jsonld`` fixture."""
        pages = [
            "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
            # Pages below are currently disabled:
            #"Maxïmo Park Gigography, Tour History & Past Concerts",
            #"Years & Years Tickets, Tour Dates 2015 & Concerts",
        ]
        for page in pages:
            html = get_testdata('songkick', '{}.html'.format(page))
            reference_text = get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8')
            reference = json.loads(reference_text)

            extracted = JsonLdExtractor().extract(html)
            self.assertDictEqual(extracted, reference)
示例#17
0
    def test_csviter_encoding(self):
        """Iterate two CSV fixtures through responses declared as ``latin1`` and ``cp852``.

        ``feed-sample4.csv`` is wrapped with ``encoding='latin1'`` and
        ``feed-sample5.csv`` with ``encoding='cp852'``; each is expected to
        yield two rows of unicode keys and values.
        """
        latin1_body = get_testdata('feeds', 'feed-sample4.csv')
        cp852_body = get_testdata('feeds', 'feed-sample5.csv')

        latin1_response = TextResponse(url="http://example.com/", body=latin1_body, encoding='latin1')
        latin1_rows = list(csviter(latin1_response))
        self.assertEqual(latin1_rows, [
            {u'id': u'1', u'name': u'latin1', u'value': u'test'},
            {u'id': u'2', u'name': u'something', u'value': u'\xf1\xe1\xe9\xf3'},
        ])

        cp852_response = TextResponse(url="http://example.com/", body=cp852_body, encoding='cp852')
        cp852_rows = list(csviter(cp852_response))
        self.assertEqual(cp852_rows, [
            {u'id': u'1', u'name': u'cp852', u'value': u'test'},
            {u'id': u'2', u'name': u'something', u'value': u'\u255a\u2569\u2569\u2569\u2550\u2550\u2557'},
        ])
示例#18
0
    def test_csviter_falserow(self):
        """Append two extra lines to ``feed-sample3.csv`` and check the rows ``csviter`` yields.

        The appended lines are ``a,b`` and ``a,b,c,d``; the expected result
        contains only the four three-field rows listed below.
        """
        original = get_testdata('feeds', 'feed-sample3.csv')
        extended = b'\n'.join((original, b'a,b', b'a,b,c,d'))

        response = TextResponse(url="http://example.com/", body=extended)
        rows = list(csviter(response))

        self.assertEqual(rows, [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ])
示例#19
0
    def test_csviter_headers(self):
        """Pass the header row to ``csviter`` explicitly instead of leaving it in the body.

        The first line of ``feed-sample3.csv`` is split into header names and
        the remaining lines are re-joined to form the response body.
        """
        sample = get_testdata('feeds', 'feed-sample3.csv').splitlines()
        # The fixture is bytes, so the separators must be bytes too: on
        # Python 3, bytes.split(',') and '\n'.join(<bytes>) raise TypeError.
        headers, body = sample[0].split(b','), b'\n'.join(sample[1:])

        response = TextResponse(url="http://example.com/", body=body)
        # Header names are decoded so they are passed as text, not bytes.
        csv = csviter(response, headers=[h.decode('utf-8') for h in headers])

        self.assertEqual([row for row in csv],
                         [{
                             u'id': u'1',
                             u'name': u'alpha',
                             u'value': u'foobar'
                         }, {
                             u'id': u'2',
                             u'name': u'unicode',
                             u'value': u'\xfan\xedc\xf3d\xe9\u203d'
                         }, {
                             u'id': u'3',
                             u'name': u'multi',
                             u'value': u'foo\nbar'
                         }, {
                             u'id': u'4',
                             u'name': u'empty',
                             u'value': u''
                         }])
示例#20
0
 def test_umicrodata(self):
     """Check ``extruct.extract`` microdata output for ``product_microdata.html`` with ``uniform=True``."""
     rating = {
         "@type": "AggregateRating",
         "ratingValue": "4.4",
         "reviewCount": "89",
     }
     seller = {"@type": "Organization", "name": "Executive Objects"}
     offer = {
         "@type": "Offer",
         "priceCurrency": "USD",
         "price": "119.99",
         "priceValidUntil": "2020-11-05",
         "seller": seller,
         "itemCondition": "http://schema.org/UsedCondition",
         "availability": "http://schema.org/InStock",
     }
     product = {
         "@context": "http://schema.org",
         "@type": "Product",
         "brand": "ACME",
         "name": "Executive Anvil",
         "image": "anvil_executive.jpg",
         "description":
         "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
         "mpn": "925872",
         "aggregateRating": rating,
         "offers": offer,
     }
     expected = [product]

     html = get_testdata('misc', 'product_microdata.html')
     extracted = extruct.extract(html, syntaxes=['microdata'], uniform=True)
     self.assertEqual(extracted['microdata'], expected)
示例#21
0
 def test_uopengraph(self):
     """Check ``extruct.extract`` opengraph output for ``elysianfields.html`` with ``uniform=True``."""
     namespaces = {
         "og": "http://ogp.me/ns#",
         "fb": "http://www.facebook.com/2008/fbml",
         "concerts": "http://ogp.me/ns/fb/songkick-concerts#",
     }
     page_properties = {
         "@context": namespaces,
         "fb:app_id": "308540029359",
         "og:site_name": "Songkick",
         "@type": "songkick-concerts:artist",
         "og:title": "Elysian Fields",
         "og:description":
         "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.",
         "og:url": "http://www.songkick.com/artists/236156-elysian-fields",
         "og:image": "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg",
     }
     expected = [page_properties]

     html = get_testdata('songkick', 'elysianfields.html')
     extracted = extruct.extract(html, syntaxes=['opengraph'], uniform=True)
     self.assertEqual(extracted['opengraph'], expected)
示例#22
0
    def test_csviter_defaults(self):
        """Iterate ``feed-sample3.csv`` with default ``csviter`` arguments.

        Checks the four expected rows, then checks that every key and value
        of every row is a ``str`` instance.
        """
        raw = get_testdata('feeds', 'feed-sample3.csv')
        response = TextResponse(url="http://example.com/", body=raw)

        result = list(csviter(response))
        self.assertEqual(result, [
            {'id': '1', 'name': 'alpha', 'value': 'foobar'},
            {'id': '2', 'name': 'unicode', 'value': '\xfan\xedc\xf3d\xe9\u203d'},
            {'id': '3', 'name': 'multi', 'value': "foo\nbar"},
            {'id': '4', 'name': 'empty', 'value': ''},
        ])

        # explicit type check cuz' we no like stinkin' autocasting! yarrr
        for result_row in result:
            keys_are_str = all(isinstance(key, str) for key in result_row)
            self.assertTrue(keys_are_str)
            values_are_str = all(isinstance(value, str) for value in result_row.values())
            self.assertTrue(values_are_str)
示例#23
0
    def test_csviter_headers(self):
        """Pass the header row to ``csviter`` explicitly instead of leaving it in the body.

        The first line of ``feed-sample3.csv`` is split into header names
        (decoded as UTF-8) and the remaining lines form the response body.
        """
        lines = get_testdata('feeds', 'feed-sample3.csv').splitlines()
        header_fields = lines[0].split(b',')
        payload = b'\n'.join(lines[1:])

        response = TextResponse(url="http://example.com/", body=payload)
        header_names = [field.decode('utf-8') for field in header_fields]
        rows = list(csviter(response, headers=header_names))

        self.assertEqual(rows, [
            {'id': '1', 'name': 'alpha', 'value': 'foobar'},
            {'id': '2', 'name': 'unicode', 'value': '\xfan\xedc\xf3d\xe9\u203d'},
            {'id': '3', 'name': 'multi', 'value': 'foo\nbar'},
            {'id': '4', 'name': 'empty', 'value': ''},
        ])
示例#24
0
    def test_w3c_rdfaprimer(self):
        """Compare RDFa extraction with the expanded JSON fixture for each listed RDFa primer example.

        The fixture prefix and both renderings of the extracted and expected
        data are printed before the comparison.
        """
        for index in (5, 6, 7, 8, 9, 10, 11, 15):
            fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(index)
            print(fileprefix)
            html = get_testdata('w3crdfa', fileprefix + '.html').decode('UTF-8')
            reference_text = get_testdata('w3crdfa', fileprefix + '.expanded.json').decode('UTF-8')
            reference = json.loads(reference_text)

            extractor = RDFaExtractor()
            extracted = extractor.extract(html, url='http://www.example.com/index.html')
            print("extracted:\n%s" % pformat(tupleize(extracted)))
            print("expected:\n%s" % pformat(tupleize(reference)))
            print("extracted:\n%s" % self.prettify(extracted))
            print("expected:\n%s" % self.prettify(reference))
            self.assertJsonLDEqual(extracted, reference)
示例#25
0
    def test_csviter_quotechar(self):
        """Iterate ``feed-sample6.csv`` with ``quotechar="'"``, with ``,`` and with ``|`` delimiters.

        The second body is the same fixture with every ``,`` replaced by
        ``|``; both variants are expected to yield the same four rows.
        """
        comma_body = get_testdata('feeds', 'feed-sample6.csv')
        pipe_body = get_testdata('feeds', 'feed-sample6.csv').replace(b',', b'|')

        comma_response = TextResponse(url="http://example.com/", body=comma_body)
        comma_rows = list(csviter(comma_response, quotechar="'"))

        self.assertEqual(comma_rows, [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ])

        pipe_response = TextResponse(url="http://example.com/", body=pipe_body)
        pipe_rows = list(csviter(pipe_response, delimiter="|", quotechar="'"))

        self.assertEqual(pipe_rows, [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ])
示例#26
0
    def test_w3c_rdfaprimer(self):
        """Compare RDFa extraction with the expanded JSON fixture for each listed RDFa primer example.

        The fixture prefix and both renderings of the extracted and expected
        data are printed before the comparison.
        """
        for index in (5, 6, 7, 8, 9, 10, 11, 15):
            fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(index)
            print(fileprefix)
            html_bytes = get_testdata('w3crdfa', fileprefix + '.html')
            body = html_bytes.decode('UTF-8')
            expected_bytes = get_testdata('w3crdfa', fileprefix + '.expanded.json')
            expected = json.loads(expected_bytes.decode('UTF-8'))

            data = RDFaExtractor().extract(body, url='http://www.example.com/index.html')
            print("extracted:\n%s" % pformat(tupleize(data)))
            print("expected:\n%s" % pformat(tupleize(expected)))
            print("extracted:\n%s" % self.prettify(data))
            print("expected:\n%s" % self.prettify(expected))
            self.assertJsonLDEqual(data, expected)
示例#27
0
    def test_jsonld_with_comments(self):
        """Compare JSON-LD extraction with the ``.jsonld`` fixtures in two fixture folders.

        The same two prefixes are checked first in ``schema.org.invalid`` and
        then in ``custom.invalid``.
        """
        for folder in ('schema.org.invalid', 'custom.invalid'):
            for prefix in ('JoinAction.001', 'AllocateAction.001'):
                html = get_testdata(folder, '{}.html'.format(prefix))
                jsonld_name = '{}.jsonld'.format(prefix)
                reference = json.loads(get_testdata(folder, jsonld_name).decode('UTF-8'))

                extracted = JsonLdExtractor().extract(html)
                self.assertEqual(extracted, reference)
示例#28
0
 def test_udublincore(self):
     """Check ``extruct.extract`` dublincore output for ``dublincore_test.html`` with ``uniform=True``."""
     elements = [
         {
             'name': 'DC.title',
             'lang': 'en',
             'content':
             'Expressing Dublin Core\nin HTML/XHTML meta and link elements',
             'URI': 'http://purl.org/dc/elements/1.1/title',
         },
         {
             'name': 'DC.creator',
             'content': 'Andy Powell, UKOLN, University of Bath',
             'URI': 'http://purl.org/dc/elements/1.1/creator',
         },
         {
             'name': 'DC.identifier',
             'scheme': 'DCTERMS.URI',
             'content': 'http://dublincore.org/documents/dcq-html/',
             'URI': 'http://purl.org/dc/elements/1.1/identifier',
         },
         {
             'name': 'DC.format',
             'scheme': 'DCTERMS.IMT',
             'content': 'text/html',
             'URI': 'http://purl.org/dc/elements/1.1/format',
         },
     ]
     terms = [
         {
             'name': 'DCTERMS.issued',
             'scheme': 'DCTERMS.W3CDTF',
             'content': '2003-11-01',
             'URI': 'http://purl.org/dc/terms/issued',
         },
         {
             'name': 'DCTERMS.abstract',
             'content':
             'This document describes how\nqualified Dublin Core metadata can be encoded\nin HTML/XHTML <meta> elements',
             'URI': 'http://purl.org/dc/terms/abstract',
         },
         {
             'name': 'DC.Date.modified',
             'content': '2001-07-18',
             'URI': 'http://purl.org/dc/terms/modified',
         },
         {
             'name': 'DCTERMS.modified',
             'content': '2001-07-18',
             'URI': 'http://purl.org/dc/terms/modified',
         },
         {
             'rel': 'DCTERMS.replaces',
             'hreflang': 'en',
             'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/',
             'URI': 'http://purl.org/dc/terms/replaces',
         },
     ]
     namespaces = {
         'DC': 'http://purl.org/dc/elements/1.1/',
         'DCTERMS': 'http://purl.org/dc/terms/',
     }
     expected = [{
         'elements': elements,
         'terms': terms,
         '@context': namespaces,
         '@type': 'Text',
     }]

     html = get_testdata('misc', 'dublincore_test.html')
     extracted = extruct.extract(html, syntaxes=['dublincore'], uniform=True)
     self.assertEqual(extracted['dublincore'], expected)
    def test_extraction_encoding(self):
        """Compare links from ``BaseSgmlLinkExtractor`` for three differently built responses.

        The ``linkextractor_noenc.html`` body is wrapped twice, once with a
        ``charset=utf-8`` Content-Type header and once without headers; the
        ``linkextractor_latin1.html`` body is wrapped without headers.
        """
        noenc_body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        utf8_headers = {'Content-Type': ['text/html; charset=utf-8']}
        response_utf8 = HtmlResponse(url='http://example.com/utf8',
                                     body=noenc_body,
                                     headers=utf8_headers)
        response_noenc = HtmlResponse(url='http://example.com/noenc',
                                      body=noenc_body)
        latin1_body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1',
                                       body=latin1_body)

        extractor = BaseSgmlLinkExtractor()

        utf8_links = extractor.extract_links(response_utf8)
        self.assertEqual(utf8_links, [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        noenc_links = extractor.extract_links(response_noenc)
        self.assertEqual(noenc_links, [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8')),
        ])

        # document encoding does not affect URL path component, only query part
        # >>> u'sample_ñ.html'.encode('utf8')
        # b'sample_\xc3\xb1.html'
        # >>> u"sample_á.html".encode('utf8')
        # b'sample_\xc3\xa1.html'
        # >>> u"sample_ö.html".encode('utf8')
        # b'sample_\xc3\xb6.html'
        # >>> u"£32".encode('latin1')
        # b'\xa332'
        # >>> u"µ".encode('latin1')
        # b'\xb5'
        latin1_links = extractor.extract_links(response_latin1)
        self.assertEqual(latin1_links, [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%C3%A1.html',
                 text='sample \xe1 text'.decode('latin1')),
            Link(url='http://example.com/sample_%C3%B6.html?price=%A332&%B5=unit',
                 text=''),
        ])
    def test_csviter_delimiter(self):
        """Iterate ``feed-sample3.csv`` after replacing ``,`` with tabs, passing ``delimiter='\\t'``."""
        tabbed = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t')
        response = TextResponse(url="http://example.com/", body=tabbed)
        rows = list(csviter(response, delimiter='\t'))

        self.assertEqual(rows, [
            {'id': '1', 'name': 'alpha', 'value': 'foobar'},
            {'id': '2', 'name': 'unicode', 'value': '\xfan\xedc\xf3d\xe9\u203d'},
            {'id': '3', 'name': 'multi', 'value': "foo\nbar"},
            {'id': '4', 'name': 'empty', 'value': ''},
        ])
示例#31
0
    def test_csviter_encoding(self):
        """Iterate two CSV fixtures through responses declared as ``latin1`` and ``cp852``.

        ``feed-sample4.csv`` is wrapped with ``encoding="latin1"`` and
        ``feed-sample5.csv`` with ``encoding="cp852"``; each is expected to
        yield two rows.
        """
        latin1_body = get_testdata("feeds", "feed-sample4.csv")
        cp852_body = get_testdata("feeds", "feed-sample5.csv")

        latin1_response = TextResponse(url="http://example.com/", body=latin1_body, encoding="latin1")
        latin1_rows = csviter(latin1_response)
        self.assertEqual(list(latin1_rows), [
            {"id": "1", "name": "latin1", "value": "test"},
            {"id": "2", "name": "something", "value": "\xf1\xe1\xe9\xf3"},
        ])

        cp852_response = TextResponse(url="http://example.com/", body=cp852_body, encoding="cp852")
        cp852_rows = csviter(cp852_response)
        self.assertEqual(list(cp852_rows), [
            {"id": "1", "name": "cp852", "value": "test"},
            {"id": "2", "name": "something", "value": "\u255a\u2569\u2569\u2569\u2550\u2550\u2557"},
        ])
    def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
        """Iterate a tab-delimited body wrapped in a plain ``Response`` rather than a ``TextResponse``.

        The body is ``feed-sample3.csv`` with every ``,`` replaced by a tab;
        the expected rows contain unicode keys and values.
        """
        tabbed = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t')
        response = Response(url="http://example.com/", body=tabbed)
        rows = list(csviter(response, delimiter='\t'))

        self.assertEqual(rows, [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ])
示例#33
0
    def test_csviter_wrong_quotechar(self):
        """Iterate ``feed-sample6.csv`` without passing ``quotechar``.

        The expected rows keep literal ``'`` characters in the header keys
        and in several values.
        """
        raw = get_testdata('feeds', 'feed-sample6.csv')
        response = TextResponse(url="http://example.com/", body=raw)
        rows = list(csviter(response))

        self.assertEqual(rows, [
            {u"'id'": u"1", u"'name'": u"'alpha'", u"'value'": u"'foobar'"},
            {u"'id'": u"2", u"'name'": u"'unicode'", u"'value'": u"'\xfan\xedc\xf3d\xe9\u203d'"},
            {u"'id'": u"'3'", u"'name'": u"'multi'", u"'value'": u"'foo"},
            {u"'id'": u"4", u"'name'": u"'empty'", u"'value'": u""},
        ])
    def test_csviter_wrong_quotechar(self):
        """Iterate ``feed-sample6.csv`` without passing ``quotechar``.

        The expected rows keep literal ``'`` characters in the header keys
        and in several values.
        """
        raw = get_testdata('feeds', 'feed-sample6.csv')
        response = TextResponse(url="http://example.com/", body=raw)
        rows = list(csviter(response))

        self.assertEqual(rows, [
            {u"'id'": u"1", u"'name'": u"'alpha'", u"'value'": u"'foobar'"},
            {u"'id'": u"2", u"'name'": u"'unicode'", u"'value'": u"'\xfan\xedc\xf3d\xe9\u203d'"},
            {u"'id'": u"'3'", u"'name'": u"'multi'", u"'value'": u"'foo"},
            {u"'id'": u"4", u"'name'": u"'empty'", u"'value'": u""},
        ])
示例#35
0
    def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
        """Iterate a tab-delimited body wrapped in a plain ``Response`` rather than a ``TextResponse``.

        The body is ``feed-sample3.csv`` with every ``,`` replaced by a tab;
        the expected rows contain unicode keys and values.
        """
        tabbed = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t')
        response = Response(url="http://example.com/", body=tabbed)
        rows = list(csviter(response, delimiter='\t'))

        self.assertEqual(rows, [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ])
示例#36
0
    def test_csviter_exception(self):
        """Advance ``csviter`` four times over ``feed-sample3.csv``, then expect ``StopIteration``."""
        body = get_testdata("feeds", "feed-sample3.csv")

        response = TextResponse(url="http://example.com/", body=body)
        # Named ``rows`` rather than ``iter`` so the builtin is not shadowed.
        rows = csviter(response)
        # Consume the four rows; a fifth ``next`` call must raise.
        for _ in range(4):
            next(rows)

        self.assertRaises(StopIteration, next, rows)
示例#37
0
    def test_csviter_exception(self):
        """Advance ``csviter`` four times over ``feed-sample3.csv``, then expect ``StopIteration``."""
        body = get_testdata('feeds', 'feed-sample3.csv')

        response = TextResponse(url="http://example.com/", body=body)
        # Named ``rows`` rather than ``iter`` so the builtin is not shadowed.
        rows = csviter(response)
        # Consume the four rows; a fifth ``next`` call must raise.
        for _ in range(4):
            next(rows)

        self.assertRaises(StopIteration, next, rows)
示例#38
0
 def _assert_bytes_received(self):
     """Verify the byte chunks recorded in ``self.run.bytes`` for each request.

     Expects nine entries. For every request the recorded chunks are joined
     and compared with the content expected for the request's path; for
     ``/numbers`` it also checks that more than one chunk was recorded.
     Paths not listed below are not checked.
     """
     self.assertEqual(9, len(self.run.bytes))
     for request, data in self.run.bytes.items():
         joined_data = b"".join(data)
         # Resolve the path once per request instead of once per branch.
         path = self.run.getpath(request.url)
         if path == "/":
             self.assertEqual(joined_data, get_testdata("test_site", "index.html"))
         elif path == "/item1.html":
             self.assertEqual(joined_data, get_testdata("test_site", "item1.html"))
         elif path == "/item2.html":
             self.assertEqual(joined_data, get_testdata("test_site", "item2.html"))
         elif path == "/redirected":
             self.assertEqual(joined_data, b"Redirected here")
         elif path == "/redirect":
             self.assertEqual(
                 joined_data,
                 b"\n<html>\n"
                 b"    <head>\n"
                 b"        <meta http-equiv=\"refresh\" content=\"0;URL=/redirected\">\n"
                 b"    </head>\n"
                 b"    <body bgcolor=\"#FFFFFF\" text=\"#000000\">\n"
                 b"    <a href=\"/redirected\">click here</a>\n"
                 b"    </body>\n"
                 b"</html>\n"
             )
         # NOTE(review): "/tem999.html" may be a typo for "/item999.html";
         # left unchanged, confirm against the URLs the test site serves.
         elif path == "/tem999.html":
             self.assertEqual(
                 joined_data,
                 b"\n<html>\n"
                 b"  <head><title>404 - No Such Resource</title></head>\n"
                 b"  <body>\n"
                 b"    <h1>No Such Resource</h1>\n"
                 b"    <p>File not found.</p>\n"
                 b"  </body>\n"
                 b"</html>\n"
             )
         elif path == "/numbers":
             # signal was fired multiple times
             self.assertTrue(len(data) > 1)
             # bytes were received in order
             numbers = [str(x).encode("utf8") for x in range(2**14)]
             self.assertEqual(joined_data, b"".join(numbers))
示例#39
0
    def test_metadata_from_url_all_types(self, mock_get):
        """Check ``metadata_from_url`` against ``self.expected`` plus ``url`` and ``status`` entries.

        ``mock_get`` is configured to return a mock response built from the
        ``songkick/tovestyrke.html`` fixture for ``self.url``.
        """
        # Work on a shallow copy so the shared ``self.expected`` fixture is not
        # mutated by this test (assumes it is a mapping -- TODO confirm).
        expected = dict(self.expected)
        expected['url'] = self.url
        expected['status'] = '200 OK'
        mock_response = build_mock_response(
            url=self.url,
            content=get_testdata('songkick', 'tovestyrke.html'),
        )
        mock_get.return_value = mock_response

        data = metadata_from_url(self.url)
        self.assertEqual(data, expected)
示例#40
0
    def test_w3c_rdfaprimer(self):
        base_url = 'http://www.example.com/index.html'

        # Replacement for the extractor's _fix_order that always raises.
        # This is for testing that the fix to issue 116 does not severely
        # affect rdfa output even in the presence of a bug in the code.
        def mocked_fix_order(x, y, z):
            raise Exception()

        for example_no in (5, 6, 7, 8, 9, 10, 11, 15):
            fileprefix = 'w3c.rdfaprimer.example{:03d}'.format(example_no)
            print(fileprefix)
            body = get_testdata('w3crdfa', fileprefix + '.html')
            raw_expected = get_testdata('w3crdfa', fileprefix + '.expanded.json')
            expected = json.loads(raw_expected.decode('UTF-8'))

            # First pass: untouched extractor.
            rdfae = RDFaExtractor()
            self.assertJsonLDEqual(
                rdfae.extract(body, base_url=base_url), expected)

            # Second pass: same extractor with _fix_order forced to fail.
            rdfae._fix_order = mocked_fix_order
            self.assertJsonLDEqual(
                rdfae.extract(body, base_url=base_url), expected)
示例#41
0
    def test_csviter_falserow(self):
        # Two extra lines with 2 and 4 fields are appended to the sample;
        # neither of them appears in the rows expected back from csviter.
        sample = get_testdata('feeds', 'feed-sample3.csv')
        body = b'\n'.join([sample, b'a,b', b'a,b,c,d'])
        response = TextResponse(url="http://example.com/", body=body)

        expected_rows = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(list(csviter(response)), expected_rows)
示例#42
0
    def test_csviter_headers(self):
        # The first line of the sample is stripped off and handed to csviter
        # as explicit headers; the remaining lines form the response body.
        lines = get_testdata('feeds', 'feed-sample3.csv').splitlines()
        raw_headers = lines[0].split(b',')
        body = b'\n'.join(lines[1:])

        response = TextResponse(url="http://example.com/", body=body)
        header_names = [name.decode('utf-8') for name in raw_headers]
        rows = list(csviter(response, headers=header_names))

        expected_rows = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(rows, expected_rows)
示例#43
0
    def test_csviter_quotechar(self):
        # Both variants of the sample (comma-delimited, and the same bytes
        # with every comma replaced by '|') must produce these rows.
        expected_rows = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]

        body1 = get_testdata('feeds', 'feed-sample6.csv')
        body2 = get_testdata('feeds', 'feed-sample6.csv').replace(b',', b'|')

        # Default delimiter, single-quote quotechar.
        response1 = TextResponse(url="http://example.com/", body=body1)
        rows1 = list(csviter(response1, quotechar="'"))
        self.assertEqual(rows1, expected_rows)

        # Pipe delimiter, single-quote quotechar.
        response2 = TextResponse(url="http://example.com/", body=body2)
        rows2 = list(csviter(response2, delimiter="|", quotechar="'"))
        self.assertEqual(rows2, expected_rows)
示例#44
0
    def test_extraction_encoding(self):
        # The same fixture body is served once with an explicit utf-8
        # Content-Type header and once with no declared encoding; a second
        # fixture is served with no declared encoding as well.
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8',
            body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']},
        )
        response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

        # Expected link texts are decoded from *bytes* literals. Calling
        # .decode() on a plain '' literal only works on Python 2; Python 3
        # text strings have no decode() method and raise AttributeError.
        utf8_text = b'sample \xe2\x82\xac text'.decode('utf-8')
        latin1_text = b'sample \xe1 text'.decode('latin1')

        lx = BaseSgmlLinkExtractor()
        self.assertEqual(lx.extract_links(response_utf8), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text=utf8_text),
        ])

        self.assertEqual(lx.extract_links(response_noenc), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text=utf8_text),
        ])

        self.assertEqual(lx.extract_links(response_latin1), [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html', text=latin1_text),
        ])
示例#45
0
    def test_main_all(self, mock_get):
        # Serialize a *copy* of the fixture expectations: adding 'url' and
        # 'status' to self.expected in place would mutate the fixture
        # mapping itself and leak into anything else that shares it.
        expected = json.dumps(
            dict(self.expected, url=self.url, status='200 OK'),
            indent=2,
            sort_keys=True,
        )

        # mock_get is the injected mock; it is made to return a response
        # built from the songkick fixture page.
        mock_get.return_value = build_mock_response(
            url=self.url,
            content=get_testdata('songkick', 'tovestyrke.html'),
        )

        data = main([self.url])
        self.assertEqual(data, expected)
    def test_extraction_encoding(self):
        # The same fixture body is served once with an explicit utf-8
        # Content-Type header and once with no declared encoding; a second
        # fixture is served with no declared encoding as well.
        body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        response_utf8 = HtmlResponse(
            url='http://example.com/utf8',
            body=body,
            headers={'Content-Type': ['text/html; charset=utf-8']},
        )
        response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
        body = get_testdata('link_extractor', 'linkextractor_latin1.html')
        response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

        # Expected link texts are decoded from *bytes* literals. Calling
        # .decode() on a plain '' literal only works on Python 2; Python 3
        # text strings have no decode() method and raise AttributeError.
        utf8_text = b'sample \xe2\x82\xac text'.decode('utf-8')
        latin1_text = b'sample \xe1 text'.decode('latin1')

        lx = BaseSgmlLinkExtractor()
        self.assertEqual(lx.extract_links(response_utf8), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text=utf8_text),
        ])

        self.assertEqual(lx.extract_links(response_noenc), [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html', text=utf8_text),
        ])

        self.assertEqual(lx.extract_links(response_latin1), [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html', text=latin1_text),
        ])
    def test_csviter_encoding(self):
        latin1_body = get_testdata('feeds', 'feed-sample4.csv')
        cp852_body = get_testdata('feeds', 'feed-sample5.csv')

        # Each case: (response body, declared encoding, rows csviter yields).
        cases = (
            (
                latin1_body,
                'latin1',
                [
                    {'id': '1', 'name': 'latin1', 'value': 'test'},
                    {'id': '2', 'name': 'something', 'value': '\xf1\xe1\xe9\xf3'},
                ],
            ),
            (
                cp852_body,
                'cp852',
                [
                    {'id': '1', 'name': 'cp852', 'value': 'test'},
                    {'id': '2', 'name': 'something', 'value': '\u255a\u2569\u2569\u2569\u2550\u2550\u2557'},
                ],
            ),
        )

        for body, encoding, expected_rows in cases:
            response = TextResponse(url="http://example.com/", body=body, encoding=encoding)
            self.assertEqual(list(csviter(response)), expected_rows)
示例#48
0
    def test_metadata_from_url_opengraph_only(self, mock_get):
        # Only the 'opengraph' section of the fixture expectations is kept,
        # alongside the url and status entries.
        expected = {
            'opengraph': self.expected['opengraph'],
            'url': self.url,
            'status': '200 OK',
        }
        fixture_page = get_testdata('songkick', 'tovestyrke.html')
        mock_get.return_value = build_mock_response(url=self.url, content=fixture_page)

        metadata = metadata_from_url(self.url, syntaxes=['opengraph'])
        # The result is passed through jsonize_dict before comparison.
        self.assertEqual(jsonize_dict(metadata), expected)
示例#49
0
    def test_main_single_syntax(self, mock_get):
        # main() is compared against the JSON dump (indent=2, sorted keys) of
        # the opengraph-only expectations.
        expected_payload = {
            'opengraph': self.expected['opengraph'],
            'url': self.url,
            'status': '200 OK',
        }
        expected = json.dumps(expected_payload, indent=2, sort_keys=True)

        fixture_page = get_testdata('songkick', 'tovestyrke.html')
        mock_get.return_value = build_mock_response(url=self.url, content=fixture_page)

        output = main([self.url, '--syntax', 'opengraph'])
        self.assertEqual(output, expected)
示例#50
0
    def test_csviter_defaults(self):
        sample = get_testdata('feeds', 'feed-sample3.csv')
        response = TextResponse(url="http://example.com/", body=sample)

        rows = list(csviter(response))
        expected_rows = [
            {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
            {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
            {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
            {u'id': u'4', u'name': u'empty', u'value': u''},
        ]
        self.assertEqual(rows, expected_rows)

        # Explicit type check: equality alone does not prove that every key
        # and value is a six.text_type instance.
        for row in rows:
            self.assertTrue(all(isinstance(key, six.text_type) for key in row))
            self.assertTrue(all(isinstance(val, six.text_type) for val in row.values()))
示例#51
0
    def test_parse_rows(self):
        sample = get_testdata('feeds', 'feed-sample6.csv')
        response = Response("http://example.org/dummy.csv", body=sample)

        # Minimal subclass of the spider under test: comma delimiter,
        # single-quote quotechar, and parse_row hands each row back as-is.
        class _RowEchoSpider(self.spider_class):
            name = "test"
            delimiter = ","
            quotechar = "'"

            def parse_row(self, response, row):
                return row

        rows = list(_RowEchoSpider().parse_rows(response))
        assert rows[0] == {'id': '1', 'name': 'alpha', 'value': 'foobar'}
        assert len(rows) == 4
示例#52
0
    def test_metadata_from_url_rdfa_only(self, mock_get):
        # Only the 'rdfa' section of the fixture expectations is kept,
        # alongside the url and status entries.
        expected = {
            'rdfa': self.expected['rdfa'],
            'url': self.url,
            'status': '200 OK',
        }
        fixture_page = get_testdata('songkick', 'tovestyrke.html')
        mock_get.return_value = build_mock_response(url=self.url, content=fixture_page)

        # microdata and jsonld are switched off explicitly; rdfa is on.
        syntax_flags = dict(microdata=False, jsonld=False, rdfa=True)
        metadata = metadata_from_url(self.url, **syntax_flags)
        self.assertEqual(metadata, expected)
示例#53
0
 def test_umicroformat(self):
     # Expected output of extruct.extract(..., syntaxes=['microformat'],
     # uniform=True) for the fixture page, built item by item.
     context = 'http://microformats.org/wiki/'

     hidden_item = {
         '@context': context,
         '@type': ['h-hidden-phone', 'h-hidden-tablet'],
         'name': [''],
     }

     # Second item: carries a 'children' list and no name of its own.
     product_child = {
         '@type': ['h-hidden-phone'],
         'name': [
             'aJ Styles FastLane 2018 15 x '
             '17 Framed Plaque w/ Ring '
             'Canvas'
         ],
         'photo': [
             '/on/demandware.static/-/Sites-main/default/dwa3227ee6/images/small/CN1148.jpg'
         ],
     }
     parent_item = {
         '@context': context,
         '@type': ['h-hidden-phone'],
         'children': [
             {'@type': ['h-hidden-phone', 'h-hidden-tablet'], 'name': ['']},
             product_child,
         ],
     }

     # Third item: an h-entry with a nested h-card author.
     author = {
         '@type': ['h-card'],
         'name': ['W. Developer'],
         'url': ['http://example.com'],
         'value': 'W. Developer',
     }
     entry_item = {
         '@context': context,
         '@type': ['h-entry'],
         'author': [author],
         'content': [{
             'html': '<p>Blah blah blah</p>',
             'value': 'Blah blah blah',
         }],
         'name': ['Microformats are amazing'],
         'published': ['2013-06-13 12:00:00'],
         'summary': [
             'In which I extoll the virtues of using '
             'microformats.'
         ],
     }

     expected = [hidden_item, parent_item, entry_item]
     body = get_testdata('misc', 'microformat_test.html')
     extracted = extruct.extract(body, syntaxes=['microformat'], uniform=True)
     self.assertEqual(extracted['microformat'], expected)
示例#54
0
    def test_csviter_defaults(self):
        sample = get_testdata("feeds", "feed-sample3.csv")
        response = TextResponse(url="http://example.com/", body=sample)

        rows = list(csviter(response))
        expected_rows = [
            {"id": "1", "name": "alpha", "value": "foobar"},
            {"id": "2", "name": "unicode", "value": "\xfan\xedc\xf3d\xe9\u203d"},
            {"id": "3", "name": "multi", "value": FOOBAR_NL},
            {"id": "4", "name": "empty", "value": ""},
        ]
        self.assertEqual(rows, expected_rows)

        # Explicit type check: equality alone does not prove that every key
        # and value is a str instance.
        for row in rows:
            self.assertTrue(all(isinstance(key, str) for key in row))
            self.assertTrue(all(isinstance(val, str) for val in row.values()))
示例#55
0
 def _links_response_no_href(self):
     """Return a ``self.response_class`` response whose body is the
     ``linkextractor_no_href.html`` fixture."""
     page = get_testdata('link_extractor', 'linkextractor_no_href.html')
     return self.response_class('http://example.com/index', body=page)
 def setUp(self):
     """Store an HtmlResponse for the sgml_linkextractor fixture on
     ``self.response``."""
     page = get_testdata('link_extractor', 'sgml_linkextractor.html')
     self.response = HtmlResponse(
         url='http://example.com/index',
         body=page,
     )
示例#57
0
 def _links_response(self):
     """Return a ``self.response_class`` response whose body is the
     ``sgml_linkextractor.html`` fixture."""
     page = get_testdata('link_extractor', 'sgml_linkextractor.html')
     return self.response_class('http://example.com/index', body=page)
 def setUp(self):
     """Store an HtmlResponse for the sgml_linkextractor fixture on
     ``self.response``."""
     page = get_testdata("link_extractor", "sgml_linkextractor.html")
     self.response = HtmlResponse(
         url="http://example.com/index",
         body=page,
     )
 def setUp(self):
     """Store an HtmlResponse for the sgml_linkextractor fixture on
     ``self.response``."""
     page = get_testdata('link_extractor', 'sgml_linkextractor.html')
     self.response = HtmlResponse(
         url='http://example.com/index',
         body=page,
     )