def test_maybe_convert_dedupe_urls(self):
    """Only the first response seen for a URL is converted; repeats give None."""
    first = warc_record(warc_response('foo', 'http://foo'))
    first_row = maybe_convert(first, 'foo')
    self.assertEqual(BIGQUERY_JSON, first_row)

    # Same URL with a different body: expected to be dropped.
    repeat = warc_record(warc_response('bar', 'http://foo'))
    repeat_row = maybe_convert(repeat, 'foo')
    self.assertIsNone(repeat_row)
def test_other_domains(self):
    """Keep the domain itself and its subdomains; discard unrelated domains."""
    kept_urls = (
        'http://foo.com/1',
        'http://sub.foo.com/2',
        'http://foo.com:80/3',
    )
    for url in kept_urls:
        with self.subTest(url=url):
            record = warc_record(warc_response('X', url))
            self.assertIsNotNone(maybe_convert(record, 'foo.com'))

    # A response from a different domain must not be converted.
    other = warc_record(warc_response('3', 'http://bar.com/3'))
    self.assertIsNone(maybe_convert(other, 'foo.com'))
def test_mf2py_crash_lxml_and_html5lib(self, _):
    """Regression test for https://github.com/tommorris/mf2py/issues/78

    Expects an empty mf2 JSON object and empty derived lists in the output.

    NOTE(review): the unused `_` argument is presumably a mock injected by a
    patch decorator that is not visible in this chunk -- confirm.
    """
    record = warc_record(warc_response('X', 'http://foo'))
    row = maybe_convert(record, 'foo')

    self.assertEqual('{}', row['mf2'])
    for field in ('mf2_classes', 'u_urls', 'rels'):
        self.assertEqual([], row[field])
def test_composite_u_url(self):
    """A tag that is both u-url and h-entry yields a composite url property.

    Check that the string URL nested inside that composite value is what
    ends up in u_urls, resolved against the page URL.

    Parsed mf2:
    {"items": [{
        "type": ["h-feed"],
        "properties": {
          "url": [{
              "type": ["h-entry"],
              "properties": {
                "summary": ["Two fresh data sets"],
                "name": ["Two fresh data sets"],
                "url": ["../X1Ya.html"]
              },
              "value": "../X1Ya.html"
            }
          ],
          ...
    }}]}
    """
    html = """\
<div class="h-feed">
  <a class="h-entry u-url" href="../X1Ya.html">
    <p class="p-summary">Two fresh data sets</p>
  </a>
</div>
"""
    record = warc_record(warc_response(html, 'http://foo'))
    u_urls = maybe_convert(record, 'foo')['u_urls']
    self.assertEqual(['http://foo/X1Ya.html'], u_urls)
def test_rels(self):
    """Check the 'rels' field: one entry per rel value with the URLs using it.

    Each case is (HTML content, expected 'rels' list). Every case gets its
    own URL (suffixed with the case index) so no two cases share a URL.
    """
    cases = (
        ('', []),
        ('<link rel="foo" href="http://x">', [{
            'value': 'foo',
            'urls': ['http://x']
        }]),
        ('<link rel="foo bar" href="http://x">', [{
            'value': 'foo',
            'urls': ['http://x']
        }, {
            'value': 'bar',
            'urls': ['http://x']
        }]),
        ('<link rel="foo" href="http://x"> <link rel="bar foo" href="http://y">',
         [{
             'value': 'foo',
             'urls': ['http://x', 'http://y']
         }, {
             'value': 'bar',
             'urls': ['http://y']
         }]),
    )
    for i, (content, expected) in enumerate(cases):
        # Do the conversion inside the subTest so that an exception (or a
        # None result) is reported against this case and the remaining
        # cases still run.
        with self.subTest(content=content):
            record = warc_record(warc_response(content, 'http://foo/%s' % i))
            actual = maybe_convert(record, 'foo')['rels']
            self.assertEqual(expected, actual)
def test_max_row_size(self):
    """Oversized rows get their html and mf2 replaced by MAX_ROW_MESSAGE.

    Discovered by http://www.downes.ca/research_authors.htm , ~45MB single
    HTML file with a bunch of big embedded data: URIs.

    NOTE(review): the tiny fixture only trips the limit if the maximum row
    size is patched down outside this chunk -- confirm.
    """
    record = warc_record(warc_response('X', 'http://foo'))
    row = maybe_convert(record, 'foo')

    placeholder = warc_to_bigquery.MAX_ROW_MESSAGE
    self.assertEqual(placeholder, row['html'])
    self.assertEqual({placeholder: None}, json.loads(row['mf2']))
def test_utf8_url_and_html(self):
    """Non-ASCII characters in the URL and body survive conversion.

    Exercised both through maybe_convert() directly and through
    self._run_main().
    """
    body = 'Charles ☕ Foo'
    url = 'http://foo/☕/post'

    record = warc_record(warc_response(body, url) + '\r\n\r\n')
    converted = maybe_convert(record, 'foo')
    self.assertIn(body, converted['html'])

    # Second pass uses a distinct URL -- presumably so it is not treated as
    # a duplicate of the one converted above.
    main_url = url + '/1'
    responses = (warc_response(body, main_url) + '\r\n\r\n',)
    got = self._run_main(responses)
    self.assertEqual(main_url, got['url'])
    self.assertIn(body, got['html'])
def test_links(self):
    """Check the 'links' field extracted from <a> and <link> tags.

    The expected value lists the <link> tags first (head, then body) and
    then the <a> tags in document order; tags with a missing or empty href
    are not expected to appear.
    """

    def link(tag, url, inner_html='', rels=(), classes=()):
        # Build one expected entry; lists are materialized so equality with
        # the actual output is list-to-list.
        return {
            'url': url,
            'inner_html': inner_html,
            'tag': tag,
            'rels': list(rels),
            'classes': list(classes),
        }

    body = """\
foo <a href="#frag"></a>
bar <a class="x" rel="a b" href="/local">bar</a>
baz <a class="y u-in-reply-to" href="http://ext/ernal">baz</a>
baj <a class="u-repost-of z" href="http://ext/ernal"><img src="/baj"></a>
baj <link rel="c" class="w" href="http://link/tag" />
biff <a rel="c" class="w" /> <!-- no hrefs, these should be ignored -->
biff <a rel="c" class="w" href="" />
"""
    response = warc_response(
        body, 'http://foo', html_head='<link rel="d e" href="https://head/link">')
    record = warc_record(response)

    expected = [
        link('link', 'https://head/link', rels=['d', 'e']),
        link('link', 'http://link/tag', rels=['c'], classes=['w']),
        link('a', '#frag'),
        link('a', '/local', inner_html='bar', rels=['a', 'b'], classes=['x']),
        link('a', 'http://ext/ernal', inner_html='baz',
             classes=['y', 'u-in-reply-to']),
        link('a', 'http://ext/ernal', inner_html='<img src="/baj"/>',
             classes=['u-repost-of', 'z']),
    ]
    self.assertEqual(expected, maybe_convert(record, 'foo')['links'])
def test_maybe_convert(self):
    """Convert two basic responses and compare against the expected rows.

    The second response carries an extra HTTP header, which is expected to
    show up at the end of the row's 'headers' list.
    """
    foo_row = maybe_convert(
        warc_record(warc_response('foo', 'http://foo')), 'foo')
    self.assertEqual(BIGQUERY_JSON, foo_row)

    bar_response = warc_response(
        'bar', 'http://bar', extra_headers={'XYZ': 'Baz'})
    bar_record = warc_record(bar_response)

    # Derive the expected 'bar' row from the shared fixture; deepcopy keeps
    # the module-level BIGQUERY_JSON untouched.
    expected_bar = copy.deepcopy(BIGQUERY_JSON)
    expected_bar['domain'] = 'bar'
    expected_bar['url'] = 'http://bar'
    expected_bar['html'] = HTML % ('', 'bar')
    expected_bar['headers'].append({'name': 'XYZ', 'value': 'Baz'})

    self.assertEqual(expected_bar, maybe_convert(bar_record, 'bar'))
def test_max_links(self):
    """Only two of the page's three links are expected in the output.

    Discovered by pages on werd.io with lots of spam, e.g.
    http://werd.io/2014/why-cant-you-comment-on-this-post-indieweb ,
    before Ben cleaned them up.

    NOTE(review): the limit of 2 is presumably patched in outside this
    chunk -- confirm.
    """
    html = """\
<a href="http://one"></a>
<a href="http://two"></a>
<a href="http://three"></a>
"""
    record = warc_record(warc_response(html, 'http://foo'))
    links = maybe_convert(record, 'foo')['links']
    self.assertEqual(2, len(links))
def test_url_blacklist(self):
    """Responses for blacklisted URL patterns are not converted.

    Each path below is appended to http://foo and the resulting response
    is expected to make maybe_convert() return None.
    """
    blacklisted_paths = (
        '/foo/bar?shared=email&x',
        '/?share=facebook',
        '/?x&share=tumblr',
        '/?like_comment=123',
        '/?x&replytocom=456',
        '/wp-login.php?redirect_to=qwert',
    )
    for path in blacklisted_paths:
        # subTest names the offending path on failure and lets the
        # remaining paths run, matching the other table-driven tests here.
        with self.subTest(path=path):
            record = warc_record(warc_response('', 'http://foo%s' % path))
            self.assertIsNone(maybe_convert(record, 'foo'))
def test_microformats(self):
    """Check the 'mf2_classes' field for a range of HTML snippets.

    Each case is (HTML content, expected class list). The expected lists
    contain each class once, in alphabetical order, including classes of
    nested microformats. Every case gets its own URL (suffixed with the
    case index) so no two cases share a URL.
    """
    cases = (
        ('', []),
        ('foo', []),
        ('<div class="h-entry"></div>', ['h-entry']),
        ('<div class="h-entry">1</div> <div class="h-entry">2</div>',
         ['h-entry']),
        ('<div class="h-entry h-card"></div>', ['h-card', 'h-entry']),
        ('<div class="h-feed"><div class="h-entry"><div class="h-card">'
         '</div></div></div> <div class="h-adr"></div>',
         ['h-adr', 'h-card', 'h-entry', 'h-feed']),
        # microformats1 backward compatibility
        ('<div class="hentry"></div>', ['h-entry']),
    )
    for i, (content, expected) in enumerate(cases):
        # Do the conversion inside the subTest so that an exception (or a
        # None result) is reported against this case and the remaining
        # cases still run.
        with self.subTest(content=content):
            record = warc_record(warc_response(content, 'http://foo/%s' % i))
            actual = maybe_convert(record, 'foo')['mf2_classes']
            self.assertEqual(expected, actual)
def test_u_urls(self):
    """Check the 'u_urls' field for a range of HTML snippets.

    Each case is (HTML content, expected URL list). Per the cases below,
    u-url values of top-level items are collected in document order, while
    the u-url of an h-entry nested inside an h-feed is not. Every case gets
    its own URL (suffixed with the case index) so no two cases share a URL.
    """
    cases = (
        ('', []),
        ('foo', []),
        ('<div class="h-entry"></div>', []),
        ('<div class="h-entry"><a class="u-url" href="http://foo" /></div>',
         ['http://foo']),
        ('<div class="h-entry"><a class="u-url" href="http://foo" /></div>'
         '<div class="h-entry"><a class="u-url" href="http://bar" /></div>',
         ['http://foo', 'http://bar']),
        ('<div class="h-feed"><div class="h-entry">'
         '<a class="u-url" href="http://foo" /></div></div>', []),
        # microformats1 backward compatibility
        # http://microformats.org/wiki/rel-bookmark#rel.3D.22bookmark.22
        ('<div class="hentry"><a rel="bookmark" href="http://baz" /></div>',
         ['http://baz']),
    )
    for i, (content, expected) in enumerate(cases):
        # Do the conversion inside the subTest so that an exception (or a
        # None result) is reported against this case and the remaining
        # cases still run.
        with self.subTest(content=content):
            record = warc_record(warc_response(content, 'http://foo/%s' % i))
            actual = maybe_convert(record, 'foo')['u_urls']
            self.assertEqual(expected, actual)
def test_mf2py_crash_lxml(self, _):
    """Regression test for https://github.com/tommorris/mf2py/issues/78

    Expects the row's mf2 field to be the JSON string '{"x": "y"}'.

    NOTE(review): the unused `_` argument and the canned mf2 value
    presumably come from a patch decorator that is not visible in this
    chunk -- confirm.
    """
    record = warc_record(warc_response('X', 'http://foo'))
    row = maybe_convert(record, 'foo')
    self.assertEqual('{"x": "y"}', row['mf2'])
def test_maybe_convert_not_response(self):
    """The header, metadata and request record fixtures all convert to None."""
    non_response_records = (
        WARC_HEADER_RECORD,
        WARC_METADATA_RECORD,
        WARC_REQUEST_RECORD,
    )
    for record in non_response_records:
        self.assertIsNone(maybe_convert(record, 'foo'))