def test_requests_context():
    """Check RequestsContext.read and .fetch against a served blob and a local file."""
    ctx = RequestsContext(verify=False)

    # Reading a served link must hand back the blob unchanged.
    with make_url(BLOB, make_md5(BLOB)) as served_url:
        served_link = Link.wrap(served_url)
        assert ctx.read(served_link) == BLOB

    # Fetching returns a filename whose contents must equal the blob.
    with make_url(BLOB, make_md5(BLOB)) as served_url:
        fetched_path = ctx.fetch(Link.wrap(served_url))
        with open(fetched_path, 'rb') as fetched:
            assert fetched.read() == BLOB

    # test local reading
    with temporary_file() as scratch:
        scratch.write(b'goop')
        # Flush so the bytes are on disk before the context reads by name.
        scratch.flush()
        local_link = Link.wrap(scratch.name)
        assert ctx.read(local_link) == b'goop'
def test_urllib_context_utf8_encoding():
    """Check that UrllibContext.content decodes bytes with DEFAULT_ENCODING."""
    raw_payload = b'this is a decoded utf8 string'

    with named_temporary_file() as scratch:
        scratch.write(raw_payload)
        scratch.flush()
        local_link = Link.wrap(scratch.name)

        # Trick UrllibContext into thinking this is a remote link
        class MockUrllibContext(UrllibContext):
            def open(self, link):
                # The requested link is deliberately ignored; the local
                # temporary file is always opened instead.
                return super(MockUrllibContext, self).open(local_link)

        mock_context = MockUrllibContext()
        remote_looking_link = Link.wrap('http://www.google.com')
        decoded = mock_context.content(remote_looking_link)
        assert decoded == raw_payload.decode(UrllibContext.DEFAULT_ENCODING)
def test_urllib_context_utf8_encoding():
    """Verify content() of a UrllibContext returns text decoded via DEFAULT_ENCODING."""
    source_bytes = b'this is a decoded utf8 string'

    with named_temporary_file() as tmp:
        tmp.write(source_bytes)
        tmp.flush()
        local_link = Link.wrap(tmp.name)

        # Trick UrllibContext into thinking this is a remote link
        class MockUrllibContext(UrllibContext):
            def open(self, link):
                # Whatever link is asked for, serve the temporary file.
                parent = super(MockUrllibContext, self)
                return parent.open(local_link)

        context = MockUrllibContext()
        text = context.content(Link.wrap('http://www.google.com'))
        expected_text = source_bytes.decode(UrllibContext.DEFAULT_ENCODING)
        assert text == expected_text
def test_requests_context():
    """Check RequestsContext.read and .fetch for a served blob and a named temp file."""
    ctx = RequestsContext(verify=False)

    # read(): bytes served at the URL must match the blob.
    with make_url(BLOB, make_md5(BLOB)) as blob_url:
        assert ctx.read(Link.wrap(blob_url)) == BLOB

    # fetch(): the returned filename must contain the blob.
    with make_url(BLOB, make_md5(BLOB)) as blob_url:
        downloaded = ctx.fetch(Link.wrap(blob_url))
        with open(downloaded, 'rb') as stream:
            contents = stream.read()
            assert contents == BLOB

    # test local reading
    with named_temporary_file() as scratch:
        scratch.write(b'goop')
        # Flush before reading the file back through its name.
        scratch.flush()
        assert ctx.read(Link.wrap(scratch.name)) == b'goop'
def test_link_wrapping():
    """Check Link.wrap and Link.wrap_iterable on strings, Links and invalid input."""
    # A plain string is wrapped into a Link carrying the same url.
    wrapped = Link.wrap("https://www.google.com")
    assert wrapped.url == "https://www.google.com"

    # Wrapping an existing Link keeps the url intact.
    rewrapped = Link.wrap(Link.wrap("https://www.google.com"))
    assert rewrapped.url == "https://www.google.com"

    # Non-string, non-Link input is rejected by both entry points.
    with pytest.raises(ValueError):
        Link.wrap(1234)
    with pytest.raises(ValueError):
        Link.wrap_iterable(1234)

    # A single string yields exactly one Link.
    single = Link.wrap_iterable("https://www.google.com")
    assert len(single) == 1
    assert single[0].url == "https://www.google.com"

    # A mixed list of strings and Links is wrapped element by element.
    mixed = Link.wrap_iterable(
        ["https://www.google.com", Link("http://www.google.com")])
    wanted = set([Link("http://www.google.com"), Link("https://www.google.com")])
    assert set(mixed) == wanted
def test_requests_context_retries_connect_timeout_retries_exhausted():
    """Expect Context.Error when reading with num_timeouts=3 and max_retries=2."""
    pool_cls = requests.packages.urllib3.connectionpool.HTTPConnectionPool

    with mock.patch.object(pool_cls, '_make_request') as mock_make_request:
        url, side_effect = timeout_side_effect(num_timeouts=3)
        mock_make_request.side_effect = side_effect

        context = RequestsContext(verify=False, max_retries=2)
        target = Link.wrap(url)
        with pytest.raises(Context.Error):
            context.read(target)
def test_requests_context_retries_connect_timeout():
    """Check that a read still returns BLOB with the timeout side effect installed."""
    pool_cls = requests.packages.urllib3.connectionpool.HTTPConnectionPool

    with mock.patch.object(pool_cls, '_make_request') as mock_make_request:
        # timeout_side_effect supplies both the URL to hit and the side
        # effect to install on the patched _make_request.
        url, side_effect = timeout_side_effect()
        mock_make_request.side_effect = side_effect

        context = RequestsContext(verify=False)
        assert context.read(Link.wrap(url)) == BLOB
def test_requests_context_retries_connect_timeout():
    """Read through a patched _make_request and expect the BLOB payload back."""
    patcher = mock.patch.object(
        requests.packages.urllib3.connectionpool.HTTPConnectionPool,
        '_make_request')

    with patcher as mock_make_request:
        # The helper returns a (url, side_effect) pair.
        url, effect = timeout_side_effect()
        mock_make_request.side_effect = effect

        ctx = RequestsContext(verify=False)
        payload = ctx.read(Link.wrap(url))
        assert payload == BLOB
def test_requests_context_retries_connect_timeout_retries_exhausted():
    """Expect Context.Error with num_timeouts=3 and PEX_HTTP_RETRIES set to '2'."""
    pool_cls = requests.packages.urllib3.connectionpool.HTTPConnectionPool

    with mock.patch.object(pool_cls, '_make_request') as mock_make_request:
        url, side_effect = timeout_side_effect(num_timeouts=3)
        mock_make_request.side_effect = side_effect

        # The retry setting is supplied through the environment mapping
        # rather than a constructor keyword.
        retry_env = Variables(environ={'PEX_HTTP_RETRIES': '2'})
        context = RequestsContext(verify=False, env=retry_env)

        with pytest.raises(Context.Error):
            context.read(Link.wrap(url))
def test_requests_context_retries_connect_timeout_retries_exhausted():
    """Reading must raise Context.Error when num_timeouts=3 but max_retries=2."""
    patcher = mock.patch.object(
        requests.packages.urllib3.connectionpool.HTTPConnectionPool,
        '_make_request')

    with patcher as mock_make_request:
        # The helper returns a (url, side_effect) pair.
        url, effect = timeout_side_effect(num_timeouts=3)
        mock_make_request.side_effect = effect

        ctx = RequestsContext(verify=False, max_retries=2)
        with pytest.raises(Context.Error):
            ctx.read(Link.wrap(url))
def test_requests_context_retries_connect_timeout_retries_exhausted():
    """Reading must raise Context.Error with num_timeouts=3 and PEX_HTTP_RETRIES='2'."""
    patcher = mock.patch.object(
        requests.packages.urllib3.connectionpool.HTTPConnectionPool,
        '_make_request')

    with patcher as mock_make_request:
        url, effect = timeout_side_effect(num_timeouts=3)
        mock_make_request.side_effect = effect

        # Configure retries via the Variables environment mapping.
        env = Variables(environ={'PEX_HTTP_RETRIES': '2'})
        ctx = RequestsContext(verify=False, env=env)
        link = Link.wrap(url)

        with pytest.raises(Context.Error):
            ctx.read(link)
def test_crawler_local():
    """Check Crawler.crawl_local and recursive Crawler.crawl on a temporary tree.

    The tree holds three empty files at the top level plus two
    subdirectories (dir1, dir2), each holding the same three empty files.
    """
    file_names = ('a.txt', 'b.txt', 'c.txt')

    with temporary_dir() as root:
        # Populate the top level with empty files.
        for name in file_names:
            with open(os.path.join(root, name), 'w'):
                pass

        # Populate each subdirectory with the same empty files.
        for index in (1, 2):
            os.mkdir(os.path.join(root, 'dir%d' % index))
            for name in file_names:
                with open(os.path.join(root, 'dir%d' % index, name), 'w'):
                    pass

        # basic file / dir rel splitting
        links, rels = Crawler.crawl_local(Link.wrap(root))
        top_level_files = set(
            Link.wrap(os.path.join(root, name)) for name in file_names)
        assert set(links) == top_level_files
        sub_dirs = set(
            Link.wrap(os.path.join(root, 'dir%d' % index)) for index in (1, 2))
        assert set(rels) == sub_dirs

        # recursive crawling, single vs multi-threaded
        # NOTE(review): the outer loop variable is never passed to Crawler;
        # the loop only repeats the crawl a second time - confirm intent.
        for caching in (False, True):
            for threads in (1, 2, 3):
                links = Crawler(threads=threads).crawl([root], follow_links=True)
                in_root = set(
                    Link.wrap(os.path.join(root, name)) for name in file_names)
                in_dir1 = set(
                    Link.wrap(os.path.join(root, 'dir1', name))
                    for name in file_names)
                in_dir2 = set(
                    Link.wrap(os.path.join(root, 'dir2', name))
                    for name in file_names)
                expect_links = in_root | in_dir1 | in_dir2
                assert set(links) == expect_links
def test_crawler_local():
    """Crawl a temporary directory tree locally and recursively, checking the links.

    Layout: a.txt, b.txt and c.txt at the root, and the same three files
    again inside dir1 and dir2.
    """
    names = ('a.txt', 'b.txt', 'c.txt')

    with temporary_dir() as td:
        # Create the empty files at the root of the tree.
        for fname in names:
            with open(os.path.join(td, fname), 'w'):
                pass

        # Create dir1 and dir2, each with its own copy of the empty files.
        for num in (1, 2):
            os.mkdir(os.path.join(td, 'dir%d' % num))
            for fname in names:
                with open(os.path.join(td, 'dir%d' % num, fname), 'w'):
                    pass

        # basic file / dir rel splitting
        found_links, found_rels = Crawler.crawl_local(Link.wrap(td))
        assert set(found_links) == set(
            Link.wrap(os.path.join(td, fname)) for fname in names)
        assert set(found_rels) == set(
            Link.wrap(os.path.join(td, 'dir%d' % num)) for num in (1, 2))

        # recursive crawling, single vs multi-threaded
        # NOTE(review): `caching` is not used inside the loop body; each
        # thread count is simply crawled twice - confirm this is intended.
        for caching in (False, True):
            for threads in (1, 2, 3):
                crawler = Crawler(threads=threads)
                crawled = crawler.crawl([td], follow_links=True)
                expected = set(
                    Link.wrap(os.path.join(td, fname)) for fname in names)
                expected = expected | set(
                    Link.wrap(os.path.join(td, 'dir1', fname)) for fname in names)
                expected = expected | set(
                    Link.wrap(os.path.join(td, 'dir2', fname)) for fname in names)
                assert set(crawled) == expected
def test_link_wrapping():
    """Check Link.wrap and Link.wrap_iterable on strings, Links and invalid input."""
    # Wrapping a string gives a Link with that url.
    from_string = Link.wrap('https://www.google.com')
    assert from_string.url == 'https://www.google.com'

    # Wrapping a Link a second time leaves the url unchanged.
    from_link = Link.wrap(Link.wrap('https://www.google.com'))
    assert from_link.url == 'https://www.google.com'

    # Integers are neither strings nor Links and must be rejected.
    with pytest.raises(ValueError):
        Link.wrap(1234)
    with pytest.raises(ValueError):
        Link.wrap_iterable(1234)

    # A lone string becomes a one-element collection.
    lone = Link.wrap_iterable('https://www.google.com')
    assert len(lone) == 1
    assert lone[0].url == 'https://www.google.com'

    # Strings and Links may be mixed in one iterable.
    mixed_input = ['https://www.google.com', Link('http://www.google.com')]
    mixed = Link.wrap_iterable(mixed_input)
    assert set(mixed) == set([
        Link('http://www.google.com'),
        Link('https://www.google.com'),
    ])
def test_link_wrapping():
    """Exercise Link.wrap / Link.wrap_iterable with valid and invalid arguments."""
    # String input.
    link = Link.wrap('https://www.google.com')
    assert link.url == 'https://www.google.com'

    # Link input (wrapping something already wrapped).
    inner = Link.wrap('https://www.google.com')
    link = Link.wrap(inner)
    assert link.url == 'https://www.google.com'

    # Invalid input raises ValueError from both helpers.
    with pytest.raises(ValueError):
        Link.wrap(1234)

    with pytest.raises(ValueError):
        Link.wrap_iterable(1234)

    # A bare string is treated as a single link.
    links = Link.wrap_iterable('https://www.google.com')
    assert len(links) == 1
    first = links[0]
    assert first.url == 'https://www.google.com'

    # A list mixing strings and Links yields one Link per element.
    links = Link.wrap_iterable(
        ['https://www.google.com', Link('http://www.google.com')])
    expected = set([
        Link('http://www.google.com'),
        Link('https://www.google.com'),
    ])
    assert set(links) == expected
def from_href(cls, href, **kw):
    """Build a Package from a url, consulting the href cache first.

    Each type in ``cls._REGISTRY`` is tried in order; the first one that
    constructs without raising its ``InvalidPackage`` is used and stored
    in ``cls._HREF_TO_PACKAGE_CACHE`` under ``href``.

    :param href: The url to parse
    :type href: string
    :param kw: Extra keyword arguments forwarded to each package type.
    :returns: A Package object if a valid concrete implementation exists,
      otherwise None.
    """
    # NOTE(review): the cache is keyed on href alone, so a cached package is
    # returned even when called with different **kw - confirm this is intended.
    cache = cls._HREF_TO_PACKAGE_CACHE
    cached = cache.get(href)
    if cached is not None:
        return cached

    wrapped = Link.wrap(href)
    for candidate_type in cls._REGISTRY:
        try:
            built = candidate_type(wrapped.url, **kw)
        except candidate_type.InvalidPackage:
            # This implementation cannot handle the url; try the next one.
            continue
        if built is not None:
            cache.store(href, built)
        return built

    # No registered implementation accepted the url.
    return None
def test_stream_filelike_with_incorrect_md5():
    """Reading a StreamFilelike whose url carries a wrong md5 must raise Context.Error."""
    # 32 'f' characters: a well-formed digest that does not match BLOB.
    wrong_md5 = 'f' * 32

    with make_url(BLOB, wrong_md5) as url:
        response = requests.get(url)
        stream = StreamFilelike(response, Link.wrap(url))
        with pytest.raises(Context.Error):
            stream.read()
def test_stream_filelike_without_md5():
    """Reading a StreamFilelike for a url made without an md5 returns the blob."""
    with make_url(BLOB) as url:
        response = requests.get(url)
        link = Link.wrap(url)
        stream = StreamFilelike(response, link)
        assert stream.read() == BLOB
def test_stream_filelike_with_incorrect_md5():
    """A StreamFilelike read must fail with Context.Error when the md5 is wrong."""
    # 'f' * 32 is passed as the md5 instead of the real digest of BLOB.
    with make_url(BLOB, 'f' * 32) as bad_url:
        http_response = requests.get(bad_url)
        bad_link = Link.wrap(bad_url)
        filelike = StreamFilelike(http_response, bad_link)

        with pytest.raises(Context.Error):
            filelike.read()
def test_stream_filelike_without_md5():
    """A StreamFilelike over a url made without an md5 must yield BLOB on read."""
    with make_url(BLOB) as plain_url:
        http_response = requests.get(plain_url)
        filelike = StreamFilelike(http_response, Link.wrap(plain_url))
        contents = filelike.read()
        assert contents == BLOB