Example #1
    def test_links_priority(self, server):
        # Download links from the pypi simple index should be used before
        # external download links.
        # http://bitbucket.org/tarek/distribute/issue/163/md5-validation-error
        #
        # Use case:
        # - someone uploads a package on pypi, an md5 is generated
        # - someone manually copies this link (with the md5 in the url) onto
        #   an external page accessible from the package page.
        # - someone reuploads the package (with a different md5)
        # - while easy_installing, an MD5 error occurs because the external
        #   link is used
        # -> The index should use the link from pypi, not the external one.

        # start an index server
        index_url = server.full_address + '/simple/'

        # scan a test index
        crawler = Crawler(index_url, follow_externals=True)
        releases = crawler.get_releases("foobar")
        server.stop()

        # we have only one link, because links are compared without md5
        self.assertEqual(1, len(releases))
        self.assertEqual(1, len(releases[0].dists))
        # the link should be from the index
        self.assertEqual(2, len(releases[0].dists['sdist'].urls))
        self.assertEqual('12345678901234567',
                         releases[0].dists['sdist'].url['hashval'])
        self.assertEqual('md5', releases[0].dists['sdist'].url['hashname'])
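
The same calls can be strung together outside the test harness. A minimal standalone sketch, assuming the distutils2/packaging layout where Crawler is importable from distutils2.pypi.simple; the index URL and project name below are placeholders, not taken from the tests:

# Query a simple index and inspect the hash advertised for the sdist link,
# mirroring the calls made in Example #1.
from distutils2.pypi.simple import Crawler

index_url = "http://localhost:8000/simple/"   # placeholder index
crawler = Crawler(index_url, follow_externals=True)

releases = crawler.get_releases("foobar")     # placeholder project name
if releases:
    sdist = releases[0].dists['sdist']
    # the url dict carries the hash name and value published by the index
    print(sdist.url['hashname'], sdist.url['hashval'])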
Example #2
    def test_bad_urls(self, server):
        crawler = Crawler()
        url = 'http://127.0.0.1:0/nonesuch/test_simple'
        try:
            v = crawler._open_url(url)
        except Exception as v:
            self.assertIn(url, str(v))
Example #3
    def test_get_link_matcher(self):
        crawler = Crawler("http://example.org")
        self.assertEqual(
            '_simple_link_matcher',
            crawler._get_link_matcher("http://example.org/some/file").__name__)
        self.assertEqual(
            '_default_link_matcher',
            crawler._get_link_matcher("http://other-url").__name__)
Example #4
    def test_browse_local_files(self):
        # Test that we can browse local files
        index_url = "file://" + PYPI_DEFAULT_STATIC_PATH
        if sys.platform == 'win32':
            # under windows the correct syntax is:
            #   file:///C|\the\path\here
            # instead of
            #   file://C:\the\path\here
            fix = re.compile(r'^(file://)([A-Za-z])(:)')
            index_url = fix.sub('\\1/\\2|', index_url)

        index_path = os.sep.join([index_url, "test_found_links", "simple"])
        crawler = Crawler(index_path)
        dists = crawler.get_releases("foobar")
        self.assertEqual(4, len(dists))
Example #5
    def test_default_link_matcher(self):
        crawler = Crawler("http://example.org", mirrors=[])
        crawler.follow_externals = True
        crawler._is_browsable = lambda *args: True
        base_url = "http://example.org/some/file/"
        content = """
<a href="../homepage" rel="homepage">link</a>
<a href="../download" rel="download">link2</a>
<a href="../simpleurl">link2</a>
        """
        found_links = set(
            uri for uri, _ in crawler._default_link_matcher(content, base_url))
        self.assertIn('http://example.org/some/homepage', found_links)
        self.assertIn('http://example.org/some/simpleurl', found_links)
        self.assertIn('http://example.org/some/download', found_links)
Example #6
    def test_uses_mirrors(self):
        # When the main repository seems down, try using the given mirrors
        server = PyPIServer("foo_bar_baz")
        mirror = PyPIServer("foo_bar_baz")
        mirror.start()  # we don't start the main server here, only the mirror

        try:
            # create the index using both servers
            crawler = Crawler(server.full_address + "/simple/", hosts=('*',),
                              # set the timeout to 1s for the tests
                              timeout=1, mirrors=[mirror.full_address])

            # this should not raise a timeout
            self.assertEqual(4, len(crawler.get_releases("foo")))
        finally:
            mirror.stop()
            server.stop()
Example #7
    def _get_simple_crawler(self,
                            server,
                            base_url="/simple/",
                            hosts=None,
                            *args,
                            **kwargs):
        """Build and return a Crawler pointed at the test server's URLs."""
        if hosts is None:
            hosts = (server.full_address.replace("http://", ""), )
        kwargs['hosts'] = hosts
        return Crawler(server.full_address + base_url, *args, **kwargs)
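
A hypothetical caller for this helper, following the pattern of the other tests in this listing; the test name is invented here, and the fixture name and expected count are borrowed from Example #4:

    @use_pypi_server("test_found_links")
    def test_lists_releases(self, server):
        # Hypothetical usage of the helper above: build a crawler pointed at
        # the test server's /simple/ index and count the releases it exposes.
        crawler = self._get_simple_crawler(server)
        releases = crawler.get_releases("foobar")
        self.assertEqual(4, len(releases))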
Example #8
    def test_simple_link_matcher(self):
        # Test that the simple link matcher finds the right links
        crawler = Crawler(follow_externals=False)

        # Here, we define:
        #   1. one link that must be followed, because it's a download link
        #   2. one link that must *not* be followed, because _is_browsable
        #      returns False for it
        #   3. one link that must be followed, because it's a homepage that is
        #      browsable
        #   4. one link that must be followed, because it contains an md5 hash
        self.assertTrue(crawler._is_browsable("%stest" % crawler.index_url))
        self.assertFalse(crawler._is_browsable("http://dl-link2"))
        content = """
        <a href="http://dl-link1" rel="download">download_link1</a>
        <a href="http://dl-link2" rel="homepage">homepage_link1</a>
        <a href="%(index_url)stest" rel="homepage">homepage_link2</a>
        <a href="%(index_url)stest/foobar-1.tar.gz#md5=abcdef>download_link2</a>
        """ % {
            'index_url': crawler.index_url
        }

        # Test that the simple link matcher yields the right links.
        generator = crawler._simple_link_matcher(content, crawler.index_url)
        self.assertEqual(
            ('%stest/foobar-1.tar.gz#md5=abcdef' % crawler.index_url, True),
            next(generator))
        self.assertEqual(('http://dl-link1', True), next(generator))
        self.assertEqual(('%stest' % crawler.index_url, False),
                         next(generator))
        self.assertRaises(StopIteration, generator.__next__)

        # Following external links is possible (e.g. homepages)
        crawler.follow_externals = True
        generator = crawler._simple_link_matcher(content, crawler.index_url)
        self.assertEqual(
            ('%stest/foobar-1.tar.gz#md5=abcdef' % crawler.index_url, True),
            next(generator))
        self.assertEqual(('http://dl-link1', True), next(generator))
        self.assertEqual(('http://dl-link2', False), next(generator))
        self.assertEqual(('%stest' % crawler.index_url, False),
                         next(generator))
        self.assertRaises(StopIteration, generator.__next__)
Example #9
    def test_simple_link_matcher(self):
        # Test that the simple link matcher finds the right links
        crawler = Crawler(follow_externals=False)

        # Here, we define:
        #   1. one link that must be followed, because it's a download link
        #   2. one link that must *not* be followed, because _is_browsable
        #      returns False for it
        #   3. one link that must be followed, because it's a homepage that is
        #      browsable
        #   4. one link that must be followed, because it contains an md5 hash
        self.assertTrue(crawler._is_browsable("%stest" % crawler.index_url))
        self.assertFalse(crawler._is_browsable("http://dl-link2"))
        content = """
        <a href="http://dl-link1" rel="download">download_link1</a>
        <a href="http://dl-link2" rel="homepage">homepage_link1</a>
        <a href="%(index_url)stest" rel="homepage">homepage_link2</a>
        <a href="%(index_url)stest/foobar-1.tar.gz#md5=abcdef>download_link2</a>
        """ % {'index_url': crawler.index_url}

        # Test that the simple link matcher yields the right links.
        generator = crawler._simple_link_matcher(content, crawler.index_url)
        self.assertEqual(('%stest/foobar-1.tar.gz#md5=abcdef' %
                          crawler.index_url, True), generator.next())
        self.assertEqual(('http://dl-link1', True), generator.next())
        self.assertEqual(('%stest' % crawler.index_url, False),
                         generator.next())
        self.assertRaises(StopIteration, generator.next)

        # Following external links is possible (e.g. homepages)
        crawler.follow_externals = True
        generator = crawler._simple_link_matcher(content, crawler.index_url)
        self.assertEqual(('%stest/foobar-1.tar.gz#md5=abcdef' %
                          crawler.index_url, True), generator.next())
        self.assertEqual(('http://dl-link1', True), generator.next())
        self.assertEqual(('http://dl-link2', False), generator.next())
        self.assertEqual(('%stest' % crawler.index_url, False),
                         generator.next())
        self.assertRaises(StopIteration, generator.next)
Example #10
    def test_is_browsable(self):
        crawler = Crawler(follow_externals=False)
        self.assertTrue(crawler._is_browsable(crawler.index_url + "test"))

        # Now, when following externals, we can provide a list of hosts to
        # trust, and external links outside those hosts are not followed.
        crawler = Crawler(hosts=["pypi.python.org", "example.org"],
                          follow_externals=True)
        good_urls = (
            "http://pypi.python.org/foo/bar",
            "http://pypi.python.org/simple/foobar",
            "http://example.org",
            "http://example.org/",
            "http://example.org/simple/",
        )
        bad_urls = (
            "http://python.org",
            "http://example.tld",
        )

        for url in good_urls:
            self.assertTrue(crawler._is_browsable(url))

        for url in bad_urls:
            self.assertFalse(crawler._is_browsable(url))

        # allow all hosts
        crawler = Crawler(follow_externals=True, hosts=("*",))
        self.assertTrue(crawler._is_browsable("http://an-external.link/path"))
        self.assertTrue(crawler._is_browsable("pypi.example.org/a/path"))

        # specify a list of hosts we want to allow
        crawler = Crawler(follow_externals=True,
                          hosts=("*.example.org",))
        self.assertFalse(crawler._is_browsable("http://an-external.link/path"))
        self.assertTrue(
            crawler._is_browsable("http://pypi.example.org/a/path"))
Example #11
    def test_bad_urls(self, server):
        crawler = Crawler()
        url = 'http://127.0.0.1:0/nonesuch/test_simple'
        try:
            v = crawler._open_url(url)
        except Exception as v:
            self.assertIn(url, str(v))
        else:
            v.close()
            self.assertIsInstance(v, urllib.error.HTTPError)

        # issue 16
        # easy_install inquant.contentmirror.plone breaks because of a typo
        # in its home URL
        crawler = Crawler(hosts=('example.org', ))
        url = ('url:%20https://svn.plone.org/svn/collective/'
               'inquant.contentmirror.plone/trunk')
        try:
            v = crawler._open_url(url)
        except Exception as v:
            self.assertIn(url, str(v))
        else:
            v.close()
            self.assertIsInstance(v, urllib.error.HTTPError)

        def _urlopen(*args):
            raise http.client.BadStatusLine('line')

        old_urlopen = urllib.request.urlopen
        urllib.request.urlopen = _urlopen
        url = 'http://example.org'
        try:
            v = crawler._open_url(url)
        except Exception as v:
            self.assertIn('line', str(v))
        else:
            v.close()
            # TODO use self.assertRaises
            raise AssertionError('Should have raised here!')
        finally:
            urllib.request.urlopen = old_urlopen

        # issue 20
        url = 'http://http://svn.pythonpaste.org/Paste/wphp/trunk'
        try:
            crawler._open_url(url)
        except Exception as v:
            if sys.version_info[:3] < (3, 2, 3):  # XXX check versions again
                wanted = 'nonnumeric port'
            else:
                wanted = 'Download error'
            self.assertIn(wanted, str(v))

        # issue #160
        url = server.full_address
        page = ('<a href="http://www.famfamfam.com]('
                'http://www.famfamfam.com/">')
        crawler._process_url(url, page)
Example #12
    @use_pypi_server()
    def test_bad_urls(self, server):
        crawler = Crawler()
        url = 'http://127.0.0.1:0/nonesuch/test_simple'
        try:
            v = crawler._open_url(url)
        except Exception as v:
            self.assertIn(url, str(v))
        else:
            v.close()
            self.assertIsInstance(v, urllib2.HTTPError)

        # issue 16
        # easy_install inquant.contentmirror.plone breaks because of a typo
        # in its home URL
        crawler = Crawler(hosts=('example.org',))
        url = ('url:%20https://svn.plone.org/svn/collective/'
               'inquant.contentmirror.plone/trunk')
        try:
            v = crawler._open_url(url)
        except Exception as v:
            self.assertIn(url, str(v))
        else:
            v.close()
            self.assertIsInstance(v, urllib2.HTTPError)

        def _urlopen(*args):
            raise httplib.BadStatusLine('line')

        old_urlopen = urllib2.urlopen
        urllib2.urlopen = _urlopen