Exemplo n.º 1
0
    def test_download(self):
        """Download sample urls and verify the content that comes back.

        Each case supplies a url and, under 'expect', a regular expression
        that must be found in the downloaded content. A falsy 'expect'
        means the download is expected to yield no content at all.
        """
        cases = [
            {
                'label': 'valid url',
                'url': 'http://www.example.com',
                'expect': '<title>IANA &mdash; Example domains</title>',
            },
            {
                # Original note: OpenDNS redirects invalid urls to a notice
                # page, so the download will return content.
                # NOTE(review): that note seems at odds with 'expect' being
                # None (which asserts empty content) -- confirm which holds.
                'label': 'invalid url',
                'url': 'http://www.xxxxxxxxxxfakeurlxxxxxxxxxxx.com',
                'expect': None,
            },
        ]

        for case in cases:
            agent = DownloadAgent(url=case['url'])
            agent.download()
            body = agent.content
            expected = case['expect']

            if not expected:
                # Nothing should have been downloaded for this case.
                self.assertTrue(not body)
                continue

            self.assertTrue(body and len(body) > 0)
            # The expected text is applied as a regular expression.
            match = re.search('%s' % expected, body)
            self.assertTrue(match)
Exemplo n.º 2
0
    def get(
        self,
        cookie_jar=None,
        cookie=None,
        cookie_filename=None,
        referrer=None,
        request_headers=None,
        ):
        """Get the content of the webpage.

        Note:
            The downloaded content is stored as-is in the "raw_content"
            property; the result of scrub_content() is stored in the
            "content" property.

            If the download yields no content and the agent reports no
            errors, no exception is raised and the empty content is still
            stored and scrubbed.

        Args:
            cookie_jar: cookielib.CookieJar object instance.
            cookie: cookielib.Cookie object instance.
            cookie_filename: filename used to create a
                DynamicMozillaCookieJar. Only used when cookie_jar is not
                provided.
            referrer: string, Url of referrer. Falls back to self.url when
                not provided.
            request_headers: passed through unchanged to DownloadAgent.

        Raises:
            DownloadError if web page download is unsuccessful
        """

        if not referrer:
            referrer = self.url

        # Compare against None rather than relying on truthiness: CookieJar
        # defines __len__, so an empty jar supplied by the caller is falsy
        # and would otherwise be silently replaced by a new file-backed jar.
        if cookie_jar is None and cookie_filename:
            cookie_jar = DynamicMozillaCookieJar(filename=cookie_filename)
            cookie_jar.create_file()

        agent = DownloadAgent(
            accept_encoding='gzip,deflate',
            cookie_jar=cookie_jar,
            cookie=cookie,
            post_data=self.post_data,
            referrer=referrer,
            request_headers=request_headers,
            retries=self.retries,
            url=self.url,
            )
        agent.download()

        # Only treat empty content as a failure when the agent recorded errors.
        if not agent.content and agent.errors:
            raise DownloadError(messages=agent.errors)

        self.raw_content = agent.content
        # scrub_content() is called after raw_content is set.
        self.content = self.scrub_content()