def testDownloading_css_03(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = 'testdata/webpage/Пример 3/' exampleHtmlPath = os.path.join(examplePath, 'пример 3.html') downloader.start(self._path2url(exampleHtmlPath), controller) downloadDir = os.path.join(self._tempDir, self._staticDirName) fname1 = os.path.join(self._tempDir, self._staticDirName, 'fname1.css') fname2 = os.path.join(self._tempDir, self._staticDirName, 'fname2.css') fname3 = os.path.join(self._tempDir, self._staticDirName, 'fname3.css') fname4 = os.path.join(self._tempDir, self._staticDirName, 'fname4.css') fname5 = os.path.join(self._tempDir, self._staticDirName, 'fname1_1.css') self.assertTrue(os.path.exists(downloadDir)) self.assertTrue(os.path.exists(fname1)) self.assertTrue(os.path.exists(fname2)) self.assertTrue(os.path.exists(fname3)) self.assertTrue(os.path.exists(fname4)) self.assertTrue(os.path.exists(fname5))
def testDownloading_favicon(self): from webpage.downloader import Downloader, DownloadController template = 'href="{path}"' downloadDir = os.path.join(self._tempDir, self._staticDirName) controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = 'testdata/webpage/example_favicon/' exampleHtmlPath = os.path.join(examplePath, 'example.html') downloader.start(self._path2url(exampleHtmlPath), controller) fname_1 = os.path.join(self._tempDir, self._staticDirName, 'favicon_1.png') fname_2 = os.path.join(self._tempDir, self._staticDirName, 'favicon_2.png') self.assertTrue(os.path.exists(downloadDir)) self.assertTrue(os.path.exists(fname_1)) self.assertTrue(os.path.exists(fname_2)) self.assertIn( template.format(path=self._staticDirName + '/favicon_1.png'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + '/favicon_2.png'), downloader.contentResult)
def testDownloading_img_02(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example2/' exampleHtmlPath = os.path.join(examplePath, u'example2.html') downloader.start(self._path2url(exampleHtmlPath), controller) downloadDir = os.path.join(self._tempDir, self._staticDirName) fname1 = os.path.join(self._tempDir, self._staticDirName, u'image_01.png') fname2 = os.path.join(self._tempDir, self._staticDirName, u'image_02.png') fname3 = os.path.join(self._tempDir, self._staticDirName, u'image_03.png') self.assertTrue(os.path.exists(downloadDir)) self.assertTrue(os.path.exists(fname1)) self.assertTrue(os.path.exists(fname2)) self.assertTrue(os.path.exists(fname3))
def testDownloading_javascript_01(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example1/' exampleHtmlPath = os.path.join(examplePath, u'example1.html') downloader.start(self._path2url(exampleHtmlPath), controller) downloadDir = os.path.join(self._tempDir, self._staticDirName) fname1 = os.path.join(self._tempDir, self._staticDirName, u'fname1.js') fname2 = os.path.join(self._tempDir, self._staticDirName, u'fname2.js') fname3 = os.path.join(self._tempDir, self._staticDirName, u'fname3.js') fname4 = os.path.join(self._tempDir, self._staticDirName, u'fname4.js') self.assertTrue(os.path.exists(downloadDir)) self.assertTrue(os.path.exists(fname1)) self.assertTrue(os.path.exists(fname2)) self.assertTrue(os.path.exists(fname3)) self.assertTrue(os.path.exists(fname4))
def testDownloading_css_import_01(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example1/' exampleHtmlPath = os.path.join(examplePath, u'example1.html') downloader.start(self._path2url(exampleHtmlPath), controller) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'import1.css'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'import2.css'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'import3.css'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'import4.css'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'basic2.css'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'basic3.css'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'basic4.css'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'basic5.css'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'basic5_1.css'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'basic6.css')))
def testDownloading_beautifulsoup(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() url = 'http://www.crummy.com/software/BeautifulSoup/bs4/doc/' downloader.start(url, controller) self.assertTrue(downloader.success) downloadDir = os.path.join(self._tempDir, self._staticDirName) self.assertTrue(os.path.exists(downloadDir)) self.assertTrue( os.path.join(self._tempDir, self._staticDirName, 'default.css')) self.assertTrue( os.path.join(self._tempDir, self._staticDirName, 'pygments.css')) self.assertTrue( os.path.join(self._tempDir, self._staticDirName, 'jquery.js')) self.assertTrue( os.path.join(self._tempDir, self._staticDirName, 'underscore.js')) self.assertTrue( os.path.join(self._tempDir, self._staticDirName, 'doctools.js'))
def testContentScriptExample1(self): from webpage.downloader import Downloader, DownloadController template = '<script src="{path}"' controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = 'testdata/webpage/example1/' exampleHtmlPath = os.path.join(examplePath, 'example1.html') downloader.start(self._path2url(exampleHtmlPath), controller) self.assertIn(template.format(path=self._staticDirName + '/fname1.js'), downloader.contentResult) self.assertIn(template.format(path=self._staticDirName + '/fname2.js'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + '/fname2_1.js'), downloader.contentResult) self.assertIn(template.format(path=self._staticDirName + '/fname3.js'), downloader.contentResult) self.assertIn(template.format(path=self._staticDirName + '/fname4.js'), downloader.contentResult) self.assertNotIn( template.format(path=self._staticDirName + '/fname1_1.js'), downloader.contentResult)
def run(self): controller = WebPageDownloadController(self._runEvent, self._downloadDir, STATIC_DIR_NAME, self._parentWnd, self._timeout) downloader = Downloader(self._timeout) self._log(_('Start downloading\n')) try: downloader.start(self._url, controller) except urllib.error.URLError as error: self._error(_('Download error: {}\n').format(str(error.reason))) except (IOError, ValueError) as e: self._error(_('Invalid URL or file format\n')) self._error(str(e)) else: self._log(_('Finish downloading\n')) content = downloader.contentResult staticPath = os.path.join(self._downloadDir, STATIC_DIR_NAME) title = downloader.pageTitle favicon = self._prepareFavicon(downloader.favicon) finishEvent = webpage.events.FinishDownloadEvent( content=content, staticPath=staticPath, title=title, favicon=favicon, url=self._url) wx.PostEvent(self._parentWnd, finishEvent)
def testContentExample2(self): from webpage.downloader import Downloader, DownloadController template = u'<img src="{path}"' controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example2/' exampleHtmlPath = os.path.join(examplePath, u'example2.html') downloader.start(self._path2url(exampleHtmlPath), controller) self.assertIn( template.format(path=self._staticDirName + u'/image_01.png'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + u'/image_01_1.png'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + u'/image_02.png'), downloader.contentResult) self.assertNotIn( template.format(path=self._staticDirName + u'/image_02_1.png'), downloader.contentResult)
def testDownloading_img_srcset_files(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = 'testdata/webpage/example3/' exampleHtmlPath = os.path.join(examplePath, 'example3.html') downloader.start(self._path2url(exampleHtmlPath), controller) downloadDir = os.path.join(self._tempDir, self._staticDirName) fname1 = os.path.join(self._tempDir, self._staticDirName, 'image_01.png') fname2 = os.path.join(self._tempDir, self._staticDirName, 'image_02.png') fname3 = os.path.join(self._tempDir, self._staticDirName, 'image_03.png') fname4 = os.path.join(self._tempDir, self._staticDirName, 'image_04.png') self.assertTrue(os.path.exists(downloadDir)) self.assertTrue(os.path.exists(fname1)) self.assertTrue(os.path.exists(fname2)) self.assertTrue(os.path.exists(fname3)) self.assertTrue(os.path.exists(fname4))
def testDownloading_stackoverflow_2(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() url = 'https://ru.stackoverflow.com/questions/241337/Как-обработать-кириллические-символы-в-urllib-request-urlopen' downloader.start(url, controller) self.assertTrue(downloader.success)
def testDownloading_stackoverflow_01(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() url = 'http://ru.stackoverflow.com/questions/476918/django-%D0%97%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B5-%D0%B2-%D0%B7%D0%B0%D0%B2%D0%B8%D1%81%D0%B8%D0%BC%D0%BE%D1%81%D1%82%D0%B8-%D0%BE%D1%82-%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B9-%D0%B2-%D0%91%D0%94' downloader.start(url, controller) self.assertTrue(downloader.success)
def testDownloading_toster(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() url = 'https://toster.ru/q/273244' downloader.start(url, controller) self.assertTrue(downloader.success)
def testNoTitle(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example_no_title/' exampleHtmlPath = os.path.join(examplePath, u'example_no_title.html') downloader.start(self._path2url(exampleHtmlPath), controller) self.assertTrue(downloader.success) self.assertIsNone(downloader.pageTitle)
def testTitleExample1(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example1/' exampleHtmlPath = os.path.join(examplePath, u'example1.html') downloader.start(self._path2url(exampleHtmlPath), controller) self.assertTrue(downloader.success) self.assertEqual(downloader.pageTitle, u'Заголовок страницы')
def testDownloading_img_srcset_content(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = '../test/webpage/example3/' exampleHtmlPath = os.path.join(examplePath, 'example3.html') downloader.start(self._path2url(exampleHtmlPath), controller) downloadDir = os.path.join(self._tempDir, self._staticDirName) content = downloader.contentResult sample = 'srcset="{path}/image_02.png 2x, {path}/image_03.png w600, {path}/image_04.png"'.format(path=self._staticDirName) self.assertIn(sample, content)
def testDownloading_favicon_03(self): from webpage.downloader import Downloader, DownloadController downloadDir = os.path.join(self._tempDir, self._staticDirName) controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = 'testdata/webpage/example_favicon_03/' exampleHtmlPath = os.path.join(examplePath, 'example.html') downloader.start(self._path2url(exampleHtmlPath), controller) favicon_fname = os.path.join( self._tempDir, self._staticDirName, 'favicon.ico') self.assertTrue(os.path.exists(downloadDir)) self.assertEqual(controller.favicon, os.path.join(self._tempDir, self._staticDirName) + '/favicon.ico') self.assertTrue(os.path.exists(favicon_fname))
def testDownloading_css_back_img_01(self): from webpage.downloader import Downloader, DownloadController controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example1/' exampleHtmlPath = os.path.join(examplePath, u'example1.html') downloader.start(self._path2url(exampleHtmlPath), controller) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'back_img_01.png'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'back_img_02.png'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'back_img_03.png'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'back_img_04.png'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'back_img_05.png'))) self.assertTrue( os.path.exists( os.path.join(self._tempDir, self._staticDirName, u'back_img_06.png')))
def testContentImgExample1(self): from webpage.downloader import Downloader, DownloadController template = '<img src="{path}"' controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = 'testdata/webpage/example1/' exampleHtmlPath = os.path.join(examplePath, 'example1.html') downloader.start(self._path2url(exampleHtmlPath), controller) self.assertIn( template.format(path=self._staticDirName + '/image_01.png'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + '/картинка.png'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + '/image_01_1.png'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + '/image_02.png'), downloader.contentResult) self.assertNotIn( template.format(path=self._staticDirName + '/image_02_1.png'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + '/image_03.png'), downloader.contentResult) self.assertNotIn( template.format(path=self._staticDirName + '/image_03_1.png'), downloader.contentResult)
def testDownloading_css_url_02(self): from webpage.downloader import Downloader, DownloadController template = u'url("{url}")' controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example1/' exampleHtmlPath = os.path.join(examplePath, u'example1.html') downloader.start(self._path2url(exampleHtmlPath), controller) fname2_text = readTextFile( os.path.join(self._tempDir, self._staticDirName, u'fname2.css')) self.assertIn(template.format(url=u'basic2.css'), fname2_text) self.assertIn(template.format(url=u'basic4.css'), fname2_text) self.assertIn(template.format(url=u'basic5.css'), fname2_text) self.assertIn(template.format(url=u'basic6.css'), fname2_text) self.assertIn('basic3.css', fname2_text) self.assertIn('basic5.css', fname2_text)
def testDownloading_css_rename(self): from webpage.downloader import Downloader, DownloadController template = 'href="{path}"' downloadDir = os.path.join(self._tempDir, self._staticDirName) controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = '../test/webpage/example_css_rename/' exampleHtmlPath = os.path.join(examplePath, 'example.html') downloader.start(self._path2url(exampleHtmlPath), controller) fname = os.path.join(self._tempDir, self._staticDirName, 'style.php.css') self.assertTrue(os.path.exists(downloadDir)) self.assertTrue(os.path.exists(fname)) self.assertIn( template.format(path=self._staticDirName + '/style.php.css'), downloader.contentResult)
def testDownloading_img_urlquote(self): from webpage.downloader import Downloader, DownloadController template = '<img src="{path}"' controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = 'testdata/webpage/example_urlquote/' exampleHtmlPath = os.path.join(examplePath, 'example_urlquote.html') downloader.start(self._path2url(exampleHtmlPath), controller) downloadDir = os.path.join(self._tempDir, self._staticDirName) fname = os.path.join(self._tempDir, self._staticDirName, 'рисунок.png') self.assertTrue(os.path.exists(downloadDir)) self.assertTrue(os.path.exists(fname)) self.assertIn( template.format(path=self._staticDirName + '/рисунок.png'), downloader.contentResult)
def testDownloading_css_url_01(self): from webpage.downloader import Downloader, DownloadController template = u'url("{url}")' controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example1/' exampleHtmlPath = os.path.join(examplePath, u'example1.html') downloader.start(self._path2url(exampleHtmlPath), controller) fname1_text = readTextFile( os.path.join(self._tempDir, self._staticDirName, u'fname1.css')) self.assertIn(template.format(url=u'import1.css'), fname1_text) self.assertIn(template.format(url=u'back_img_01.png'), fname1_text) self.assertIn(template.format(url=u'back_img_02.png'), fname1_text) self.assertIn(template.format(url=u'back_img_03.png'), fname1_text) self.assertIn(template.format(url=u'back_img_04.png'), fname1_text) self.assertIn(template.format(url=u'back_img_05.png'), fname1_text) self.assertIn(template.format(url=u'back_img_06.png'), fname1_text)
def testContentCSSExample1_01(self): from webpage.downloader import Downloader, DownloadController template = u'<link href="{path}"' controller = DownloadController(self._tempDir, self._staticDirName) downloader = Downloader() examplePath = u'../test/webpage/example1/' exampleHtmlPath = os.path.join(examplePath, u'example1.html') downloader.start(self._path2url(exampleHtmlPath), controller) self.assertIn( template.format(path=self._staticDirName + u'/fname1.css'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + u'/fname2.css'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + u'/fname3.css'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + u'/fname4.css'), downloader.contentResult) self.assertIn( template.format(path=self._staticDirName + u'/fname1_1.css'), downloader.contentResult) self.assertNotIn( template.format(path=self._staticDirName + u'/fname2_1.css'), downloader.contentResult)