def process_filtered_content(self, data, file_extension, mode):
    """
    Process a downloaded asset and upload its content.

    In update mode the asset is always uploaded. In any other mode it is
    uploaded only when check_asset_size() accepts its size for the given
    extension; a rejected asset is logged and is not marked as visited.

    @param data: http response dictionary; the 'content', 'url' and
        'headers' entries are read here
    @param file_extension: asset extension
    @param mode: crawler mode; CrawlerMode.UPDATE skips the size check
    """
    asset = data['content']

    if mode != CrawlerMode.UPDATE:
        # Content-Length is optional in HTTP (e.g. chunked responses), so a
        # missing header must not abort the crawl. Fall back to the size of
        # the body that was actually received, kept as a string like a real
        # header value.
        # NOTE(review): assumes data['content'] is a sized bytes-like body
        # and that check_asset_size accepts a numeric string -- confirm.
        declared_size = data['headers'].get('Content-Length')
        if declared_size is None:
            declared_size = str(len(asset))

        if not self.check_asset_size(declared_size, file_extension):
            logging.info(f' Asset {data["url"]} large then available')
            return

    # Single upload path shared by update mode and size-approved assets.
    self.upload_asset(asset, Url(data['url']))
    self.visited.add(data['url'])
def process_html_content(self, data, mode):
    """
    Handle an HTML response: decode it, queue its links, store the page.

    Links are extracted and queued only in download mode. The page itself
    is uploaded and its URL is added to the visited set in every mode.

    @param data: http response dictionary; the 'content', 'encoding' and
        'url' entries are read here
    @param mode: crawler mode the page is processed in
    """
    raw_body, charset = data['content'], data['encoding']
    html = raw_body.decode(charset)

    if mode == CrawlerMode.DOWNLOAD:
        found_links = self.PageParser.get_filtered_links(html)
        self.add_parsed_links_in_queue(found_links)

    page_url = data['url']
    self.upload_page(html, Url(page_url))
    self.visited.add(page_url)
def __init__(self, str_url, folder, depth, max_threads, filters,
             state_handler):
    """
    Set up the crawler state and the helper objects it works with.

    @param str_url: string URL; wrapped in Url and kept as general_url
    @param folder: folder for saving downloaded pages
    @param depth: maximal depth
    @param max_threads: maximum number of threads
    @param filters: filters, stored unchanged on the instance
    @param state_handler: program state handler
    """
    start_url = Url(str_url)
    seen_urls = set()

    # Settings taken from the caller.
    self.general_url = start_url
    self.folder = folder
    self.filters = filters
    self.max_depth = depth
    self.workers = max_threads
    self.CHUNK_SIZE = 1024

    # Crawl progress, empty at start.
    self.current_depth = 0
    self.visited = seen_urls
    self.url_queue = []

    # Helper objects. The page parser receives the very same set object
    # that is stored as self.visited.
    self.StateHandler = state_handler
    self.PageParser = PageParser(start_url, seen_urls)
    self.RobotsHandler = robotparser.RobotFileParser()
    self.HTTPClient = HTTPClient(2)
    self.FileSystemHandler = FileSystemHandler()
"""@package test_page
Documentation for test_page module.

Module responsible for testing util modules.
"""
import os
import unittest
from urllib.parse import urljoin

from modules.PageParser import PageParser
from modules.TerminalParser import TerminalParser
from modules.Url import Url

ROOT_LINK = "https://www.test.com"
pageparser = PageParser(Url(ROOT_LINK), set())
PATH = os.path.dirname(__file__)


class PageTest(unittest.TestCase):
    """
    unit tests for TerminalParser URL checking and Url path attributes
    """

    def test_terminal_parser_weburl(self):
        """Each malformed URL must make verify_wed_url raise ValueError."""
        parser = TerminalParser()
        malformed_urls = (
            'htt://test.org',
            'test.org',
            'https:/test.org',
            'htt://test.org',
        )
        for candidate in malformed_urls:
            # The keyword stays "i" so subtest labels in reports are unchanged.
            with self.subTest(i=candidate):
                with self.assertRaises(ValueError):
                    parser.verify_wed_url(candidate)

    def test_simple_url(self):
        """A URL without a path is expected to report index.html as filename."""
        parsed = Url('https://anytask.org')
        self.assertEqual(parsed.filename, 'index.html')

    def test_complex_url(self):
        """A nested path ending in a slash yields dirname without that slash."""
        parsed = Url(
            'https://tproger.ru/translations/regular-expression-python/')
        self.assertEqual(parsed.dirname,
                         '/translations/regular-expression-python')

    def test_img_url(self):
        """An image URL reports the image file name as filename."""
        parsed = Url(
            'https://s3.tproger.ru/uploads/2020/07/xsolla-50x50.png')
        self.assertEqual(parsed.filename, 'xsolla-50x50.png')