Example #1
 def process_filtered_content(self, data, file_extension, mode):
     """
     Processes asset data and uploads the file content.
     @param data: HTTP response dictionary
     @param file_extension: asset extension
     @param mode: crawler mode
     """
     asset = data['content']
     if (mode == CrawlerMode.UPDATE or self.check_asset_size(
             data['headers']['Content-Length'], file_extension)):
         self.upload_asset(asset, Url(data['url']))
         self.visited.add(data['url'])
         return
     logging.info(f'Asset {data["url"]} exceeds the allowed size')
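The method branches on a CrawlerMode value that the snippet never defines. A minimal sketch of that enum, assuming only the two members referenced here and in Example #2:

from enum import Enum


class CrawlerMode(Enum):
    # Assumed definition: UPDATE is checked above, DOWNLOAD in Example #2;
    # the real project may define more modes.
    DOWNLOAD = 1
    UPDATE = 2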
Example #2
 def process_html_content(self, data, mode):
     """
     Processes HTML data: extracts links and uploads the content.
     @param data: HTTP response dictionary
     @param mode: crawler mode
     """
     content = data['content'].decode(data['encoding'])
     if mode == CrawlerMode.DOWNLOAD:
         self.add_parsed_links_in_queue(
             self.PageParser.get_filtered_links(content))
     self.upload_page(content, Url(data['url']))
     self.visited.add(data['url'])
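For context, a rough sketch of the link extraction that get_filtered_links presumably performs; the real modules.PageParser is not shown, so the class name and logic below are assumptions:

from html.parser import HTMLParser
from urllib.parse import urljoin


class LinkExtractor(HTMLParser):
    # Simplified stand-in for PageParser's link extraction (no filtering).
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Resolve every <a href="..."> against the base URL.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(urljoin(self.base_url, value))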
Example #3
 def __init__(self, str_url, folder, depth, max_threads, filters,
              state_handler):
     """
     Constructor.
     @param str_url: string URL
     @param folder: folder for saving downloaded pages
     @param depth: maximal depth
     @param max_threads: maximum number of threads
     @param filters: link filters
     @param state_handler: program state handler
     """
     self.general_url = Url(str_url)
     self.folder = folder
     self.max_depth = depth
     self.CHUNK_SIZE = 1024
     self.current_depth = 0
     self.workers = max_threads
     self.visited = set()
     self.url_queue = []
     self.filters = filters
     self.PageParser = PageParser(self.general_url, self.visited)
     self.StateHandler = state_handler
     self.RobotsHandler = robotparser.RobotFileParser()
     self.HTTPClient = HTTPClient(2)
     self.FileSystemHandler = FileSystemHandler()
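A hedged usage sketch of this constructor; the Crawler class name, the empty filter list, and the state_handler object are placeholders, since none of them appear in the excerpt:

crawler = Crawler(
    str_url='https://www.test.com',  # start page
    folder='downloads',              # folder for saved pages
    depth=2,                         # maximal crawl depth
    max_threads=4,                   # worker thread count
    filters=[],                      # link filters (type assumed)
    state_handler=state_handler,     # program state handler (assumed object)
)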
Example #4
"""@package test_page
Documentation for test_page module.

Module responsible for testing util modules.
"""
import os
import unittest
from urllib.parse import urljoin

from modules.PageParser import PageParser
from modules.TerminalParser import TerminalParser
from modules.Url import Url

ROOT_LINK = "https://www.test.com"
pageparser = PageParser(Url(ROOT_LINK), set())
PATH = os.path.dirname(__file__)


class PageTest(unittest.TestCase):
    """
    Unit tests for util modules.
    """
    def test_terminal_parser_weburl(self):
        t = TerminalParser()
        urls = ['htt://test.org',
                'test.org',
                'https:/test.org']
        for i in urls:
            with self.subTest(i=i):
                self.assertRaises(ValueError, t.verify_wed_url, i)
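Assuming the file is named test_page.py, as the module docstring suggests, the suite runs with the standard unittest runner:

python -m unittest test_page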
Example #5
 def test_simple_url(self):
     URL = 'https://anytask.org'
     url = Url(URL)
     self.assertEqual(url.filename, 'index.html')
Example #6
 def test_complex_url(self):
     URL = 'https://tproger.ru/translations/regular-expression-python/'
     url = Url(URL)
     self.assertEqual(url.dirname,
                      '/translations/regular-expression-python')
Example #7
 def test_img_url(self):
     URL = 'https://s3.tproger.ru/uploads/2020/07/xsolla-50x50.png'
     url = Url(URL)
     self.assertEqual(url.filename, 'xsolla-50x50.png')
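Examples #5-#7 pin down Url's filename and dirname behavior. A minimal sketch that satisfies all three assertions, assuming nothing about the rest of modules.Url:

import os
from urllib.parse import urlparse


class Url:
    # Minimal sketch: only the attributes the tests above assert on.
    def __init__(self, str_url):
        path = urlparse(str_url).path
        name = os.path.basename(path)
        # A bare host or a trailing slash yields no explicit file name,
        # so fall back to index.html.
        self.filename = name if name else 'index.html'
        self.dirname = os.path.dirname(path)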