Example No. 1
 def test_can_crawl_crawlable_url_additional_rules(self):
     test_filter = URLFilter([
         urlparse('http://www.example1.com').netloc,
         urlparse('http://www.example2.com').netloc
     ], ['/adult-content', 'example2.com/product'], False)
     can_crawl = test_filter.can_crawl(
         'www.example1.com', 'http://www.example1.com/something-good')
     self.assertTrue(can_crawl)
     can_crawl = test_filter.can_crawl(
         'www.example2.com', 'http://www.example2.com/something-good')
     self.assertTrue(can_crawl)
Example No. 2
 def test_can_crawl_do_not_crawl_additional_rules(self):
     test_filter = URLFilter([
         urlparse('http://www.example1.com').netloc,
         urlparse('http://www.example2.com').netloc
     ], ['/adult-content', 'example2.com/product'], False)
     can_crawl = test_filter.can_crawl(
         'www.example1.com', 'http://www.example1.com/adult-content')
     self.assertFalse(can_crawl)
     can_crawl = test_filter.can_crawl(
         'www.example2.com', 'http://www.example2.com/product-101')
     self.assertFalse(can_crawl)
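
Taken together, these two tests suggest that additional_rules acts as a deny list matched against the full URL, on top of the allowed-host check. The sketch below only illustrates semantics consistent with the tests, assuming plain substring matching; it is not the project's actual URLFilter implementation.

class SubstringRuleFilter:
    """Illustrative stand-in for URLFilter's rule handling (assumed semantics)."""

    def __init__(self, allowed_netlocs, additional_rules, follow_robots):
        # follow_robots is accepted but ignored in this sketch.
        self._allowed = set(allowed_netlocs)
        self._rules = additional_rules or []

    def can_crawl(self, netloc, url):
        # Hosts outside the allow list are never crawled.
        if netloc not in self._allowed:
            return False
        # Any matching deny rule blocks the URL (assumed substring match).
        return not any(rule in url for rule in self._rules)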
Example No. 3
 def _set_url_filter(start_url, **kwargs) -> URLFilter:
     custom_filter = kwargs.get('custom_filter')
     if custom_filter and issubclass(custom_filter, URLFilter):
         return custom_filter(start_url, kwargs.get('additional_rules', []),
                              kwargs.get('follow_robots', True),
                              kwargs.get('defragment_urls', True))
     return URLFilter(start_url, kwargs.get('additional_rules', []),
                      kwargs.get('follow_robots', True),
                      kwargs.get('defragment_urls', True))
Example No. 4
 def _set_url_filter(start_url, **kwargs) -> URLFilter:
     custom_filter = kwargs.get("custom_filter")
     if custom_filter and issubclass(custom_filter, URLFilter):
         return custom_filter(
             start_url,
             kwargs.get("additional_rules", []),
             kwargs.get("follow_robots", True),
             kwargs.get("defragment_urls", True),
         )
     return URLFilter(
         start_url,
         kwargs.get("additional_rules", []),
         kwargs.get("follow_robots", True),
         kwargs.get("defragment_urls", True),
     )
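
A short usage sketch for this factory. CustomFilter and the argument values below are illustrative assumptions, not part of the excerpted project; the point is that custom_filter must be a URLFilter subclass for the first branch to be taken.

class CustomFilter(URLFilter):
    """Hypothetical subclass, shown only to exercise the custom_filter hook."""
    pass

url_filter = _set_url_filter(
    'http://www.example.com',        # start_url (illustrative)
    custom_filter=CustomFilter,      # must subclass URLFilter
    additional_rules=['/private'],   # extra deny rules (illustrative)
    follow_robots=True,
    defragment_urls=True,
)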
Example No. 5
def link_extractor(
    response: Response, url_filter: URLFilter, defrag: bool
) -> Tuple[Response, List[str]]:
    """Collect crawlable absolute links from the response HTML."""
    html = response.body
    req_url = response.url
    dom = lh.fromstring(html)
    found_urls = []
    for href in dom.xpath("//a/@href"):
        url = urljoin(str(req_url), href)
        if defrag:
            url = urldefrag(url)[0]
        netloc = urlparse(url).netloc
        can_crawl = url_filter.can_crawl(netloc, url)
        if can_crawl:
            found_urls.append(url)
    return response, found_urls
Example No. 6
def link_extractor(response: ClientResponse, url_filter: URLFilter,
                   defrag: bool) -> Tuple[ClientResponse, List[str]]:
    # Relies on aiohttp private attributes, so the body must already be read.
    html = response._body.decode('utf-8', errors='ignore')
    req_url = response._url
    dom = lh.fromstring(html)
    # Cache the parsed DOM and raw HTML on the response for later consumers.
    response.dom = dom
    response.html = html
    found_urls = []
    for href in dom.xpath('//a/@href'):
        url = urljoin(str(req_url), href)
        if defrag:
            url = urldefrag(url)[0]
        netloc = urlparse(url).netloc
        can_crawl = url_filter.can_crawl(netloc, url)
        if can_crawl and valid_url(url):
            found_urls.append(url)
    return response, found_urls
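
A minimal driver sketch for the aiohttp variant above, assuming the body has been read before parsing; the fetch_links coroutine is illustrative and not part of the excerpted project.

import aiohttp

async def fetch_links(url, url_filter):
    # Illustrative only: fetch a page and hand the response to link_extractor.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            await response.read()  # ensure the body is available to the parser
            return link_extractor(response, url_filter, defrag=True)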
Example No. 7
    def __init__(self,
                 splash_configuration: SplashConfiguration,
                 start_url: Union[List[str], str],
                 max_crawl_size: Union[int, None] = None,
                 timeout=30,
                 user_agent=None,
                 **kwargs):
        self.splash_configuration = splash_configuration
        self._start_url = start_url
        self._client: Union[None, ClientSession] = None
        self._client_timeout: Union[None, ClientTimeout] = None
        self._timeout: Union[int, None] = None

        proxy_manager_cls = kwargs.get('proxy_manager')
        self._proxy_manager: Union[None, AbstractProxyManager] = (
            proxy_manager_cls(**kwargs) if proxy_manager_cls else None)

        custom_filter = kwargs.get('custom_filter')
        if custom_filter and issubclass(custom_filter, URLFilter):
            self._url_filter = custom_filter(
                start_url, kwargs.get('additional_rules', []),
                kwargs.get('follow_robots', True), True)  # 4th arg: defragment_urls
        else:
            self._url_filter = URLFilter(start_url,
                                         kwargs.get('additional_rules', []),
                                         kwargs.get('follow_robots', True))

        self._task_queue = NewJobQueue(max_crawl_size, start_url)

        self._executor = kwargs.get('executor', None)
        logging.basicConfig(level=logging.DEBUG, format='%(message)s')
        self._logger = kwargs.get('logger', logging.getLogger("Scraper"))
        self._client_timeout = self._setup_timeout_rules(timeout)
        self.__remaining_coroutines = 0
        self.__user_agent = user_agent
        self.__creation_semaphore = asyncio.BoundedSemaphore(1)
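
_setup_timeout_rules is not shown in this excerpt. A plausible sketch, assuming it only wraps the plain seconds value in an aiohttp ClientTimeout; the body below is an assumption, not the project's code.

    def _setup_timeout_rules(self, timeout: Union[int, None]) -> ClientTimeout:
        # Assumed behaviour: remember the raw value and build aiohttp's
        # timeout object; total=None disables the overall timeout.
        self._timeout = timeout
        return ClientTimeout(total=timeout)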
Example No. 8
 def test_can_crawl_do_not_crawl(self):
     test_filter = URLFilter([urlparse('http://www.example.com').netloc],
                             None, False)
     can_crawl = test_filter.can_crawl('www.no-crawl.com',
                                       'http://www.no-crawl.com')
     self.assertFalse(can_crawl)
Example No. 9
 def test_can_crawl_crawlable_url(self):
     test_filter = URLFilter([urlparse('http://www.example.com').netloc],
                             None, False)
     can_crawl = test_filter.can_crawl(
         'www.example.com', 'http://example.com/something/something-else')
     self.assertTrue(can_crawl)
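
These test methods are plain unittest code. A minimal runner sketch, assuming they live in a TestCase subclass; the class name TestURLFilter and the import path for URLFilter are assumptions.

import unittest
from urllib.parse import urlparse

from url_filter import URLFilter  # assumed import path

class TestURLFilter(unittest.TestCase):
    # ... the test methods shown in the examples above go here ...
    pass

if __name__ == '__main__':
    unittest.main()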