Example No. 1
    def __init__(
        self,
        base_domain: str,
        domains: set[str],
        base_url: URL,
        directory: str,
        deep: int,
        save: bool,
        pool_count: int,
        disallow_ends: tuple
    ):
        self.scheme = base_url.scheme
        self.base_domain = base_domain
        self.domains = domains
        self.queue = Queue()
        self.crawled = []
        self.working_links = set()
        self.disallow_ends = disallow_ends
        self.directory = directory + "/"
        self.save = save
        self.deep = deep
        self.count = 0

        self.pool = ThreadPool(pool_count)

        self.robots_parser = RobotsTxtParser(base_url.human_repr())
        try:
            self.robots_parser.get_robots_txt()
        except CrawlerError as e:
            logging.warning(f"-RobotsTxT {e.message}")
            self.robots_parser.death = True
        self.crawl_page(base_url.human_repr())
Example No. 2
def test_human_repr_ipv6():
    url = URL("http://[::1]:8080/path")
    s = url.human_repr()
    url2 = URL(s)
    assert url2 == url
    assert url2.host == "::1"
    assert s == "http://[::1]:8080/path"
Example No. 3
    def _req(client: httpx.Client, url: URL, headers: Dict):
        logger.debug(f'request url is {url}')
        res = client.get(url.human_repr(), headers=headers)
        if res.status_code != 200:
            res.raise_for_status()

        return res
Example No. 4
def file_from_url(path: str, url: yarl.URL, *, mode: int = 0o644) -> dict:
    return {
        "path": path,
        "contents": {
            "source": url.human_repr()
        },
        "mode": mode,
        "overwrite": True,
    }
Example No. 5
    def find_links(self, html: str, url: URL) -> set:
        result = set()
        host_name = str(url.origin())
        soup = BeautifulSoup(html, "html.parser")
        for obj in soup.find_all("a", href=True):
            link = obj["href"].lower()
            if link.endswith(self.disallow_ends) or link.startswith("#"):
                continue
            link = URL(link)
            if self.check_domains(link) and link.human_repr().startswith(
                url.scheme
            ):
                result.add(self.get_normal_link(link))
            elif link.scheme == "":
                if not link.human_repr().startswith("/"):
                    result.add(url.human_repr() + link.human_repr())
                else:
                    result.add(f'{host_name}{link.human_repr()}')
        return result
Example No. 6
    def _download_img(self, url: URL) -> bytes:
        if not url.is_absolute():
            url = self.url.join(url)

        logger.info(f'downloading image from [{url}]')

        remote_img = httpx.get(url.human_repr(), headers=header(url))
        if remote_img.status_code != 200:
            remote_img.raise_for_status()

        return remote_img.content
Example No. 7
    def get_links(self, url: URL) -> set:
        site_info = self.get_content(url.human_repr())
        if site_info.status_code != 200:
            logging.warning(
                f"{site_info.status_code} status code in get_links"
                f"({url.human_repr()})"
            )
            return set()
        html = site_info.text
        if self.save:
            path = Path.cwd() / self.directory / url.host
            FileWorker.save_link(path, url, html)
        return self.find_links(html, url)
Example No. 8
    def _make_request(
        self,
        method: HttpMethod,
        url: URL,
        params: Optional[Dict[str, Any]] = None,
        json: Optional[Dict[str, Any]] = None,
        data: Optional[Union[str, bytes, Mapping[Any, Any]]] = None,
        auth: Optional[AuthBase] = None,
        raise_for_status: bool = True,
    ) -> requests.Response:
        response = requests.request(
            method=method.value,
            url=url.human_repr(),
            params=params,
            json=json,
            data=data,
            auth=auth,
            timeout=self._settings.timeout,
        )
        if raise_for_status:
            response.raise_for_status()
        return response
Example No. 9
def test_human_repr():
    url = URL("http://вася:пароль@хост.домен:8080/путь/сюда?арг=вал#фраг")
    s = url.human_repr()
    assert URL(s) == url
    assert s == "http://вася:пароль@хост.домен:8080/путь/сюда?арг=вал#фраг"
Example No. 10
def test_human_repr_default_port():
    url = URL("http://вася:пароль@хост.домен/путь/сюда?арг=вал#фраг")
    s = url.human_repr()
    assert s == "http://вася:пароль@хост.домен/путь/сюда?арг=вал#фраг"
Example No. 11
def test_human_repr_defaults():
    url = URL("путь")
    s = url.human_repr()
    assert s == "путь"
Example No. 12
def test_human_repr_defaults():
    url = URL('путь')
    s = url.human_repr()
    assert s == 'путь'
Example No. 13
def test_human_repr():
    url = URL('http://вася:пароль@хост.домен:8080/путь/сюда?арг=вал#фраг')
    s = url.human_repr()
    assert s == 'http://вася:пароль@хост.домен:8080/путь/сюда?арг=вал#фраг'
Example No. 14
class XMLTVListing(Hashable):  # pylint: disable=too-many-instance-attributes
    """Hold the data and functions required to obtain an XMLTVListing.

    Currently it only supports the XML TV Listings format which holds
    Channel and Programme scheduling information. See the dtd_ for details.

    .. _dtd: https://github.com/AlekSi/xmltv/blob/master/xmltv.dtd

    Note:

        The :class:`~.channel.Channel` class provides the **primary**
        interface to channel data through the :class:`~.epg.EPG` class.

        This class provides the means to download and parse XML data to do with
        channels, but more importantly, programming schedules. While it can be used
        stand-alone, it is most effective when injected into the
        :class:`~.epg.EPG` object using
        :meth:`.epg.EPG.apply_XMLTVListing()`.

    """
    def __init__(
            self,
            url: URL,
            path: PathLike = Path('.epg_data'),
    ) -> None:
        """Instantiate the object with the URL to fetch."""
        self._url: URL
        self._path: Path

        if not isinstance(url, URL):
            self._url = URL(url)
        else:
            self._url = url

        # Validate path
        if not isinstance(path, Path) and not isinstance(path, str):
            raise TypeError('path must be a string or Path object.')

        # coerce it to a Path if it's a string
        if isinstance(path, str):
            self._path = Path(path)
        else:
            self._path = path

        if not self._path.is_dir():
            self._path.mkdir()

        str_binary = str(self._url.human_repr()).encode('utf8')
        hashobj = hashlib.sha256(str_binary)
        self._hash = int.from_bytes(hashobj.digest(), 'big')
        self._filename = f'{hashobj.hexdigest()}.xml'
        self._full_path = self._path.joinpath(self._filename)
        self._last_modified: Optional[datetime] = None
        self._downloaded: bool = False
        self._downloading: bool = False

        LOGGER.debug(f'XMLTVListing initialised: {self}')

    def __hash__(self) -> int:
        """Define hash function for a Hashable object."""
        return self._hash

    def __eq__(self, other) -> bool:
        """Define equality test for a Hashable object."""
        # relying on lazy boolean evaluation here.
        return isinstance(other, XMLTVListing) and hash(other._url) == hash(
            self._url)  # pylint: disable=protected-access

    def __repr__(self):
        """Print a human-friendly representation of this object."""
        return f"<XMLTVListing: url='{self._url}', path='{self._path}', filename='{self._filename}'>"

    @property
    def last_modified(self):
        """Return the last modified date from the HTTP header of the last download.

        Returns the date and time when the data was last modified. Taken directly
        from the ``HTTP-Header``.

        If the header was not provided, it returns ``None``.

        Returns:
            datetime: A :py:class:`datetime.datetime` object or ``None``.

        """
        return self._last_modified

    @property
    def url(self) -> URL:
        """Return the url of this XMLTVListing.

        Returns:
            :py:class:`yarl.URL`

        """
        return self._url

    @property
    def downloaded(self) -> bool:
        """Return the status of XMLTV file download.

        Returns:
            bool: ``True`` if the file was downloaded successfully

        """
        return self._downloaded

    @property
    def downloading(self) -> bool:
        """Return the status of XMLTV file download.

        Returns:
            bool: ``True`` if the file is currently being downloaded

        """
        return self._downloading

    @property
    def file_path(self) -> Path:
        """Return the full file path of this listing's XML file.

        Returns:
            :py:class:`pathlib.Path`: A `Path` to the location of the
                XML file (whether it has yet been :meth:`fetch`'ed or not).

        """
        return self._full_path

    # TODO -- add error handling for
    # -- HTTP headers missing
    # -- timeouts
    # -- badURL, etc, etc.
    # TODO -- add retry support

    async def fetch(self) -> None:
        """Fetch the XMLTVListing file.

        This async method will download the (XML) file specified by the URL passed
        at instantiation.

        If the server does not support streaming downloads, the method will
        fall back to a more memory-intensive, vanilla HTTP download.

        Returns:
            None

        """
        LOGGER.debug(f'Fetch({self}) call started.')
        self._downloading = True
        newfile = self._full_path.with_suffix('.tmp')

        # We support gzip encoding as www.xmltv.co.uk sends stuff gzipped.
        # The zlib decompression object supports streaming decompression, which
        # is why we use it.

        # We should probably support more encoding algorithms at some point.

        decompression_obj = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
        compressed_payload = True  # assume payload is compressed first.

        async def chunk_processor(decomp_obj, bytechunk):
            nonlocal compressed_payload
            async with await trio.open_file(newfile, 'ab') as output_file:
                if compressed_payload:
                    try:
                        LOGGER.debug(
                            f'Got zipped chunk. size = {len(bytechunk)}. Unzipping...'
                        )
                        await output_file.write(
                            decomp_obj.decompress(bytechunk))
                        LOGGER.debug(
                            f'Wrote unzipped chunk. size = {len(bytechunk)}')
                    except zlib.error:
                        await output_file.write(bytechunk)
                        LOGGER.debug(f'Wrote chunk. size = {len(bytechunk)}')
                        compressed_payload = False  # it wasn't compressed, so fall back
                else:
                    await output_file.write(bytechunk)
                    LOGGER.debug(f'Wrote chunk. size = {len(bytechunk)}')

        resp = await asks.get(
            str(self._url),
            headers={'Accept-Encoding': 'gzip'},
            # callback takes bytes only
            callback=partial(chunk_processor, decompression_obj))

        LOGGER.debug(f'Got headers = {resp.headers}')

        # h11 makes header keys lowercase so we can rely on this
        if resp.headers.get("last-modified"):
            self._last_modified = parse_http_date(
                resp.headers.get("last-modified"))
            LOGGER.debug(
                f'Content last modified on: {resp.headers.get("last-modified")}.'
            )

        shutil.move(newfile, self._full_path)
        self._downloading = False
        self._downloaded = True
        LOGGER.debug(f'Fetch finished on {self}')

    def parse(self) -> Iterator[Union[Channel, Programme]]:
        """Parse the XMLTVListing XML file and create an iterator over the data in it.

        Yields:
            (Any): Either a  :class:`~pyskyq.channel.Channel` or a
                :class:`~pyskyq.programme.Programme` object is yielded.

        """
        if not self.downloaded and not self.downloading:
            raise OSError(
                'File not downloaded, or download is currently in flight.')
        else:
            LOGGER.debug(f'in parse_channels. file = {self.file_path}')
            for xml_type, xml_item in _xml_parse_and_remove(
                    self._full_path):  # type: ignore
                if xml_type == 'channel':
                    LOGGER.debug('yielding channel...')
                    yield channel_from_xmltv_list(xml_item)
                if xml_type == 'programme':
                    LOGGER.debug('yielding programme...')
                    yield programme_from_xmltv_list(xml_item)
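The XMLTVListing class above is driven entirely by its URL: the constructor hashes url.human_repr() to derive the on-disk filename, fetch() downloads the feed asynchronously via asks (which runs under trio), and parse() yields Channel and Programme objects. A minimal usage sketch, assuming the class as defined above; the feed URL is purely hypothetical, and older asks releases may additionally need the event loop selected before fetching:

import trio
from yarl import URL

async def main():
    # Hypothetical feed URL, for illustration only.
    listing = XMLTVListing(URL('http://example.com/xmltv/feed.xml'))
    await listing.fetch()           # downloads to <path>/<sha256 of URL>.xml
    for item in listing.parse():    # yields Channel and Programme objects
        print(item)

trio.run(main)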
Example No. 15
def test_human_repr_default_port():
    url = URL("http://вася:пароль@хост.домен/путь/сюда?арг=вал#фраг")
    s = url.human_repr()
    assert s == "http://вася:пароль@хост.домен/путь/сюда?арг=вал#фраг"
Example No. 16
def test_human_repr_defaults():
    url = URL("путь")
    s = url.human_repr()
    assert s == "путь"
Example No. 17
class SpiderCrawler:
    def __init__(self, start_url, database, depth):
        self.client = httpx.AsyncClient()
        self.url = URL(start_url)
        self.db = database
        self.depth = depth

    @timer
    async def get_data_from_url(self):
        calls = 0

        @not_retries
        async def load(url_: URL, level_):
            nonlocal calls
            calls += 1
            try:
                title, html_body, soup = await self._load_and_parse(url_)
            except TypeError:
                # Can't download
                return

            asyncio.ensure_future(
                self.db.save_to_db(url_,
                                   title,
                                   html_body,
                                   parent=self.url.human_repr()))

            if level_ >= self.depth:
                return

            refs = self._ref_generator(soup.findAll('a'))
            todos = [load(ref, level_ + 1) for ref in refs]
            await asyncio.gather(*todos)

        try:
            await load(self.url, 0)
        finally:
            print("CALLS: ", calls)
            await self.client.aclose()
            await self.db.pg.pool.close()

    async def _load_and_parse(self, url_: URL):
        try:
            res = await self.client.get(str(url_))
        except httpx.HTTPError:
            return
        except ValueError:
            return

        soup = BeautifulSoup(res.text, 'lxml')

        try:
            title = soup.title.text
        except AttributeError:
            title = None
        html_body = res.text

        return title, html_body, soup

    def _ref_generator(self, bs_result_set):

        for ref in bs_result_set:
            try:
                href = URL(ref.attrs['href'])

                if href.query_string:  # Without QS
                    continue

                if not href.is_absolute():
                    href = self.url.join(href)

                if href != self.url:
                    yield href
            except KeyError:
                continue
Example No. 18
    def check_domains(self, link: URL) -> bool:
        for domain in self.domains:
            if domain in link.human_repr():
                return True
        return False
Example No. 19
class Request(Task):
    """Request is a Task that execute :meth:`fetch` method.

    Attributes:
        url:
        callback: should be a callable function or a list of functions.
            It will be passed to the corresponding response task.
        family: this family will be appended in families and also passed to corresponding
            response task.
        status_allowed: a list of allowed status integer. Otherwise any response task
            with `status!=200` will fail and retry.
        meta: a dictionary to deliver information. It will be passed to :attr:`Response.meta`.
        request_config: a dictionary, will be passed as keyword arguments
            to :meth:`aiohttp.ClientSession.request`.

            acceptable keyword:

                params - Dictionary or bytes to be sent in the query string of the new request

                data - Dictionary, bytes, or file-like object to send in the body of the request

                json - Any json compatible python object

                headers - Dictionary of HTTP Headers to send with the request

                cookies - Dict object to send with the request

                allow_redirects - If set to False, do not follow redirects

                timeout - Optional ClientTimeout settings structure, 5min total timeout by default.

    """

    def __init__(
        self,
        url: _LooseURL,
        callback: _Functions = None,
        method: str = "GET",
        request_config: dict = None,
        status_allowed: list = None,
        encoding=None,
        links_to_abs=True,
        # Below are parameters for the parent class
        dont_filter: bool = False,
        ignore_exception: bool = False,
        meta: dict = None,
        priority: int = 0,
        family=None,
        family_for_response=None,
        recrawl=0,
        exetime=0,
        **kwargs,
    ):
        super().__init__(
            dont_filter=dont_filter,
            ignore_exception=ignore_exception,
            priority=priority,
            meta=meta,
            family=family,
            recrawl=recrawl,
            exetime=exetime,
            **kwargs,
        )

        self.url = URL(url)
        self.method = method
        self.status_allowed = status_allowed
        self.callbacks = []
        if callback:
            self.add_callback(callback)
        self.request_config = request_config if request_config else {}
        self.session = None
        self.client = None
        self.response: Response = None
        self.family_for_response = family_for_response
        self.encoding = encoding
        self.links_to_abs = links_to_abs

        self.inprogress = False  # whether this request has started execution; used for the counter

    @property
    def url_str(self):
        return self.url.human_repr()

    @property
    def url_str_canonicalized(self):
        query_str = "&".join(sorted(self.url.raw_query_string.split("&")))
        return (
            str(self.url)
            .replace(self.url.raw_query_string, query_str)
            .replace("#" + self.url.raw_fragment, "")
        )

    def add_callback(self, func: _Function):
        if isinstance(func, Iterable):
            for f in func:
                self.callbacks.append(f)
        else:
            self.callbacks.append(func)

    def reset_callback(self):
        self.callbacks = []

    def _fingerprint(self):
        """fingerprint for a request task.
        .. todo::write a better hashing function for request.
        """
        fp = hashlib.sha1()
        fp.update(self.url_str_canonicalized.encode())
        fp.update(self.method.encode())
        return fp.hexdigest()

    async def _execute(self, **kwargs):
        """Wraps :meth:`fetch`"""
        yield await self.fetch()

    async def send(self):
        """This method is used for independent usage of Request without Crawler.
        """
        resp = None
        async for task in self.execute():
            if isinstance(task, Response):
                resp = task
        return resp

    async def fetch(self):
        """Sends a request and return the response as a task."""
        to_close = False

        if self.session is None:
            self.session = aiohttp.ClientSession()
            to_close = True
        try:
            async with self.session.request(
                self.method, self.url, **self.request_config
            ) as cresp:

                body = await cresp.read()
                encoding = self.encoding or cresp.get_encoding()

                self.response = Response(
                    url=cresp.url,
                    status=cresp.status,
                    cookies=cresp.cookies,
                    headers=cresp.headers.copy(),
                    body=body,
                    encoding=encoding,
                    links_to_abs=self.links_to_abs,
                    callbacks=self.callbacks.copy(),
                    request=self,
                    family=self.family_for_response,
                )
                rt = self.response
                logger.info(f"<{self.response.status}> {self.response.url_str}")
                return rt
        except Exception as e:
            raise e
        finally:
            if to_close:
                await self.session.close()

    def __str__(self):
        return f"<Task {self.primary_family}> ({self.url.human_repr()})"

    def __getstate__(self):
        state = super().__getstate__()
        state.pop("session", None)
        state.pop("client", None)
        if 'exceptions' in state:
            state['exceptions'] = []
        return state

    def __setstate__(self, state):
        super().__setstate__(state)
        self.__dict__["session"] = None
        self.__dict__["client"] = None