Exemplo n.º 1
0
def get_binary_content(download_url, cookies, method="GET"):
    """Downloads the file, covering a few special cases such as invalid SSL
    certificates and empty file errors.

    :param download_url: The URL for the item you wish to download. When
    method is "LOCAL" it is instead joined onto settings.MEDIA_ROOT.
    :param cookies: Cookies that might be necessary to download the item.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
    item during testing. Any value other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors encountered.
    If blank, that indicates success. The second value is the response object
    containing the downloaded file, or None whenever an error is reported.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        # NOTE: no exception is being handled at this point, so format_exc()
        # contributes no real traceback; it is kept so the message layout
        # stays the same for anything consuming it.
        msg = "NoDownloadUrlError: %s\n%s" % (
            download_url,
            traceback.format_exc(),
        )
        return msg, None
    # The catch below is deliberately broad so that any download failure is
    # reported through the returned msg instead of being raised.
    # noinspection PyBroadException
    try:
        if method == "LOCAL":
            url = os.path.join(settings.MEDIA_ROOT, download_url)
            mr = MockRequest(url=url)
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            s = requests.session()
            headers = {"User-Agent": "CourtListener"}

            r = s.get(
                download_url,
                verify=False,  # WA has a certificate we don't understand
                headers=headers,
                cookies=cookies,
                timeout=300,
            )

            # test for empty files (thank you CA1)
            if len(r.content) == 0:
                # As above, there is no active exception to format here.
                msg = "EmptyFileError: %s\n%s" % (
                    download_url,
                    traceback.format_exc(),
                )
                return msg, None

            # test for and follow meta redirects
            r = follow_redirections(r, s)

            r.raise_for_status()
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate instead of being turned into a
        # "DownloadingError" message.
        msg = "DownloadingError: %s\n%s" % (
            download_url,
            traceback.format_exc(),
        )
        return msg, None

    # Success!
    return "", r
Exemplo n.º 2
0
def get_binary_content(download_url, cookies, adapter, method='GET'):
    """ Downloads the file, covering a few special cases such as invalid SSL
    certificates and empty file errors.

    :param download_url: The URL for the item you wish to download. When
    method is "LOCAL" it is instead joined onto settings.MEDIA_ROOT.
    :param cookies: Cookies that might be necessary to download the item.
    :param adapter: An HTTPAdapter for use when getting content. It is mounted
    on the session for the 'https://' prefix; unused when method is "LOCAL".
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
    item during testing. Any value other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors encountered.
    If blank, that indicates success. The second value is the response object
    containing the downloaded file, or None whenever an error is reported.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        # NOTE: no exception is being handled at this point, so format_exc()
        # contributes no real traceback; it is kept so the message layout
        # stays the same for anything consuming it.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url,
                                              traceback.format_exc())
        return msg, None
    # The catch below is deliberately broad so that any download failure is
    # reported through the returned msg instead of being raised.
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            url = os.path.join(
                settings.MEDIA_ROOT,
                download_url)
            mr = MockRequest(url=url)
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            s = requests.session()
            s.mount('https://', adapter)
            headers = {'User-Agent': 'CourtListener'}

            r = s.get(
                download_url,
                verify=False,  # WA has a certificate we don't understand
                headers=headers,
                cookies=cookies,
                timeout=300,
            )

            # test for empty files (thank you CA1)
            if len(r.content) == 0:
                # As above, there is no active exception to format here.
                msg = 'EmptyFileError: %s\n%s' % (download_url,
                                                  traceback.format_exc())
                return msg, None

            # test for and follow meta redirects
            r = follow_redirections(r, s)

            r.raise_for_status()
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate instead of being turned into a
        # 'DownloadingError' message.
        msg = 'DownloadingError: %s\n%s' % (download_url,
                                            traceback.format_exc())
        return msg, None

    # Success!
    return '', r
Exemplo n.º 3
0
    def _download(self, request_dict=None):
        """Download the latest version of the Site and return its content.

        :param request_dict: Optional dict of extra keyword arguments that are
        forwarded to the session's get/post call. A "verify" entry that is not
        None replaces the default certificate bundle (certifi.where()). The
        dict passed in is never modified.
        :return: The decoded JSON when the response content-type contains
        "json"; otherwise the HTML tree built from the cleaned response text,
        with its links rewritten through self._link_repl.
        :raises ValueError: If self.method is not "GET", "POST" or "LOCAL".
        Anything raised by the response's raise_for_status() is propagated.
        """
        if self.method == "POST":
            # Long parameter values are shortened so the log line stays
            # readable.
            truncated_params = {
                k: trunc(v, 50, ellipsis="...[truncated]")
                for k, v in self.parameters.items()
            }
            logger.info("Now downloading case page at: %s (params: %s)" % (self.url, truncated_params))
        else:
            logger.info("Now downloading case page at: %s" % self.url)

        # Work on a private copy so neither the caller's dict nor a shared
        # default is ever mutated.
        request_kwargs = dict(request_dict) if request_dict else {}

        # Set up verify here and remove it from the kwargs so you don't send
        # it to s.get or s.post in two kwargs. Popping unconditionally also
        # covers an explicit verify=None, which would otherwise collide with
        # the verify keyword below.
        verify = request_kwargs.pop("verify", None)
        if verify is None:
            verify = certifi.where()

        headers = {"User-Agent": "Juriscraper"}

        # Get the response. Disallow redirects so they throw an error
        s = requests.session()
        s.mount("https://", self._get_adapter_instance())
        if self.method == "GET":
            r = s.get(self.url, headers=headers, verify=verify, **request_kwargs)
        elif self.method == "POST":
            r = s.post(
                self.url, headers=headers, verify=verify, data=self.parameters, **request_kwargs
            )
        elif self.method == "LOCAL":
            mr = MockRequest(url=self.url)
            r = mr.get()
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # `r` further down.
            raise ValueError("Unsupported download method: %r" % (self.method,))

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # Tweak or set the encoding if needed
        r = self._set_encoding(r)

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        # Grab the content
        if "json" in r.headers.get("content-type", ""):
            return r.json()

        text = self._clean_text(r.text)
        html_tree = self._make_html_tree(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
Exemplo n.º 4
0
 def _request_url_mock(self, url):
     """Serve the request from a mock instead of the network (for tests).

     The given ``url`` is recorded under ``self.request["url"]``. The mock
     request itself is built from ``self.url``, and the result of its
     ``get()`` is stored under ``self.request["response"]``.
     """
     self.request["url"] = url
     mock_request = MockRequest(url=self.url)
     self.request["response"] = mock_request.get()
Exemplo n.º 5
0
 def _request_url_mock(self, url):
     """Run a mocked request in place of a real one; used for testing.

     Records ``url`` under ``self.request['url']``, then builds a mock
     request from ``self.url`` and stores what its ``get()`` returns under
     ``self.request['request']``.
     """
     self.request['url'] = url
     mocked = MockRequest(url=self.url)
     result = mocked.get()
     self.request['request'] = result