Пример #1
0
    def _download(self, request_dict=None):
        """Download ``self.url`` and return its parsed content.

        The request is chosen by ``self.method``: ``'GET'`` and ``'POST'`` go
        through a ``requests`` session (``self.parameters`` is sent as the
        POST data), and ``'LOCAL'`` is served by ``MockRequest``. The
        response is stored on ``self.r`` and its status code on
        ``self.status``.

        :param request_dict: Optional extra keyword arguments forwarded to
        ``s.get`` / ``s.post``. A ``'verify'`` entry is pulled out and passed
        explicitly; when it is absent or ``None``, ``certifi.where()`` is
        used instead. The caller's dict is never modified.
        :return: ``r.json()`` when the response content-type mentions
        ``json``; otherwise the tree built by ``self._make_html_tree`` with
        ``self._link_repl`` applied through ``rewrite_links``.
        :raises ValueError: If ``self.method`` is not ``'GET'``, ``'POST'``
        or ``'LOCAL'``.

        ``r.raise_for_status()`` is also called, so bad status codes
        propagate as whatever that call raises.
        """
        # Work on a private copy: 'verify' is removed below, and doing that
        # on the caller's dict (or on a shared default) would leak into
        # later calls.
        request_dict = dict(request_dict) if request_dict else {}

        if self.method == 'POST':
            # Keep the log line readable when parameters are very long.
            truncated_params = {
                k: trunc(v, 50, ellipsis='...[truncated]')
                for k, v in self.parameters.items()
            }
            logger.info("Now downloading case page at: %s (params: %s)",
                        self.url, truncated_params)
        else:
            logger.info("Now downloading case page at: %s", self.url)

        # Pull verify out of request_dict so it is never sent to s.get or
        # s.post in two kwargs. pop() also removes an explicit None entry,
        # which would otherwise collide with the verify= argument below.
        verify = request_dict.pop('verify', None)
        if verify is None:
            verify = certifi.where()

        # Get the response. Nothing here disables redirects; pass
        # allow_redirects through request_dict if that is wanted.
        s = requests.session()
        s.mount('https://', self._get_adapter_instance())
        if self.method == 'GET':
            r = s.get(self.url,
                      headers={'User-Agent': 'Juriscraper'},
                      verify=verify,
                      **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers={'User-Agent': 'Juriscraper'},
                       verify=verify,
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            mr = MockRequest(url=self.url)
            r = mr.get()
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # the first use of r below.
            raise ValueError("Unsupported method: %r" % (self.method,))

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # Tweak or set the encoding if needed
        r = self._set_encoding(r)

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        # Grab the content
        if 'json' in r.headers.get('content-type', ''):
            return r.json()

        text = self._clean_text(r.text)
        html_tree = self._make_html_tree(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
Пример #2
0
def get_binary_content(download_url, cookies, adapter, method='GET'):
    """ Downloads the file, covering a few special cases such as invalid SSL
    certificates and empty file errors.

    Failures are reported through the returned message rather than raised:
    any ``Exception`` hit while downloading is caught and formatted, with its
    traceback, into the message.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
    :param adapter: An HTTPAdapter for use when getting content. It is
    mounted on the session for ``https://`` URLs.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
    item during testing. Any value other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors encountered.
    If blank, that indicates success. The second value is the response object
    containing the downloaded file, or None whenever the msg is not blank.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url,
                                              traceback.format_exc())
        return msg, None

    # Still deliberately broad, but Exception (unlike a bare except) lets
    # KeyboardInterrupt and SystemExit propagate.
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            url = os.path.join(settings.MEDIA_ROOT, download_url)
            mr = MockRequest(url=url)
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            s = requests.session()
            s.mount('https://', adapter)
            headers = {'User-Agent': 'CourtListener'}

            r = s.get(
                download_url,
                verify=False,  # WA has a certificate we don't understand
                headers=headers,
                cookies=cookies
            )

            # test for empty files (thank you CA1)
            if len(r.content) == 0:
                msg = 'EmptyFileError: %s\n%s' % (download_url,
                                                  traceback.format_exc())
                return msg, None

            # test for and follow meta redirects
            r = follow_redirections(r, s)

            r.raise_for_status()
    except Exception:
        msg = 'DownloadingError: %s\n%s' % (download_url,
                                            traceback.format_exc())
        return msg, None

    # Success!
    return '', r
Пример #3
0
    def _download(self, request_dict=None):
        """Download ``self.url`` and return it as a parsed HTML tree.

        The request is chosen by ``self.method``: ``'GET'`` and ``'POST'`` go
        through a ``requests`` session (``self.parameters`` is sent as the
        POST data), and ``'LOCAL'`` is served by ``MockRequest``. The
        response is stored on ``self.r`` and its status code on
        ``self.status``.

        :param request_dict: Optional extra keyword arguments forwarded to
        ``s.get`` / ``s.post``.
        :return: The tree from ``html.fromstring`` with ``self._link_repl``
        applied through ``rewrite_links``.
        :raises ValueError: If ``self.method`` is not ``'GET'``, ``'POST'``
        or ``'LOCAL'``.

        ``r.raise_for_status()`` is also called, so bad status codes
        propagate as whatever that call raises.
        """
        # None instead of a shared mutable {} default.
        if request_dict is None:
            request_dict = {}

        if self.method == 'POST':
            # Keep the log line readable when parameters are very long.
            truncated_params = {
                k: trunc(v, 50, elipsize=True, elipsis='...[truncated]')
                for k, v in self.parameters.items()
            }
            logger.info("Now downloading case page at: %s (params: %s)",
                        self.url, truncated_params)
        else:
            logger.info("Now downloading case page at: %s", self.url)

        # Get the response. Nothing here disables redirects; pass
        # allow_redirects through request_dict if that is wanted.
        s = requests.session()
        if self.method == 'GET':
            r = s.get(self.url,
                      headers={'User-Agent': 'Juriscraper'},
                      **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers={'User-Agent': 'Juriscraper'},
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            mr = MockRequest(url=self.url)
            r = mr.get()
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # the first use of r below.
            raise ValueError("Unsupported method: %r" % (self.method,))

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
        if r.encoding == 'ISO-8859-1':
            r.encoding = 'cp1252'

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        if r.encoding is None:
            # Requests detects the encoding when the item is GET'ed using
            # HTTP headers, and then when r.text is accessed, if the encoding
            # hasn't been set by that point. By setting the encoding here, we
            # ensure that it's done by cchardet, if it hasn't been done with
            # HTTP headers. This way it is done before r.text is accessed
            # (which would do it with vanilla chardet). This is a big
            # performance boon, and can be removed once requests is upgraded
            # (https://github.com/kennethreitz/requests/pull/814/)
            r.encoding = chardet.detect(r.content)['encoding']

        # Grab the content
        text = self._clean_text(r.text)
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
Пример #4
0
    def _download(self, request_dict=None):
        """Fetch ``self.url`` and hand back the page as an HTML tree.

        ``self.method`` selects how the page is obtained: ``'GET'`` or
        ``'POST'`` through a ``requests`` session (POST sends
        ``self.parameters`` as data), or ``'LOCAL'`` through ``MockRequest``.
        As a side effect the response is kept on ``self.r`` and its status
        code on ``self.status``.

        :param request_dict: Optional extra keyword arguments forwarded to
        ``s.get`` / ``s.post``.
        :return: The tree from ``html.fromstring`` after
        ``rewrite_links(self._link_repl)`` has been applied.
        :raises ValueError: If ``self.method`` is not ``'GET'``, ``'POST'``
        or ``'LOCAL'``.

        Bad status codes propagate as whatever ``r.raise_for_status()``
        raises.
        """
        # Avoid a mutable default argument; fall back to an empty dict here.
        if request_dict is None:
            request_dict = {}

        if self.method == 'POST':
            # Long parameter values are shortened for the log line only; the
            # request itself still sends self.parameters untouched.
            truncated_params = {}
            for k, v in self.parameters.items():
                truncated_params[k] = trunc(v, 50, elipsize=True,
                                            elipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)",
                        self.url, truncated_params)
        else:
            logger.info("Now downloading case page at: %s", self.url)

        # Get the response. Redirects are not disabled by this method;
        # allow_redirects can be supplied through request_dict if needed.
        s = requests.session()
        user_agent = {'User-Agent': 'Juriscraper'}
        if self.method == 'GET':
            r = s.get(self.url, headers=user_agent, **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers=user_agent,
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            r = MockRequest(url=self.url).get()
        else:
            # Without this branch r would be unbound and the next statement
            # would die with an UnboundLocalError.
            raise ValueError("Unsupported method: %r" % (self.method,))

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
        if r.encoding == 'ISO-8859-1':
            r.encoding = 'cp1252'

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        if r.encoding is None:
            # Requests detects the encoding when the item is GET'ed using
            # HTTP headers, and then when r.text is accessed, if the encoding
            # hasn't been set by that point. By setting the encoding here, we
            # ensure that it's done by cchardet, if it hasn't been done with
            # HTTP headers. This way it is done before r.text is accessed
            # (which would do it with vanilla chardet). This is a big
            # performance boon, and can be removed once requests is upgraded
            # (https://github.com/kennethreitz/requests/pull/814/)
            r.encoding = chardet.detect(r.content)['encoding']

        # Grab the content
        text = self._clean_text(r.text)
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
Пример #5
0
def get_binary_content(download_url, cookies, method='GET'):
    """ Downloads the file, covering a few special cases such as invalid SSL
    certificates and empty file errors.

    Failures are reported through the returned message rather than raised:
    any ``Exception`` hit while downloading is caught, formatted with its
    traceback, printed, and returned.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
    They are passed through ``normalize_cookies`` before use.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
    item during testing. Any value other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors encountered.
    If blank, that indicates success. The second value is the response object
    containing the downloaded file; it is None when there was no URL or when
    the download failed before any response was received.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url,
                                              traceback.format_exc())
        return msg, None

    # Bound up front so the error handler below can always return it; a
    # failure before the first assignment used to raise UnboundLocalError
    # from inside the handler.
    r = None
    # Still deliberately broad, but Exception (unlike a bare except) lets
    # KeyboardInterrupt and SystemExit propagate.
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            mr = MockRequest(
                url=os.path.join(settings.INSTALL_ROOT, 'alert', download_url))
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            s = requests.session()
            headers = {'User-Agent': 'CourtListener'}
            cookies = normalize_cookies(cookies)
            logger.info("Using cookies: %s" % cookies)
            try:
                r = s.get(download_url, headers=headers, cookies=cookies)
            except SSLError:
                # Washington has a certificate we don't understand.
                r = s.get(download_url,
                          verify=False,
                          headers=headers,
                          cookies=cookies)

            # test for empty files (thank you CA1)
            if len(r.content) == 0:
                msg = 'EmptyFileError: %s\n%s' % (download_url,
                                                  traceback.format_exc())
                return msg, r

            # test for and follow meta redirects
            r = follow_redirections(r, s)
    except Exception:
        msg = 'DownloadingError: %s\n%s' % (download_url,
                                            traceback.format_exc())
        # Parenthesised so the same line works as a Python 2 print
        # statement and a Python 3 function call.
        print(msg)
        return msg, r

    # Success!
    return '', r
Пример #6
0
def get_binary_content(download_url, cookies, method='GET'):
    """ Downloads the file, covering a few special cases such as invalid SSL
    certificates and empty file errors.

    Errors do not propagate to the caller: any ``Exception`` raised while
    downloading is turned into a message (including its traceback), printed,
    and returned as the first value.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
    They are passed through ``normalize_cookies`` before use.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
    item during testing. Any value other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors encountered.
    If blank, that indicates success. The second value is the response object
    containing the downloaded file; it is None when there was no URL or when
    the download failed before any response was received.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url,
                                              traceback.format_exc())
        return msg, None

    # Give r a value before the try block. The handler at the bottom returns
    # it, and previously hit an UnboundLocalError whenever the failure came
    # before the first assignment.
    r = None
    # Exception rather than a bare except, so KeyboardInterrupt and
    # SystemExit are not swallowed.
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            local_path = os.path.join(settings.INSTALL_ROOT, 'alert',
                                      download_url)
            r = MockRequest(url=local_path).get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            s = requests.session()
            headers = {'User-Agent': 'CourtListener'}
            cookies = normalize_cookies(cookies)
            logger.info("Using cookies: %s" % cookies)
            try:
                r = s.get(download_url,
                          headers=headers,
                          cookies=cookies)
            except SSLError:
                # Washington has a certificate we don't understand.
                r = s.get(download_url,
                          verify=False,
                          headers=headers,
                          cookies=cookies)

            # test for empty files (thank you CA1)
            if len(r.content) == 0:
                msg = 'EmptyFileError: %s\n%s' % (download_url,
                                                  traceback.format_exc())
                return msg, r

            # test for and follow meta redirects
            r = follow_redirections(r, s)
    except Exception:
        msg = 'DownloadingError: %s\n%s' % (download_url,
                                            traceback.format_exc())
        # print(msg) behaves identically as a Python 2 statement and as a
        # Python 3 call.
        print(msg)
        return msg, r

    # Success!
    return '', r
Пример #7
0
    def _download(self, request_dict=None):
        """Download ``self.url`` and return it as a parsed HTML tree.

        The request is chosen by ``self.method``: ``'GET'`` and ``'POST'`` go
        through a ``requests`` session (``self.parameters`` is sent as the
        POST data), and ``'LOCAL'`` is served by ``MockRequest``. The
        response is stored on ``self.r`` and its status code on
        ``self.status``.

        :param request_dict: Optional extra keyword arguments forwarded to
        ``s.get`` / ``s.post``.
        :return: The tree from ``html.fromstring`` with ``self._link_repl``
        applied through ``rewrite_links``.
        :raises ValueError: If ``self.method`` is not ``'GET'``, ``'POST'``
        or ``'LOCAL'``.

        ``r.raise_for_status()`` is also called, so bad status codes
        propagate as whatever that call raises.
        """
        # None instead of a shared mutable {} default.
        if request_dict is None:
            request_dict = {}

        if self.method == 'POST':
            # Keep the log line readable when parameters are very long.
            truncated_params = {
                k: trunc(v,
                         50,
                         elipsize=True,
                         elipsis='...[truncated]')
                for k, v in self.parameters.items()
            }
            logger.info("Now downloading case page at: %s (params: %s)",
                        self.url, truncated_params)
        else:
            logger.info("Now downloading case page at: %s", self.url)

        # Get the response. Nothing here disables redirects; pass
        # allow_redirects through request_dict if that is wanted.
        s = requests.session()
        if self.method == 'GET':
            r = s.get(self.url,
                      headers={'User-Agent': 'Juriscraper'},
                      **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers={'User-Agent': 'Juriscraper'},
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            mr = MockRequest(url=self.url)
            r = mr.get()
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # the first use of r below.
            raise ValueError("Unsupported method: %r" % (self.method,))

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
        if r.encoding == 'ISO-8859-1':
            r.encoding = 'cp1252'

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        # Grab the content
        text = self._clean_text(r.text)
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
Пример #8
0
    def _download(self, request_dict=None):
        """Fetch ``self.url`` and hand back the page as an HTML tree.

        ``self.method`` selects how the page is obtained: ``'GET'`` or
        ``'POST'`` through a ``requests`` session (POST sends
        ``self.parameters`` as data), or ``'LOCAL'`` through ``MockRequest``.
        As a side effect the response is kept on ``self.r`` and its status
        code on ``self.status``.

        :param request_dict: Optional extra keyword arguments forwarded to
        ``s.get`` / ``s.post``.
        :return: The tree from ``html.fromstring`` after
        ``rewrite_links(self._link_repl)`` has been applied.
        :raises ValueError: If ``self.method`` is not ``'GET'``, ``'POST'``
        or ``'LOCAL'``.

        Bad status codes propagate as whatever ``r.raise_for_status()``
        raises.
        """
        # Avoid a mutable default argument; fall back to an empty dict here.
        if request_dict is None:
            request_dict = {}

        if self.method == 'POST':
            # Long parameter values are shortened for the log line only; the
            # request itself still sends self.parameters untouched.
            truncated_params = {}
            for k, v in self.parameters.items():
                truncated_params[k] = trunc(v, 50, elipsize=True,
                                            elipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)",
                        self.url, truncated_params)
        else:
            logger.info("Now downloading case page at: %s", self.url)

        # Get the response. Redirects are not disabled by this method;
        # allow_redirects can be supplied through request_dict if needed.
        s = requests.session()
        user_agent = {'User-Agent': 'Juriscraper'}
        if self.method == 'GET':
            r = s.get(self.url, headers=user_agent, **request_dict)
        elif self.method == 'POST':
            r = s.post(self.url,
                       headers=user_agent,
                       data=self.parameters,
                       **request_dict)
        elif self.method == 'LOCAL':
            r = MockRequest(url=self.url).get()
        else:
            # Without this branch r would be unbound and the next statement
            # would die with an UnboundLocalError.
            raise ValueError("Unsupported method: %r" % (self.method,))

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
        if r.encoding == 'ISO-8859-1':
            r.encoding = 'cp1252'

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        # Grab the content
        text = self._clean_text(r.text)
        html_tree = html.fromstring(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
Пример #9
0
    def _download(self, request_dict=None):
        """Fetch ``self.url`` and return its parsed content.

        ``self.method`` selects how the page is obtained: ``'GET'`` or
        ``'POST'`` through a ``requests`` session (POST sends
        ``self.parameters`` as data), or ``'LOCAL'`` through ``MockRequest``.
        As a side effect the response is kept on ``self.r`` and its status
        code on ``self.status``.

        :param request_dict: Optional extra keyword arguments forwarded to
        ``s.get`` / ``s.post``. Its ``'verify'`` entry, if any, is extracted
        and passed explicitly; a missing or ``None`` value falls back to
        ``certifi.where()``. The dict passed in is left untouched.
        :return: ``r.json()`` when the response content-type mentions
        ``json``; otherwise the tree built by ``self._make_html_tree`` after
        ``rewrite_links(self._link_repl)`` has been applied.
        :raises ValueError: If ``self.method`` is not ``'GET'``, ``'POST'``
        or ``'LOCAL'``.

        Bad status codes propagate as whatever ``r.raise_for_status()``
        raises.
        """
        # Copy before touching it: 'verify' is removed below, and removing
        # it from the caller's dict (or from a shared default) would change
        # what later calls see.
        request_dict = dict(request_dict) if request_dict else {}

        if self.method == 'POST':
            # Long parameter values are shortened for the log line only; the
            # request itself still sends self.parameters untouched.
            truncated_params = {}
            for k, v in self.parameters.items():
                truncated_params[k] = trunc(v, 50, ellipsis='...[truncated]')
            logger.info("Now downloading case page at: %s (params: %s)",
                        self.url, truncated_params)
        else:
            logger.info("Now downloading case page at: %s", self.url)

        # Set up verify here and remove it from request_dict so you don't
        # send it to s.get or s.post in two kwargs. pop() also clears an
        # explicit None entry, which used to stay behind and trigger a
        # duplicate-keyword TypeError.
        verify = request_dict.pop('verify', None)
        if verify is None:
            verify = certifi.where()

        # Get the response. Redirects are not disabled by this method;
        # allow_redirects can be supplied through request_dict if needed.
        s = requests.session()
        s.mount('https://', self._get_adapter_instance())
        user_agent = {'User-Agent': 'Juriscraper'}
        if self.method == 'GET':
            r = s.get(
                self.url,
                headers=user_agent,
                verify=verify,
                **request_dict
            )
        elif self.method == 'POST':
            r = s.post(
                self.url,
                headers=user_agent,
                verify=verify,
                data=self.parameters,
                **request_dict
            )
        elif self.method == 'LOCAL':
            r = MockRequest(url=self.url).get()
        else:
            # Without this branch r would be unbound and the next statement
            # would die with an UnboundLocalError.
            raise ValueError("Unsupported method: %r" % (self.method,))

        # Provides a hook for inheriting objects to tweak the request object.
        self.tweak_request_object(r)

        # Throw an error if a bad status code is returned.
        r.raise_for_status()

        # Tweak or set the encoding if needed
        r = self._set_encoding(r)

        # Provide the response in the Site object
        self.r = r
        self.status = r.status_code

        # Grab the content
        if 'json' in r.headers.get('content-type', ''):
            return r.json()

        text = self._clean_text(r.text)
        html_tree = self._make_html_tree(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree