def _download(self, request_dict=None):
    """Download the latest version of the Site and return its parsed content.

    The response is also stored on the instance as ``self.r`` and its status
    code as ``self.status``.

    :param request_dict: Optional dict of extra keyword arguments passed
        through to the session's ``get``/``post`` call. A ``verify`` entry
        that is not None replaces the default certifi CA bundle. The dict
        supplied by the caller is left untouched.
    :return: ``r.json()`` when the response's content-type header contains
        'json'; otherwise the HTML tree made from the cleaned response text,
        with its links rewritten via ``self._link_repl``.
    :raises ValueError: If ``self.method`` is not 'GET', 'POST' or 'LOCAL'.
    """
    # Work on a private copy: removing 'verify' from the caller's dict would
    # silently drop that setting the next time the same dict is reused.
    request_dict = dict(request_dict or {})

    if self.method == 'POST':
        truncated_params = {
            k: trunc(v, 50, ellipsis='...[truncated]')
            for k, v in self.parameters.items()
        }
        logger.info("Now downloading case page at: %s (params: %s)" %
                    (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    # Always take 'verify' out of request_dict so it can never reach s.get or
    # s.post twice (once explicitly, once via **request_dict), even when the
    # caller passed verify=None.
    verify = request_dict.pop('verify', None)
    if verify is None:
        verify = certifi.where()

    # Get the response.
    s = requests.session()
    s.mount('https://', self._get_adapter_instance())
    headers = {'User-Agent': 'Juriscraper'}
    if self.method == 'GET':
        r = s.get(self.url, headers=headers, verify=verify, **request_dict)
    elif self.method == 'POST':
        r = s.post(self.url, headers=headers, verify=verify,
                   data=self.parameters, **request_dict)
    elif self.method == 'LOCAL':
        mr = MockRequest(url=self.url)
        r = mr.get()
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `r`; fail here with a clear message instead.
        raise ValueError("Unsupported download method: %r" % (self.method,))

    # Provides a hook for inheriting objects to tweak the request object.
    self.tweak_request_object(r)

    # Throw an error if a bad status code is returned.
    r.raise_for_status()

    # Tweak or set the encoding if needed
    r = self._set_encoding(r)

    # Provide the response in the Site object
    self.r = r
    self.status = r.status_code

    # Grab the content
    if 'json' in r.headers.get('content-type', ''):
        return r.json()

    text = self._clean_text(r.text)
    html_tree = self._make_html_tree(text)
    html_tree.rewrite_links(self._link_repl)
    return html_tree
def get_binary_content(download_url, cookies, adapter, method='GET'):
    """Download the file, covering a few special cases such as invalid SSL
    certificates and empty file errors.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
    :param adapter: An HTTPAdapter for use when getting content. It is
        mounted on the session for 'https://' URLs.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
        item during testing. Anything other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors
        encountered. If blank, that indicates success. The second value is
        the response object containing the downloaded file, or None whenever
        an error message is returned.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url,
                                              traceback.format_exc())
        return msg, None

    # Deliberately broad, but limited to Exception so that KeyboardInterrupt
    # and SystemExit are not swallowed and reported as download failures.
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            url = os.path.join(settings.MEDIA_ROOT, download_url)
            mr = MockRequest(url=url)
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            s = requests.session()
            s.mount('https://', adapter)

            headers = {'User-Agent': 'CourtListener'}

            r = s.get(
                download_url,
                verify=False,  # WA has a certificate we don't understand
                headers=headers,
                cookies=cookies
            )

            # test for empty files (thank you CA1)
            if not r.content:
                msg = 'EmptyFileError: %s\n%s' % (download_url,
                                                  traceback.format_exc())
                return msg, None

            # test for and follow meta redirects; this needs the session, so
            # it only applies to real (non-LOCAL) downloads.
            r = follow_redirections(r, s)
            r.raise_for_status()
    except Exception:
        msg = 'DownloadingError: %s\n%s' % (download_url,
                                            traceback.format_exc())
        return msg, None

    # Success!
    return '', r
def _download(self, request_dict=None):
    """Download the latest version of the Site and return its HTML tree.

    The response is also stored on the instance as ``self.r`` and its status
    code as ``self.status``.

    :param request_dict: Optional dict of extra keyword arguments passed
        through to the session's ``get``/``post`` call.
    :return: The tree produced by ``html.fromstring`` for the cleaned
        response text, with its links rewritten via ``self._link_repl``.
    :raises ValueError: If ``self.method`` is not 'GET', 'POST' or 'LOCAL'.
    """
    # A None default avoids sharing one dict object between all calls.
    if request_dict is None:
        request_dict = {}

    if self.method == 'POST':
        truncated_params = {
            k: trunc(v, 50, elipsize=True, elipsis='...[truncated]')
            for k, v in self.parameters.items()
        }
        logger.info("Now downloading case page at: %s (params: %s)" %
                    (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    # Get the response.
    s = requests.session()
    headers = {'User-Agent': 'Juriscraper'}
    if self.method == 'GET':
        r = s.get(self.url, headers=headers, **request_dict)
    elif self.method == 'POST':
        r = s.post(self.url, headers=headers, data=self.parameters,
                   **request_dict)
    elif self.method == 'LOCAL':
        mr = MockRequest(url=self.url)
        r = mr.get()
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `r`; fail here with a clear message instead.
        raise ValueError("Unsupported download method: %r" % (self.method,))

    # Provides a hook for inheriting objects to tweak the request object.
    self.tweak_request_object(r)

    # Throw an error if a bad status code is returned.
    r.raise_for_status()

    # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
    if r.encoding == 'ISO-8859-1':
        r.encoding = 'cp1252'

    # Provide the response in the Site object
    self.r = r
    self.status = r.status_code

    if r.encoding is None:
        # Original author's note: requests detects the encoding from the
        # HTTP headers when the item is fetched, and otherwise when r.text
        # is first accessed (with vanilla chardet). Setting it here, before
        # r.text is touched, lets the file's `chardet` module (cchardet,
        # per that note) do the detection instead, which is a big
        # performance boon. Can be removed once requests is upgraded
        # (https://github.com/kennethreitz/requests/pull/814/)
        r.encoding = chardet.detect(r.content)['encoding']

    # Grab the content
    text = self._clean_text(r.text)
    html_tree = html.fromstring(text)
    html_tree.rewrite_links(self._link_repl)
    return html_tree
def get_binary_content(download_url, cookies, method='GET'):
    """Download the file, covering a few special cases such as invalid SSL
    certificates and empty file errors.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
        They are passed through ``normalize_cookies`` before use.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
        item during testing. Anything other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors
        encountered. If blank, that indicates success. The second value is
        the response object containing the downloaded file; it is None when
        no URL was given or when the failure happened before any response
        was obtained.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url,
                                              traceback.format_exc())
        return msg, None

    # Bound up front so the error path below can always return it, even when
    # the download fails before a response exists (e.g. connection errors).
    r = None

    # Deliberately broad, but limited to Exception so that KeyboardInterrupt
    # and SystemExit are not swallowed and reported as download failures.
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            mr = MockRequest(
                url=os.path.join(settings.INSTALL_ROOT, 'alert', download_url))
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            s = requests.session()
            headers = {'User-Agent': 'CourtListener'}
            cookies = normalize_cookies(cookies)
            logger.info("Using cookies: %s" % cookies)
            try:
                r = s.get(download_url, headers=headers, cookies=cookies)
            except SSLError:
                # Washington has a certificate we don't understand.
                r = s.get(download_url, verify=False, headers=headers,
                          cookies=cookies)

            # test for empty files (thank you CA1)
            if not r.content:
                msg = 'EmptyFileError: %s\n%s' % (download_url,
                                                  traceback.format_exc())
                return msg, r

            # test for and follow meta redirects; this needs the session, so
            # it only applies to real (non-LOCAL) downloads.
            r = follow_redirections(r, s)
    except Exception:
        msg = 'DownloadingError: %s\n%s' % (download_url,
                                            traceback.format_exc())
        print(msg)
        return msg, r

    # Success!
    return '', r
def get_binary_content(download_url, cookies, method='GET'):
    """Download the file, covering a few special cases such as invalid SSL
    certificates and empty file errors.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
        They are passed through ``normalize_cookies`` before use.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
        item during testing. Anything other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors
        encountered. If blank, that indicates success. The second value is
        the response object containing the downloaded file; it is None when
        no URL was given or when the failure happened before any response
        was obtained.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url,
                                              traceback.format_exc())
        return msg, None

    # Pre-bind the response: the handler below returns it, and without this
    # an early failure would raise UnboundLocalError from inside the handler.
    r = None

    # Still a broad catch on purpose, but Exception (not a bare except) lets
    # KeyboardInterrupt and SystemExit propagate.
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            local_path = os.path.join(settings.INSTALL_ROOT, 'alert',
                                      download_url)
            mr = MockRequest(url=local_path)
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            s = requests.session()
            headers = {'User-Agent': 'CourtListener'}
            cookies = normalize_cookies(cookies)
            logger.info("Using cookies: %s" % cookies)
            try:
                r = s.get(download_url, headers=headers, cookies=cookies)
            except SSLError:
                # Washington has a certificate we don't understand.
                r = s.get(download_url, verify=False, headers=headers,
                          cookies=cookies)

            # test for empty files (thank you CA1)
            if not r.content:
                msg = 'EmptyFileError: %s\n%s' % (download_url,
                                                  traceback.format_exc())
                return msg, r

            # test for and follow meta redirects (requires the session, so
            # only done for real downloads)
            r = follow_redirections(r, s)
    except Exception:
        msg = 'DownloadingError: %s\n%s' % (download_url,
                                            traceback.format_exc())
        print(msg)
        return msg, r

    # Success!
    return '', r
def _download(self, request_dict=None):
    """Download the latest version of the Site and return its HTML tree.

    The response is also stored on the instance as ``self.r`` and its status
    code as ``self.status``.

    :param request_dict: Optional dict of extra keyword arguments passed
        through to the session's ``get``/``post`` call.
    :return: The tree produced by ``html.fromstring`` for the cleaned
        response text, with its links rewritten via ``self._link_repl``.
    :raises ValueError: If ``self.method`` is not 'GET', 'POST' or 'LOCAL'.
    """
    # A None default avoids sharing one dict object between all calls.
    if request_dict is None:
        request_dict = {}

    if self.method == 'POST':
        truncated_params = {
            k: trunc(v, 50, elipsize=True, elipsis='...[truncated]')
            for k, v in self.parameters.items()
        }
        logger.info("Now downloading case page at: %s (params: %s)" %
                    (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    # Get the response.
    s = requests.session()
    headers = {'User-Agent': 'Juriscraper'}
    if self.method == 'GET':
        r = s.get(self.url, headers=headers, **request_dict)
    elif self.method == 'POST':
        r = s.post(self.url, headers=headers, data=self.parameters,
                   **request_dict)
    elif self.method == 'LOCAL':
        mr = MockRequest(url=self.url)
        r = mr.get()
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `r`; fail here with a clear message instead.
        raise ValueError("Unsupported download method: %r" % (self.method,))

    # Provides a hook for inheriting objects to tweak the request object.
    self.tweak_request_object(r)

    # Throw an error if a bad status code is returned.
    r.raise_for_status()

    # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
    if r.encoding == 'ISO-8859-1':
        r.encoding = 'cp1252'

    # Provide the response in the Site object
    self.r = r
    self.status = r.status_code

    # Grab the content
    text = self._clean_text(r.text)
    html_tree = html.fromstring(text)
    html_tree.rewrite_links(self._link_repl)
    return html_tree
def _download(self, request_dict=None):
    """Download the latest version of the Site and return its parsed content.

    The response is also stored on the instance as ``self.r`` and its status
    code as ``self.status``.

    :param request_dict: Optional dict of extra keyword arguments passed
        through to the session's ``get``/``post`` call. A ``verify`` entry
        that is not None replaces the default certifi CA bundle. The dict
        supplied by the caller is left untouched.
    :return: ``r.json()`` when the response's content-type header contains
        'json'; otherwise the HTML tree made from the cleaned response text,
        with its links rewritten via ``self._link_repl``.
    :raises ValueError: If ``self.method`` is not 'GET', 'POST' or 'LOCAL'.
    """
    # Copy first: the 'verify' entry is removed below, and doing that on the
    # caller's own dict would lose the setting on any later call that reuses
    # the same dict.
    request_dict = dict(request_dict or {})

    if self.method == 'POST':
        truncated_params = {
            k: trunc(v, 50, ellipsis='...[truncated]')
            for k, v in self.parameters.items()
        }
        logger.info("Now downloading case page at: %s (params: %s)" %
                    (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    # Set up verify here and remove it from request_dict so you don't send
    # it to s.get or s.post in two kwargs. pop() also removes an explicit
    # verify=None, which would otherwise collide with the keyword below.
    verify = request_dict.pop('verify', None)
    if verify is None:
        verify = certifi.where()

    # Get the response.
    s = requests.session()
    s.mount('https://', self._get_adapter_instance())
    headers = {'User-Agent': 'Juriscraper'}
    if self.method == 'GET':
        r = s.get(
            self.url,
            headers=headers,
            verify=verify,
            **request_dict
        )
    elif self.method == 'POST':
        r = s.post(
            self.url,
            headers=headers,
            verify=verify,
            data=self.parameters,
            **request_dict
        )
    elif self.method == 'LOCAL':
        mr = MockRequest(url=self.url)
        r = mr.get()
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `r`; fail here with a clear message instead.
        raise ValueError("Unsupported download method: %r" % (self.method,))

    # Provides a hook for inheriting objects to tweak the request object.
    self.tweak_request_object(r)

    # Throw an error if a bad status code is returned.
    r.raise_for_status()

    # Tweak or set the encoding if needed
    r = self._set_encoding(r)

    # Provide the response in the Site object
    self.r = r
    self.status = r.status_code

    # Grab the content
    if 'json' in r.headers.get('content-type', ''):
        return r.json()

    text = self._clean_text(r.text)
    html_tree = self._make_html_tree(text)
    html_tree.rewrite_links(self._link_repl)
    return html_tree