def _download(self, request_dict={}):
    """Download the latest version of the Site's page.

    :param request_dict: extra keyword args forwarded to ``requests``
        (e.g. timeouts). A ``verify`` key, if present and not None,
        overrides the default certifi CA bundle.
    :return: parsed JSON when the response content-type is JSON,
        otherwise an lxml HTML tree whose links have been rewritten via
        ``self._link_repl``.
    """
    if self.method == 'POST':
        truncated_params = {}
        for k, v in self.parameters.iteritems():
            truncated_params[k] = trunc(v, 50, ellipsis='...[truncated]')
        logger.info("Now downloading case page at: %s (params: %s)" %
                    (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    # Work on a copy: the original `del request_dict['verify']` mutated the
    # caller's dict and, worse, the shared mutable default argument.
    request_dict = dict(request_dict)
    # Set up verify here and remove it from request_dict so you don't send
    # it to s.get or s.post in two kwargs. Popping unconditionally also
    # avoids a duplicate-kwarg TypeError when the caller passes verify=None.
    verify = request_dict.pop('verify', None)
    if verify is None:
        verify = certifi.where()

    # Get the response. Disallow redirects so they throw an error
    s = requests.session()
    s.mount('https://', self._get_adapter_instance())
    if self.method == 'GET':
        r = s.get(self.url,
                  headers={'User-Agent': 'Juriscraper'},
                  verify=verify,
                  **request_dict)
    elif self.method == 'POST':
        r = s.post(self.url,
                   headers={'User-Agent': 'Juriscraper'},
                   verify=verify,
                   data=self.parameters,
                   **request_dict)
    elif self.method == 'LOCAL':
        mr = MockRequest(url=self.url)
        r = mr.get()

    # Provides a hook for inheriting objects to tweak the request object.
    self.tweak_request_object(r)

    # Throw an error if a bad status code is returned.
    r.raise_for_status()

    # Tweak or set the encoding if needed
    r = self._set_encoding(r)

    # Provide the response in the Site object
    self.r = r
    self.status = r.status_code

    # Grab the content
    if 'json' in r.headers.get('content-type', ''):
        return r.json()
    else:
        text = self._clean_text(r.text)
        html_tree = self._make_html_tree(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
def scrape_court(site, binaries=False):
    """Calls the requested court(s), gets its content, then throws it away.

    Note that this is a very basic caller lacking important functionality,
    such as:
     - checking whether the HTML of the page has changed since last visited
     - checking whether the downloaded content is already in your data store
     - saving anything at all

    Nonetheless, this caller is useful for testing, and for demonstrating
    some basic pitfalls that a caller will run into.

    :param site: an iterable Site object yielding dicts of scraped metadata.
    :param binaries: when True, also download each item's binary document and
        run it through content extraction/cleanup.
    :return: dict with the item ``count`` and a ``defaultdict`` of exception
        category -> list of offending download URLs.
    """
    # Collect failures by category instead of aborting the crawl.
    exceptions = defaultdict(list)
    for item in site:
        # First turn the download urls into a utf-8 byte string
        item_download_urls = item["download_urls"].encode("utf-8")
        # Percent encode URLs (this is a Python wart)
        download_url = six_parse.quote(
            item_download_urls,
            safe="%/:=&?~#+!$,;'@()*[]")
        if binaries:
            try:
                opener = six_request.build_opener()
                # Replay the scraper's session cookies so the document host
                # treats us like the original session.
                for cookie_dict in site.cookies:
                    opener.addheaders.append((
                        "Cookie",
                        "%s=%s" % (cookie_dict["name"], cookie_dict["value"]),
                    ))
                data = opener.open(download_url).read()
                # test for empty files (thank you CA1)
                if len(data) == 0:
                    exceptions["EmptyFileError"].append(download_url)
                    v_print(3, "EmptyFileError: %s" % download_url)
                    v_print(3, traceback.format_exc())
                    continue
            except Exception:
                exceptions["DownloadingError"].append(download_url)
                v_print(3, "DownloadingError: %s" % download_url)
                v_print(3, traceback.format_exc())
                continue
            # Extract the data using e.g. antiword, pdftotext, etc., then
            # clean it up.
            data = extract_doc_content(data)
            data = site.cleanup_content(data)
        # Normally, you'd do your save routines here...
        v_print(1, "\nAdding new item:")
        for k, v in item.items():
            if isinstance(v, six.text_type):
                value = trunc(v, 200, ellipsis="...")
                v_print(1, ' %s: "%s"' % (k, value.encode("utf-8")))
            else:
                # Dates and such...
                v_print(1, " %s: %s" % (k, v))
    v_print(
        3,
        "\n%s: Successfully crawled %d items." % (site.court_id, len(site)))
    return {"count": len(site), "exceptions": exceptions}
def _download(self, request_dict=None):
    """Download the latest version of this Site's page.

    :param request_dict: optional dict of extra keyword args forwarded to
        ``requests`` (e.g. timeouts). Defaults to no extra kwargs; the
        previous mutable-default ``{}`` was an anti-pattern (one dict shared
        across all calls).
    :return: the lxml HTML tree of the downloaded page, with links rewritten
        through ``self._link_repl``.
    """
    if request_dict is None:
        request_dict = {}
    if self.method == 'POST':
        truncated_params = {}
        for k, v in self.parameters.iteritems():
            # NOTE(review): sibling versions of this method spell these
            # kwargs `ellipsis=` -- confirm against trunc()'s signature.
            truncated_params[k] = trunc(v, 50, elipsize=True,
                                        elipsis='...[truncated]')
        logger.info("Now downloading case page at: %s (params: %s)" %
                    (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    # Get the response. Disallow redirects so they throw an error
    s = requests.session()
    if self.method == 'GET':
        r = s.get(self.url,
                  headers={'User-Agent': 'Juriscraper'},
                  **request_dict)
    elif self.method == 'POST':
        r = s.post(self.url,
                   headers={'User-Agent': 'Juriscraper'},
                   data=self.parameters,
                   **request_dict)
    elif self.method == 'LOCAL':
        mr = MockRequest(url=self.url)
        r = mr.get()

    # Provides a hook for inheriting objects to tweak the request object.
    self.tweak_request_object(r)

    # Throw an error if a bad status code is returned.
    r.raise_for_status()

    # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
    if r.encoding == 'ISO-8859-1':
        r.encoding = 'cp1252'

    # Provide the response in the Site object
    self.r = r
    self.status = r.status_code

    if r.encoding is None:
        # Requests detects the encoding when the item is GET'ed using
        # HTTP headers, and then when r.text is accessed, if the encoding
        # hasn't been set by that point. By setting the encoding here, we
        # ensure that it's done by cchardet, if it hasn't been done with
        # HTTP headers. This way it is done before r.text is accessed
        # (which would do it with vanilla chardet). This is a big
        # performance boon, and can be removed once requests is upgraded
        # (https://github.com/kennethreitz/requests/pull/814/)
        r.encoding = chardet.detect(r.content)['encoding']

    # Grab the content
    text = self._clean_text(r.text)
    html_tree = html.fromstring(text)
    html_tree.rewrite_links(self._link_repl)
    return html_tree
def _download(self, request_dict={}):
    """Download the latest version of the Site's page.

    :param request_dict: extra keyword args forwarded to ``requests``.
        A ``verify`` key, if present and not None, overrides the default
        certifi CA bundle.
    :return: parsed JSON when the response content-type is JSON, otherwise
        an lxml HTML tree with links rewritten via ``self._link_repl``.
    """
    if self.method == "POST":
        truncated_params = {}
        for k, v in self.parameters.iteritems():
            truncated_params[k] = trunc(v, 50, ellipsis="...[truncated]")
        logger.info("Now downloading case page at: %s (params: %s)" %
                    (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    # Work on a copy: the original `del request_dict["verify"]` mutated the
    # caller's dict and the shared mutable default argument.
    request_dict = dict(request_dict)
    # Set up verify here and remove it from request_dict so you don't send
    # it to s.get or s.post in two kwargs. Popping unconditionally also
    # avoids a duplicate-kwarg TypeError when the caller passes verify=None.
    verify = request_dict.pop("verify", None)
    if verify is None:
        verify = certifi.where()

    # Get the response. Disallow redirects so they throw an error
    s = requests.session()
    s.mount("https://", self._get_adapter_instance())
    if self.method == "GET":
        r = s.get(self.url,
                  headers={"User-Agent": "Juriscraper"},
                  verify=verify,
                  **request_dict)
    elif self.method == "POST":
        r = s.post(
            self.url,
            headers={"User-Agent": "Juriscraper"},
            verify=verify,
            data=self.parameters,
            **request_dict
        )
    elif self.method == "LOCAL":
        mr = MockRequest(url=self.url)
        r = mr.get()

    # Provides a hook for inheriting objects to tweak the request object.
    self.tweak_request_object(r)

    # Throw an error if a bad status code is returned.
    r.raise_for_status()

    # Tweak or set the encoding if needed
    r = self._set_encoding(r)

    # Provide the response in the Site object
    self.r = r
    self.status = r.status_code

    # Grab the content
    if "json" in r.headers.get("content-type", ""):
        return r.json()
    else:
        text = self._clean_text(r.text)
        html_tree = self._make_html_tree(text)
        html_tree.rewrite_links(self._link_repl)
        return html_tree
def scrape_court(site, binaries=False):
    """Calls the requested court(s), gets its content, then throws it away.

    Note that this is a very basic caller lacking important functionality,
    such as:
     - checking whether the HTML of the page has changed since last visited
     - checking whether the downloaded content is already in your data store
     - saving anything at all

    Nonetheless, this caller is useful for testing, and for demonstrating
    some basic pitfalls that a caller will run into.

    :param site: an iterable Site object yielding dicts of scraped metadata.
    :param binaries: when True, also download each item's binary document and
        run it through content extraction/cleanup.
    :return: None; results are only printed via v_print.
    """
    for item in site:
        # Percent encode URLs (this is a Python wart)
        download_url = urllib2.quote(item['download_urls'],
                                     safe="%/:=&?~#+!$,;'@()*[]")

        if binaries:
            try:
                opener = urllib2.build_opener()
                # Replay the scraper's session cookies for the document host.
                for cookie_dict in site.cookies:
                    opener.addheaders.append(
                        ("Cookie", "%s=%s" % (cookie_dict['name'],
                                              cookie_dict['value'])))
                data = opener.open(download_url).read()
                # test for empty files (thank you CA1)
                if len(data) == 0:
                    v_print(3, 'EmptyFileError: %s' % download_url)
                    v_print(3, traceback.format_exc())
                    continue
            except Exception:
                # Best-effort caller: log the failure and keep crawling.
                v_print(3, 'DownloadingError: %s' % download_url)
                v_print(3, traceback.format_exc())
                continue

            # Extract the data using e.g. antiword, pdftotext, etc., then
            # clean it up.
            data = extract_doc_content(data)
            data = site.cleanup_content(data)

        # Normally, you'd do your save routines here...
        v_print(1, '\nAdding new item:')
        for k, v in item.items():
            if isinstance(v, six.text_type):
                value = trunc(v, 200, ellipsis='...')
                v_print(1, ' %s: "%s"' % (k, value.encode('utf-8')))
            else:
                # Dates and such...
                v_print(1, ' %s: %s' % (k, v))

    v_print(3, '\n%s: Successfully crawled %d items.' % (site.court_id, len(site)))
def _download(self, request_dict={}):
    """Fetch the current version of this Site's page.

    Returns the page as an lxml HTML tree with its links rewritten through
    ``self._link_repl``; the raw response is also stashed on ``self.r`` and
    its status code on ``self.status``.
    """
    if self.method == 'POST':
        # Log POST parameters with long values shortened for readability.
        truncated_params = dict(
            (name, trunc(value, 50, elipsize=True,
                         elipsis='...[truncated]'))
            for name, value in self.parameters.iteritems()
        )
        logger.info("Now downloading case page at: %s (params: %s)" % (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    session = requests.session()
    if self.method == 'GET':
        response = session.get(self.url,
                               headers={'User-Agent': 'Juriscraper'},
                               **request_dict)
    elif self.method == 'POST':
        response = session.post(self.url,
                                headers={'User-Agent': 'Juriscraper'},
                                data=self.parameters,
                                **request_dict)
    elif self.method == 'LOCAL':
        response = MockRequest(url=self.url).get()

    # Hook for subclasses to adjust the response before processing.
    self.tweak_request_object(response)

    # Bail out loudly on HTTP error statuses.
    response.raise_for_status()

    # cp1252 is a superset of ISO-8859-1, so prefer it when decoding.
    if response.encoding == 'ISO-8859-1':
        response.encoding = 'cp1252'

    # Expose the raw response on the Site object.
    self.r = response
    self.status = response.status_code

    cleaned = self._clean_text(response.text)
    tree = html.fromstring(cleaned)
    tree.rewrite_links(self._link_repl)
    return tree
def scrape_court(site, binaries=False):
    """Calls the requested court(s), gets its content, then throws it away.

    Note that this is a very basic caller lacking important functionality,
    such as:
     - checking whether the HTML of the page has changed since last visited
     - checking whether the downloaded content is already in your data store
     - saving anything at all

    Nonetheless, this caller is useful for testing, and for demonstrating
    some basic pitfalls that a caller will run into.

    :param site: an iterable Site object yielding dicts of scraped metadata.
    :param binaries: when True, also download each item's binary document and
        run it through content extraction/cleanup.
    :return: None; results are only printed via v_print.
    """
    for item in site:
        # Percent encode URLs (this is a Python wart)
        download_url = urllib2.quote(item['download_urls'],
                                     safe="%/:=&?~#+!$,;'@()*[]")

        if binaries:
            try:
                opener = urllib2.build_opener()
                # Replay the scraper's session cookies for the document host.
                for cookie_dict in site.cookies:
                    opener.addheaders.append(
                        ("Cookie", "%s=%s" % (cookie_dict['name'],
                                              cookie_dict['value'])))
                data = opener.open(download_url).read()
                # test for empty files (thank you CA1)
                if len(data) == 0:
                    v_print(3, 'EmptyFileError: %s' % download_url)
                    v_print(3, traceback.format_exc())
                    continue
            except Exception:
                # Best-effort caller: log the failure and keep crawling.
                v_print(3, 'DownloadingError: %s' % download_url)
                v_print(3, traceback.format_exc())
                continue

            # Extract the data using e.g. antiword, pdftotext, etc., then
            # clean it up.
            data = extract_doc_content(data)
            data = site.cleanup_content(data)

        # Normally, you'd do your save routines here...
        v_print(1, 'Adding new item:')
        for k, v in item.items():
            # Was `type(v) == unicode`: isinstance also accepts unicode
            # subclasses and matches the idiom used by the sibling versions
            # of this function.
            if isinstance(v, unicode):
                value = trunc(v, 200, ellipsis='...')
                v_print(1, ' %s: "%s"' % (k, value.encode('utf-8')))
            else:
                # Dates and such...
                v_print(1, ' %s: %s' % (k, v))

    v_print(3, '%s: Successfully crawled.' % site.court_id)
def _download(self, request_dict={}):
    """Download the latest version of Site"""
    self.downloader_executed = True
    if self.method == 'POST':
        # Shorten long parameter values before logging them.
        truncated_params = {
            k: trunc(v, 50, ellipsis='...[truncated]')
            for k, v in self.parameters.items()
        }
        logger.info("Now downloading case page at: %s (params: %s)" %
                    (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    self._process_request_parameters(request_dict)

    # Dispatch to the request helper matching the configured HTTP method.
    handlers = {
        'GET': self._request_url_get,
        'POST': self._request_url_post,
        'LOCAL': self._request_url_mock,
    }
    handler = handlers.get(self.method)
    if handler is not None:
        handler(self.url)

    self._post_process_response()
    return self._return_response_text_object()
def _download(self, request_dict={}):
    """Download the latest version of Site"""
    self.downloader_executed = True
    if self.method == 'POST':
        # Shorten long parameter values before logging them.
        truncated_params = {
            key: trunc(val, 50, ellipsis='...[truncated]')
            for key, val in self.parameters.items()
        }
        logger.info("Now downloading case page at: %s (params: %s)" %
                    (self.url, truncated_params))
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    self._process_request_parameters(request_dict)

    # Real HTTP methods take priority; only when neither matches do we
    # consult test mode and serve a mocked response.
    if self.method == 'GET':
        self._request_url_get(self.url)
    elif self.method == 'POST':
        self._request_url_post(self.url)
    elif self.test_mode_enabled():
        self._request_url_mock(self.url)

    self._post_process_response()
    return self._return_response_text_object()
def get_title(self, referer_id):
    """Get the HTML title for a page, trying again if failures occur.

    Idea here is that somebody will create a new page that embeds one of our
    maps. As soon as they do, we'll get an HTTP referer sent to us, which is
    great. Unfortunately, in many cases, the HTTP referer we receive is that
    of an in progress page or similar, NOT the page that's actually live.
    Thus, what we do is try the URL over and over, until we find success.

    If a title is found, the admins are notified.

    If not, the item is deleted (this is OK, however, b/c it'll be recreated
    if it should have existed).

    :param referer_id: primary key of the Referer row to resolve a title for.
    """
    # Set the exponential back off in case we need it, starting at 15 minutes,
    # then 30, 60, 120...
    countdown = 15 * 60 * (2**self.request.retries)
    retried_exceeded = (self.request.retries >= self.max_retries)

    referer = Referer.objects.get(pk=referer_id)
    if blacklisted_url(referer.url):
        # Known-bad domain; nothing to do.
        return

    try:
        r = requests.get(
            referer.url,
            headers={'User-Agent': "CourtListener"},
            verify=False,  # Integrity of a referer's referent is not important.
        )
    except MissingSchema:
        # Malformed referer URL (e.g. no http://); give up silently.
        return
    except TooManyRedirects:
        return

    try:
        r.raise_for_status()
    except HTTPError as exc:
        if retried_exceeded:
            # We're not wanted here. Maybe we'll have better luck another time.
            return
        raise self.retry(exc=exc, countdown=countdown)

    html_tree = html.fromstring(r.text)
    try:
        title = getattr(html_tree.xpath('//title')[0], 'text', '')
        if title is not None:
            title = title.strip()
    except IndexError:
        # Page has no <title> element at all.
        title = ''

    if title:
        referer.page_title = trunc(
            title,
            referer._meta.get_field('page_title').max_length,
        )
        referer.save()

        if new_title_for_viz(referer):
            # Only send the email if we haven't seen this page title before for
            # this visualization.
            email = emails['referer_detected']
            email_body = email['body'] % (
                referer.url,
                referer.page_title,
                reverse('admin:visualizations_referer_change',
                        args=(referer.pk, )))
            send_mail(email['subject'], email_body, email['from'],
                      email['to'])
    else:
        try:
            # Create an exception to catch.
            # (The raise/except dance exists so self.retry() receives a real
            # exception instance via exc=.)
            raise Exception("Couldn't get title from HTML")
        except Exception as exc:
            if retried_exceeded:
                # We couldn't get the title. Let it go.
                return
            raise self.retry(exc=exc, countdown=countdown)
def scrape_and_parse():
    """Traverses the bulk data from public.resource.org, and puts them in the
    DB.

    Probably lots of ways to go about this, but I think the easiest will be
    the following:
     - look at the index page of all volumes, and follow all the links it has.
     - for each volume, look at its index page, and follow the link to all
       cases
     - for each case, collect information wisely.
     - put it all in the DB

    Interactive importer: where automatic extraction fails it prompts on
    stdin and records the answer in "fix files" so reruns are unattended.
    Progress (volume and case indexes) is checkpointed to disk so the script
    can resume where it left off.
    """
    # begin by loading up the fix files into memory
    court_fix_dict, date_fix_dict, case_name_short_dict = load_fix_files()

    results = []
    DEBUG = 4
    # Set to False to disable automatic browser usage. Else, set to the
    # command you want to run, e.g. 'firefox'
    BROWSER = False
    court_fix_file = open('../logs/f2_court_fix_file.txt', 'a')
    date_fix_file = open('../logs/f2_date_fix_file.txt', 'a')
    case_name_short_fix_file = open('../logs/f2_short_case_name_fix_file.txt', 'a')
    vol_file = open('../logs/vol_file.txt', 'r+')
    case_file = open('../logs/case_file.txt', 'r+')

    url = "file://%s/Resource.org/F2/index.html" % INSTALL_ROOT
    openedURL = urllib2.urlopen(url)
    content = openedURL.read()
    openedURL.close()
    tree = fromstring(content)

    volumeLinks = tree.xpath('//table/tbody/tr/td[1]/a')

    try:
        # Resume from the last checkpointed volume index.
        i = int(vol_file.readline())
    except ValueError:
        # the volume file is emtpy or otherwise failing.
        i = 0
    vol_file.close()

    if DEBUG >= 1:
        print "Number of remaining volumes is: %d" % (len(volumeLinks) - i)

    # used later, needs a default value.
    saved_caseDate = None
    saved_court = None
    while i < len(volumeLinks):
        # we iterate over every case in the volume
        volumeURL = volumeLinks[i].text + "/index.html"
        volumeURL = urljoin(url, volumeURL)
        if DEBUG >= 1:
            print "Current volumeURL is: %s" % volumeURL

        openedVolumeURL = urllib2.urlopen(volumeURL)
        content = openedVolumeURL.read()
        volumeTree = fromstring(content)
        openedVolumeURL.close()

        # Parallel column lists: link, date cell, sha1 for each case row.
        caseLinks = volumeTree.xpath('//table/tbody/tr/td[1]/a')
        caseDates = volumeTree.xpath('//table/tbody/tr/td[2]')
        sha1Hashes = volumeTree.xpath('//table/tbody/tr/td[3]/a')

        # The following loads a serialized placeholder from disk.
        try:
            j = int(case_file.readline())
        except ValueError:
            j = 0
        case_file.close()
        while j < len(caseLinks):
            # iterate over each case, throwing it in the DB
            if DEBUG >= 1:
                print ''
            # like the scraper, we begin with the caseLink field (relative for
            # now, not absolute)
            caseLink = caseLinks[j].get('href')

            # sha1 is easy
            sha1Hash = sha1Hashes[j].text
            if DEBUG >= 4:
                print "SHA1 is: %s" % sha1Hash

            # using the caselink from above, and the volumeURL, we can get the
            # html
            absCaseLink = urljoin(volumeURL, caseLink)
            html = urllib2.urlopen(absCaseLink).read()
            htmlTree = fromstring(html)
            bodyContents = htmlTree.xpath('//body/*[not(@id="footer")]')

            body = ""
            bodyText = ""
            for element in bodyContents:
                body += tostring(element)
                try:
                    bodyText += tostring(element, method='text')
                except UnicodeEncodeError:
                    # Happens with odd characters. Simply pass this iteration.
                    pass
            if DEBUG >= 5:
                print body
                print bodyText

            # need to figure out the court ID
            try:
                courtPs = htmlTree.xpath('//p[@class = "court"]')
                # Often the court ends up in the parties field.
                partiesPs = htmlTree.xpath("//p[@class= 'parties']")
                court = ""
                for courtP in courtPs:
                    court += tostring(courtP).lower()
                for party in partiesPs:
                    court += tostring(party).lower()
            except IndexError:
                # Fall back to the fix file, then to a human.
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink],
                                         shell=False).communicate()
                    court = raw_input("Please input court name (e.g. \"First Circuit of Appeals\"): ").lower()
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))

            # Map the scraped court text onto a CourtListener court code.
            if ('first' in court) or ('ca1' == court):
                court = 'ca1'
            elif ('second' in court) or ('ca2' == court):
                court = 'ca2'
            elif ('third' in court) or ('ca3' == court):
                court = 'ca3'
            elif ('fourth' in court) or ('ca4' == court):
                court = 'ca4'
            elif ('fifth' in court) or ('ca5' == court):
                court = 'ca5'
            elif ('sixth' in court) or ('ca6' == court):
                court = 'ca6'
            elif ('seventh' in court) or ('ca7' == court):
                court = 'ca7'
            elif ('eighth' in court) or ('ca8' == court):
                court = 'ca8'
            elif ('ninth' in court) or ('ca9' == court):
                court = 'ca9'
            elif ("tenth" in court) or ('ca10' == court):
                court = 'ca10'
            elif ("eleventh" in court) or ('ca11' == court):
                court = 'ca11'
            elif ('columbia' in court) or ('cadc' == court):
                court = 'cadc'
            elif ('federal' in court) or ('cafc' == court):
                court = 'cafc'
            elif ('patent' in court) or ('ccpa' == court):
                court = 'ccpa'
            elif (('emergency' in court) and ('temporary' not in court)) or ('eca' == court):
                court = 'eca'
            elif ('claims' in court) or ('uscfc' == court):
                court = 'uscfc'
            else:
                # No luck extracting the court name. Try the fix file.
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    # Not yet in the fix file. Check if it's a crazy ca5 case
                    court = ''
                    ca5courtPs = htmlTree.xpath('//p[@class = "center"]')
                    for ca5courtP in ca5courtPs:
                        court += tostring(ca5courtP).lower()
                    if 'fifth circuit' in court:
                        court = 'ca5'
                    else:
                        court = False
                if not court:
                    # Still no luck. Ask for input, then append it to
                    # the fix file.
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink],
                                         shell=False).communicate()
                    court = raw_input("Unknown court. Input the court code to proceed successfully [%s]: " % saved_court)
                    court = court or saved_court
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))
                    saved_court = court

            court = Court.objects.get(pk=court)
            if DEBUG >= 4:
                print "Court is: %s" % court

            # next: west_cite, docket_number and caseName. Full casename is gotten later.
            west_cite = caseLinks[j].text
            docket_number = absCaseLink.split('.')[-2]
            caseName = caseLinks[j].get('title')

            caseName, precedential_status = exceptional_cleaner(caseName)
            cite, new = hasDuplicate(caseName, west_cite, docket_number)
            if cite.caseNameShort == '':
                # No luck getting the case name
                savedCaseNameShort = check_fix_list(sha1Hash, case_name_short_dict)
                if not savedCaseNameShort:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink],
                                         shell=False).communicate()
                    caseName = raw_input("Short casename: ")
                    cite.caseNameShort = trunc(caseName, 100)
                    cite.caseNameFull = caseName
                    case_name_short_fix_file.write("%s|%s\n" % (sha1Hash, caseName))
                else:
                    # We got both the values from the save files. Use 'em.
                    cite.caseNameShort = trunc(savedCaseNameShort, 100)
                    cite.caseNameFull = savedCaseNameShort

            # The slug needs to be done here, b/c it is only done automatically
            # the first time the citation is saved, and this will be
            # at least the second.
            cite.slug = slugify(trunc(cite.caseNameShort, 75))
            cite.save()

            if DEBUG >= 4:
                print "precedential_status: " + precedential_status
                print "west_cite: " + cite.west_cite
                print "caseName: " + cite.caseNameFull

            # date is kinda tricky...details here:
            # http://pleac.sourceforge.net/pleac_python/datesandtimes.html
            rawDate = caseDates[j].find('a')
            try:
                if rawDate is not None:
                    # Special cases
                    # (hashes with known-broken dates in the source data)
                    if sha1Hash == 'f0da421f117ef16223d7e61d1e4e5526036776e6':
                        date_text = 'August 28, 1980'
                    elif sha1Hash == '8cc192eaacd1c544b5e8ffbd751d9be84c311932':
                        date_text = 'August 16, 1985'
                    elif sha1Hash == 'd19bce155f72a9f981a12efabd760a35e1e7dbe7':
                        date_text = 'October 12, 1979'
                    elif sha1Hash == '9f7583cf0d46ddc9cad4e7943dd775f9e9ea99ff':
                        date_text = 'July 30, 1980'
                    elif sha1Hash == '211ea81a4ab4132483c483698d2a40f4366f5640':
                        date_text = 'November 3, 1981'
                    elif sha1Hash == 'eefb344034461e9c6912689677a32cd18381d5c2':
                        date_text = 'July 28, 1983'
                    else:
                        date_text = rawDate.text
                    try:
                        caseDate = datetime.datetime(*time.strptime(date_text, "%B, %Y")[0:5])
                    except (ValueError, TypeError):
                        caseDate = datetime.datetime(*time.strptime(date_text, "%B %d, %Y")[0:5])
                else:
                    # No value was found. Throw an exception.
                    raise ValueError
            except:
                # No date provided.
                # NOTE(review): bare except also swallows KeyboardInterrupt
                # and SystemExit -- confirm intended.
                try:
                    # Try to get it from the saved list
                    caseDate = datetime.datetime(*time.strptime(check_fix_list(sha1Hash, date_fix_dict), "%B %d, %Y")[0:5])
                except:
                    caseDate = False

            if not caseDate:
                # Parse out the dates with debug set to false.
                try:
                    dates = parse_dates(bodyText, False)
                except OverflowError:
                    # Happens when we try to make a date from a very large number
                    dates = []
                try:
                    first_date_found = dates[0]
                except IndexError:
                    # No dates found.
                    first_date_found = False
                if first_date_found == saved_caseDate:
                    # High likelihood of date being correct. Use it.
                    caseDate = saved_caseDate
                else:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink],
                                         shell=False).communicate()
                    print "Unknown date. Possible options are:"
                    try:
                        print " 1) %s" % saved_caseDate.strftime("%B %d, %Y")
                    except AttributeError:
                        # Happens on first iteration when saved_caseDate has no strftime attribute.
                        try:
                            saved_caseDate = dates[0]
                            print " 1) %s" % saved_caseDate.strftime(
                                "%B %d, %Y")
                        except IndexError:
                            # Happens when dates has no values.
                            print " No options available."
                    for k, date in enumerate(dates[0:4]):
                        if date.year >= 1900:
                            # strftime can't handle dates before 1900.
                            print " %s) %s" % (k + 2, date.strftime("%B %d, %Y"))
                    choice = raw_input("Enter the date or an option to proceed [1]: ")
                    choice = choice or 1
                    if str(choice) == '1':
                        # The user chose the default. Use the saved value from the last case
                        caseDate = saved_caseDate
                    elif choice in ['2', '3', '4', '5']:
                        # The user chose an option between 2 and 5. Use it.
                        caseDate = dates[int(choice) - 2]
                    else:
                        # The user typed a new date. Use it.
                        caseDate = datetime.datetime(*time.strptime(choice, "%B %d, %Y")[0:5])
                    date_fix_file.write("%s|%s\n" % (sha1Hash, caseDate.strftime("%B %d, %Y")))

            # Used during the next iteration as the default value
            saved_caseDate = caseDate

            if DEBUG >= 3:
                print "caseDate is: %s" % caseDate

            try:
                doc, created = Document.objects.get_or_create(
                    sha1=sha1Hash, court=court)
            except MultipleObjectsReturned:
                # this shouldn't happen now that we're using SHA1 as the dup
                # check, but the old data is problematic, so we must catch this.
                # NOTE(review): `doc` is left unbound here, so the
                # `if not created` branch below will raise NameError when it
                # prints `doc` -- confirm intended.
                created = False

            if created:
                # we only do this if it's new
                doc.html = body
                doc.sha1 = sha1Hash
                doc.download_url = "http://bulk.resource.org/courts.gov/c/F2/"\
                    + str(i + 178) + "/" + caseLink
                doc.date_filed = caseDate
                doc.source = "R"
                doc.precedential_status = precedential_status

                doc.citation = cite
                doc.save()

            if not created:
                # something is afoot. Throw a big error.
                print "Duplicate found at volume " + str(i + 1) + \
                    " and row " + str(j + 1) + "!!!!"
                print "Found document %s in the database with doc id of %d!" % (doc, doc.pk)
                exit(1)

            # save our location within the volume.
            j += 1
            case_file = open('../logs/case_file.txt', 'w')
            case_file.write(str(j))
            case_file.close()

        # save the last volume completed.
        i += 1
        vol_file = open('../logs/vol_file.txt', 'w')
        vol_file.write(str(i))
        vol_file.close()

        # Clear query cache, as it presents a memory leak
        db.reset_queries()

    return 0
def scrape_and_parse():
    """Traverses the dumps from resource.org, and puts them in the DB.

    Probably lots of ways to go about this, but I think the easiest will be
    the following:
     - look at the index page of all volumes, and follow all the links it has.
     - for each volume, look at its index page, and follow the link to all
       cases
     - for each case, collect information wisely.
     - put it all in the DB

    NOTE(review): this copy of the function is truncated in this file -- it
    ends at a dangling ``except:`` clause below and cannot run as-is.
    """
    # begin by loading up the fix files into memory
    court_fix_dict, date_fix_dict, case_name_short_dict = load_fix_files()

    results = []
    DEBUG = 4
    # Set to False to disable automatic browser usage. Else, set to the
    # command you want to run, e.g. 'firefox'
    BROWSER = False
    court_fix_file = open('../logs/f2_court_fix_file.txt', 'a')
    date_fix_file = open('../logs/f2_date_fix_file.txt', 'a')
    case_name_short_fix_file = open('../logs/f2_short_case_name_fix_file.txt', 'a')
    vol_file = open('../logs/vol_file.txt', 'r+')
    case_file = open('../logs/case_file.txt', 'r+')

    url = "file://%s/Resource.org/F2/index.html" % INSTALL_ROOT
    openedURL = urllib2.urlopen(url)
    content = openedURL.read()
    openedURL.close()
    tree = fromstring(content)

    volumeLinks = tree.xpath('//table/tbody/tr/td[1]/a')

    try:
        # Resume from the last checkpointed volume index.
        i = int(vol_file.readline())
    except ValueError:
        # the volume file is emtpy or otherwise failing.
        i = 0
    vol_file.close()

    if DEBUG >= 1:
        print "Number of remaining volumes is: %d" % (len(volumeLinks) - i)

    # used later, needs a default value.
    saved_caseDate = None
    saved_court = None
    while i < len(volumeLinks):
        # we iterate over every case in the volume
        volumeURL = volumeLinks[i].text + "/index.html"
        volumeURL = urljoin(url, volumeURL)
        if DEBUG >= 1:
            print "Current volumeURL is: %s" % volumeURL

        openedVolumeURL = urllib2.urlopen(volumeURL)
        content = openedVolumeURL.read()
        volumeTree = fromstring(content)
        openedVolumeURL.close()

        # Parallel column lists: link, date cell, sha1 for each case row.
        caseLinks = volumeTree.xpath('//table/tbody/tr/td[1]/a')
        caseDates = volumeTree.xpath('//table/tbody/tr/td[2]')
        sha1Hashes = volumeTree.xpath('//table/tbody/tr/td[3]/a')

        # The following loads a serialized placeholder from disk.
        try:
            j = int(case_file.readline())
        except ValueError:
            j = 0
        case_file.close()
        while j < len(caseLinks):
            # iterate over each case, throwing it in the DB
            if DEBUG >= 1:
                print ''
            # like the scraper, we begin with the caseLink field (relative for
            # now, not absolute)
            caseLink = caseLinks[j].get('href')

            # sha1 is easy
            sha1Hash = sha1Hashes[j].text
            if DEBUG >= 4:
                print "SHA1 is: %s" % sha1Hash

            # using the caselink from above, and the volumeURL, we can get the
            # html
            absCaseLink = urljoin(volumeURL, caseLink)
            html = urllib2.urlopen(absCaseLink).read()
            htmlTree = fromstring(html)
            bodyContents = htmlTree.xpath('//body/*[not(@id="footer")]')

            body = ""
            bodyText = ""
            for element in bodyContents:
                body += tostring(element)
                try:
                    bodyText += tostring(element, method='text')
                except UnicodeEncodeError:
                    # Happens with odd characters. Simply pass this iteration.
                    pass
            if DEBUG >= 5:
                print body
                print bodyText

            # need to figure out the court ID
            try:
                courtPs = htmlTree.xpath('//p[@class = "court"]')
                # Often the court ends up in the parties field.
                partiesPs = htmlTree.xpath("//p[@class= 'parties']")
                court = ""
                for courtP in courtPs:
                    court += tostring(courtP).lower()
                for party in partiesPs:
                    court += tostring(party).lower()
            except IndexError:
                # Fall back to the fix file, then to a human.
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink],
                                         shell=False).communicate()
                    court = raw_input("Please input court name (e.g. \"First Circuit of Appeals\"): ").lower()
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))

            # Map the scraped court text onto a CourtListener court code.
            if ('first' in court) or ('ca1' == court):
                court = 'ca1'
            elif ('second' in court) or ('ca2' == court):
                court = 'ca2'
            elif ('third' in court) or ('ca3' == court):
                court = 'ca3'
            elif ('fourth' in court) or ('ca4' == court):
                court = 'ca4'
            elif ('fifth' in court) or ('ca5' == court):
                court = 'ca5'
            elif ('sixth' in court) or ('ca6' == court):
                court = 'ca6'
            elif ('seventh' in court) or ('ca7' == court):
                court = 'ca7'
            elif ('eighth' in court) or ('ca8' == court):
                court = 'ca8'
            elif ('ninth' in court) or ('ca9' == court):
                court = 'ca9'
            elif ("tenth" in court) or ('ca10' == court):
                court = 'ca10'
            elif ("eleventh" in court) or ('ca11' == court):
                court = 'ca11'
            elif ('columbia' in court) or ('cadc' == court):
                court = 'cadc'
            elif ('federal' in court) or ('cafc' == court):
                court = 'cafc'
            elif ('patent' in court) or ('ccpa' == court):
                court = 'ccpa'
            elif (('emergency' in court) and ('temporary' not in court)) or ('eca' == court):
                court = 'eca'
            elif ('claims' in court) or ('uscfc' == court):
                court = 'uscfc'
            else:
                # No luck extracting the court name. Try the fix file.
                court = check_fix_list(sha1Hash, court_fix_dict)
                if not court:
                    # Not yet in the fix file. Check if it's a crazy ca5 case
                    court = ''
                    ca5courtPs = htmlTree.xpath('//p[@class = "center"]')
                    for ca5courtP in ca5courtPs:
                        court += tostring(ca5courtP).lower()
                    if 'fifth circuit' in court:
                        court = 'ca5'
                    else:
                        court = False
                if not court:
                    # Still no luck. Ask for input, then append it to
                    # the fix file.
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink],
                                         shell=False).communicate()
                    court = raw_input("Unknown court. Input the court code to proceed successfully [%s]: " % saved_court)
                    court = court or saved_court
                    court_fix_file.write("%s|%s\n" % (sha1Hash, court))
                    saved_court = court

            court = Court.objects.get(pk=court)
            if DEBUG >= 4:
                print "Court is: %s" % court

            # next: west_cite, docket_number and caseName. Full casename is gotten later.
            west_cite = caseLinks[j].text
            docket_number = absCaseLink.split('.')[-2]
            caseName = caseLinks[j].get('title')

            caseName, precedential_status = exceptional_cleaner(caseName)
            cite, new = hasDuplicate(caseName, west_cite, docket_number)
            if cite.caseNameShort == '':
                # No luck getting the case name
                savedCaseNameShort = check_fix_list(sha1Hash, case_name_short_dict)
                if not savedCaseNameShort:
                    print absCaseLink
                    if BROWSER:
                        subprocess.Popen([BROWSER, absCaseLink],
                                         shell=False).communicate()
                    caseName = raw_input("Short casename: ")
                    cite.caseNameShort = trunc(caseName, 100)
                    cite.caseNameFull = caseName
                    case_name_short_fix_file.write("%s|%s\n" % (sha1Hash, caseName))
                else:
                    # We got both the values from the save files. Use 'em.
                    cite.caseNameShort = trunc(savedCaseNameShort, 100)
                    cite.caseNameFull = savedCaseNameShort

            # The slug needs to be done here, b/c it is only done automatically
            # the first time the citation is saved, and this will be
            # at least the second.
            cite.slug = trunc(slugify(cite.caseNameShort), 50)
            cite.save()

            if DEBUG >= 4:
                print "precedential_status: " + precedential_status
                print "west_cite: " + cite.west_cite
                print "docket_number: " + cite.docket_number
                print "caseName: " + cite.caseNameFull

            # date is kinda tricky...details here:
            # http://pleac.sourceforge.net/pleac_python/datesandtimes.html
            rawDate = caseDates[j].find('a')
            try:
                if rawDate is not None:
                    # Special cases
                    # (hashes with known-broken dates in the source data)
                    if sha1Hash == 'f0da421f117ef16223d7e61d1e4e5526036776e6':
                        date_text = 'August 28, 1980'
                    elif sha1Hash == '8cc192eaacd1c544b5e8ffbd751d9be84c311932':
                        date_text = 'August 16, 1985'
                    elif sha1Hash == 'd19bce155f72a9f981a12efabd760a35e1e7dbe7':
                        date_text = 'October 12, 1979'
                    elif sha1Hash == '9f7583cf0d46ddc9cad4e7943dd775f9e9ea99ff':
                        date_text = 'July 30, 1980'
                    elif sha1Hash == '211ea81a4ab4132483c483698d2a40f4366f5640':
                        date_text = 'November 3, 1981'
                    elif sha1Hash == 'eefb344034461e9c6912689677a32cd18381d5c2':
                        date_text = 'July 28, 1983'
                    else:
                        date_text = rawDate.text
                    try:
                        caseDate = datetime.datetime(*time.strptime(date_text, "%B, %Y")[0:5])
                    # NOTE(review): Python 2 `except ValueError, TypeError:`
                    # catches ONLY ValueError and binds it to the name
                    # TypeError; to catch both it must read
                    # `except (ValueError, TypeError):` (cf. the other copy
                    # of this function in this file).
                    except ValueError, TypeError:
                        caseDate = datetime.datetime(*time.strptime(date_text, "%B %d, %Y")[0:5])
                else:
                    # No value was found. Throw an exception.
                    raise ValueError
            # NOTE(review): the source is truncated here; this bare
            # `except:` has no body in this copy of the file.
            except:
def get_title(self, referer_id):
    """Get the HTML title for a page, trying again if failures occur.

    Idea here is that somebody will create a new page that embeds one of
    our maps. As soon as they do, we'll get an HTTP referer sent to us,
    which is great. Unfortunately, in many cases, the HTTP referer we
    receive is that of an in progress page or similar, NOT the page
    that's actually live. Thus, what we do is try the URL over and over,
    until we find success.

    If a title is found, the admins are notified.

    If not, the item is deleted (this is OK, however, b/c it'll be
    recreated if it should have existed).

    :param referer_id: Primary key of the Referer row to process.
    :returns: None. Side effects only: may save the Referer, send an
        admin email, or re-enqueue itself via ``self.retry`` (this is a
        bound Celery-style task — ``self.request``/``self.retry`` come
        from the task machinery).
    """
    # Set the exponential back off in case we need it, starting at 15
    # minutes, then 30, 60, 120...
    countdown = 15 * 60 * (2 ** self.request.retries)
    retried_exceeded = (self.request.retries >= self.max_retries)

    referer = Referer.objects.get(pk=referer_id)

    if blacklisted_url(referer.url):
        return

    try:
        r = requests.get(
            referer.url,
            headers={'User-Agent': "CourtListener"},
            verify=False,  # Integrity of a referer's referent is not important.
            # Referer URLs are arbitrary third-party hosts; without a
            # timeout a single unresponsive server hangs the worker
            # forever. 30s is generous for a title fetch.
            timeout=30,
        )
    except (MissingSchema, TooManyRedirects):
        # Malformed URL or a redirect loop; retrying won't help.
        return
    except requests.Timeout as exc:
        # Transient slowness; back off and try again, consistent with
        # the HTTPError handling below.
        if retried_exceeded:
            return
        raise self.retry(exc=exc, countdown=countdown)

    try:
        r.raise_for_status()
    except HTTPError as exc:
        if retried_exceeded:
            # We're not wanted here. Maybe we'll have better luck another
            # time.
            return
        raise self.retry(exc=exc, countdown=countdown)

    html_tree = html.fromstring(r.text)
    try:
        # lxml's .text is None for an empty <title/>; guard before strip.
        title = getattr(html_tree.xpath('//title')[0], 'text', '')
        if title is not None:
            title = title.strip()
    except IndexError:
        # No <title> element at all.
        title = ''

    if title:
        # Truncate to the model field's max_length so save() can't fail
        # on an over-long title.
        referer.page_title = trunc(
            title,
            referer._meta.get_field('page_title').max_length,
        )
        referer.save()

        if new_title_for_viz(referer):
            # Only send the email if we haven't seen this page title
            # before for this visualization.
            email = emails['referer_detected']
            email_body = email['body'] % (referer.url, referer.page_title, reverse(
                'admin:visualizations_referer_change',
                args=(referer.pk,)
            ))
            send_mail(email['subject'], email_body, email['from'],
                      email['to'])
    else:
        try:
            # Create an exception to catch, so retry() has an exc with a
            # live traceback to record.
            raise Exception("Couldn't get title from HTML")
        except Exception as exc:
            if retried_exceeded:
                # We couldn't get the title. Let it go.
                return
            raise self.retry(exc=exc, countdown=countdown)
def scrape_and_parse(): """Traverses the bulk data from public.resource.org, and puts them in the DB. Probably lots of ways to go about this, but I think the easiest will be the following: - look at the index page of all volumes, and follow all the links it has. - for each volume, look at its index page, and follow the link to all cases - for each case, collect information wisely. - put it all in the DB """ # begin by loading up the fix files into memory court_fix_dict, date_fix_dict, case_name_short_dict = load_fix_files() results = [] DEBUG = 4 # Set to False to disable automatic browser usage. Else, set to the # command you want to run, e.g. 'firefox' BROWSER = False court_fix_file = open('../logs/f2_court_fix_file.txt', 'a') date_fix_file = open('../logs/f2_date_fix_file.txt', 'a') case_name_short_fix_file = open('../logs/f2_short_case_name_fix_file.txt', 'a') vol_file = open('../logs/vol_file.txt', 'r+') case_file = open('../logs/case_file.txt', 'r+') url = "file://%s/Resource.org/F2/index.html" % INSTALL_ROOT openedURL = urllib2.urlopen(url) content = openedURL.read() openedURL.close() tree = fromstring(content) volumeLinks = tree.xpath('//table/tbody/tr/td[1]/a') try: i = int(vol_file.readline()) except ValueError: # the volume file is emtpy or otherwise failing. i = 0 vol_file.close() if DEBUG >= 1: print "Number of remaining volumes is: %d" % (len(volumeLinks) - i) # used later, needs a default value. 
saved_caseDate = None saved_court = None while i < len(volumeLinks): # we iterate over every case in the volume volumeURL = volumeLinks[i].text + "/index.html" volumeURL = urljoin(url, volumeURL) if DEBUG >= 1: print "Current volumeURL is: %s" % volumeURL openedVolumeURL = urllib2.urlopen(volumeURL) content = openedVolumeURL.read() volumeTree = fromstring(content) openedVolumeURL.close() caseLinks = volumeTree.xpath('//table/tbody/tr/td[1]/a') caseDates = volumeTree.xpath('//table/tbody/tr/td[2]') sha1Hashes = volumeTree.xpath('//table/tbody/tr/td[3]/a') # The following loads a serialized placeholder from disk. try: j = int(case_file.readline()) except ValueError: j = 0 case_file.close() while j < len(caseLinks): # iterate over each case, throwing it in the DB if DEBUG >= 1: print '' # like the scraper, we begin with the caseLink field (relative for # now, not absolute) caseLink = caseLinks[j].get('href') # sha1 is easy sha1Hash = sha1Hashes[j].text if DEBUG >= 4: print "SHA1 is: %s" % sha1Hash # using the caselink from above, and the volumeURL, we can get the # html absCaseLink = urljoin(volumeURL, caseLink) html = urllib2.urlopen(absCaseLink).read() htmlTree = fromstring(html) bodyContents = htmlTree.xpath('//body/*[not(@id="footer")]') body = "" bodyText = "" for element in bodyContents: body += tostring(element) try: bodyText += tostring(element, method='text') except UnicodeEncodeError: # Happens with odd characters. Simply pass this iteration. pass if DEBUG >= 5: print body print bodyText # need to figure out the court ID try: courtPs = htmlTree.xpath('//p[@class = "court"]') # Often the court ends up in the parties field. 
partiesPs = htmlTree.xpath("//p[@class= 'parties']") court = "" for courtP in courtPs: court += tostring(courtP).lower() for party in partiesPs: court += tostring(party).lower() except IndexError: court = check_fix_list(sha1Hash, court_fix_dict) if not court: print absCaseLink if BROWSER: subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate() court = raw_input( "Please input court name (e.g. \"First Circuit of Appeals\"): " ).lower() court_fix_file.write("%s|%s\n" % (sha1Hash, court)) if ('first' in court) or ('ca1' == court): court = 'ca1' elif ('second' in court) or ('ca2' == court): court = 'ca2' elif ('third' in court) or ('ca3' == court): court = 'ca3' elif ('fourth' in court) or ('ca4' == court): court = 'ca4' elif ('fifth' in court) or ('ca5' == court): court = 'ca5' elif ('sixth' in court) or ('ca6' == court): court = 'ca6' elif ('seventh' in court) or ('ca7' == court): court = 'ca7' elif ('eighth' in court) or ('ca8' == court): court = 'ca8' elif ('ninth' in court) or ('ca9' == court): court = 'ca9' elif ("tenth" in court) or ('ca10' == court): court = 'ca10' elif ("eleventh" in court) or ('ca11' == court): court = 'ca11' elif ('columbia' in court) or ('cadc' == court): court = 'cadc' elif ('federal' in court) or ('cafc' == court): court = 'cafc' elif ('patent' in court) or ('ccpa' == court): court = 'ccpa' elif (('emergency' in court) and ('temporary' not in court)) or ('eca' == court): court = 'eca' elif ('claims' in court) or ('uscfc' == court): court = 'uscfc' else: # No luck extracting the court name. Try the fix file. court = check_fix_list(sha1Hash, court_fix_dict) if not court: # Not yet in the fix file. Check if it's a crazy ca5 case court = '' ca5courtPs = htmlTree.xpath('//p[@class = "center"]') for ca5courtP in ca5courtPs: court += tostring(ca5courtP).lower() if 'fifth circuit' in court: court = 'ca5' else: court = False if not court: # Still no luck. Ask for input, then append it to # the fix file. 
print absCaseLink if BROWSER: subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate() court = raw_input( "Unknown court. Input the court code to proceed successfully [%s]: " % saved_court) court = court or saved_court court_fix_file.write("%s|%s\n" % (sha1Hash, court)) saved_court = court court = Court.objects.get(pk=court) if DEBUG >= 4: print "Court is: %s" % court # next: west_cite, docket_number and caseName. Full casename is gotten later. west_cite = caseLinks[j].text docket_number = absCaseLink.split('.')[-2] caseName = caseLinks[j].get('title') caseName, precedential_status = exceptional_cleaner(caseName) cite, new = hasDuplicate(caseName, west_cite, docket_number) if cite.caseNameShort == '': # No luck getting the case name savedCaseNameShort = check_fix_list(sha1Hash, case_name_short_dict) if not savedCaseNameShort: print absCaseLink if BROWSER: subprocess.Popen([BROWSER, absCaseLink], shell=False).communicate() caseName = raw_input("Short casename: ") cite.caseNameShort = trunc(caseName, 100) cite.caseNameFull = caseName case_name_short_fix_file.write("%s|%s\n" % (sha1Hash, caseName)) else: # We got both the values from the save files. Use 'em. cite.caseNameShort = trunc(savedCaseNameShort, 100) cite.caseNameFull = savedCaseNameShort # The slug needs to be done here, b/c it is only done automatically # the first time the citation is saved, and this will be # at least the second. 
cite.slug = slugify(trunc(cite.caseNameShort, 75)) cite.save() if DEBUG >= 4: print "precedential_status: " + precedential_status print "west_cite: " + cite.west_cite print "caseName: " + cite.caseNameFull # date is kinda tricky...details here: # http://pleac.sourceforge.net/pleac_python/datesandtimes.html rawDate = caseDates[j].find('a') try: if rawDate is not None: # Special cases if sha1Hash == 'f0da421f117ef16223d7e61d1e4e5526036776e6': date_text = 'August 28, 1980' elif sha1Hash == '8cc192eaacd1c544b5e8ffbd751d9be84c311932': date_text = 'August 16, 1985' elif sha1Hash == 'd19bce155f72a9f981a12efabd760a35e1e7dbe7': date_text = 'October 12, 1979' elif sha1Hash == '9f7583cf0d46ddc9cad4e7943dd775f9e9ea99ff': date_text = 'July 30, 1980' elif sha1Hash == '211ea81a4ab4132483c483698d2a40f4366f5640': date_text = 'November 3, 1981' elif sha1Hash == 'eefb344034461e9c6912689677a32cd18381d5c2': date_text = 'July 28, 1983' else: date_text = rawDate.text try: caseDate = datetime.datetime( *time.strptime(date_text, "%B, %Y")[0:5]) except ValueError, TypeError: caseDate = datetime.datetime( *time.strptime(date_text, "%B %d, %Y")[0:5]) else: # No value was found. Throw an exception. raise ValueError except: