def _get_case_names(self):
    """Return one case name per row, taken from each row's first link."""
    xpath = "//table/tr/td/a[1]"
    names = []
    for anchor in self.html.xpath(xpath):
        text = "".join(anchor.itertext()).strip()
        names.append(clean_if_py3(text))
    return names
def _return_dates(html_tree):
    """Parse the page-header date and repeat it once per opinion label."""
    header_path = "//*[contains(concat(' ',@id,' '),' wfHeader') and not(contains(., 'Iowa'))]/text()"
    raw = clean_if_py3(html_tree.xpath(header_path)[0])
    parsed = time.strptime(raw.strip(), '%B %d, %Y')
    case_date = date.fromtimestamp(time.mktime(parsed))
    # One date per wfLabel element on the page.
    label_count = int(html_tree.xpath(
        "count(//*[contains(concat(' ',@id,' '),' wfLabel')])"))
    return [case_date] * label_count
def _get_judges(self):
    """Extract the signing judge's name from each feed item body.

    The body text typically looks like:
        'Signed by Judge Susan G. Braden. (jt1) Copy to parties.'
    Only the name portion is kept: text after the 'Signed by ...'
    label, truncated at the first two lowercase letters followed by
    a period. Rows with no recognizable judge yield ''.
    """
    path = '//div[@class="feed-item-body"]'
    judges = []
    splitters = [
        'Signed by Chief Judge',
        'Signed by Judge',
        'Signed by Chief Special Master',  # Vaccine courts have odd names for judges
        'Signed by Special Master',
    ]
    for e in self.html.xpath(path):
        t = html.tostring(e, method='text', encoding='unicode')
        t = clean_if_py3(t).split('Keywords:')[0]
        judge = ''
        for splitter in splitters:
            judge_parts = t.rsplit(splitter)
            if len(judge_parts) > 1:
                judge = judge_parts[1]
                break
        # Often the text looks like: 'Judge Susan G. Braden. (jt1) Copy to
        # parties.' In that case we only want the name, not the rest.
        length_of_match = 2
        # Two lower case letters followed by a period mark the name's end.
        m = re.search(r'[a-z]{%s}\.' % length_of_match, judge)
        if m:
            judge = judge[:m.start() + length_of_match]
        else:
            judge = ''
        # Bug fix: str.strip() returns a new string; the original called
        # judge.strip('.') and discarded the result.
        judge = judge.strip('.')
        judges.append(judge)
    return judges
def _get_case_dates(self):
    """Parse '%m/%d/%Y' dates from the 4th following cell of each row."""
    xpath = "{base}/following::td[4]//text()".format(base=self.base_path)
    return [
        datetime.strptime(clean_if_py3(raw).strip(), "%m/%d/%Y").date()
        for raw in self.html.xpath(xpath)
    ]
def _get_judges(self):
    """Extract the signing judge's name from each feed item body.

    The body text typically looks like:
        'Signed by Judge Susan G. Braden. (jt1) Copy to parties.'
    Only the name portion is kept: text after the 'Signed by ...'
    label, truncated at the first two lowercase letters followed by
    a period. Rows with no recognizable judge yield ''.
    """
    path = '//div[@class="feed-item-body"]'
    judges = []
    splitters = [
        'Signed by Chief Judge',
        'Signed by Judge',
        'Signed by Chief Special Master',  # Vaccine courts have odd names for judges
        'Signed by Special Master',
    ]
    for e in self.html.xpath(path):
        t = html.tostring(e, method='text', encoding='unicode')
        t = clean_if_py3(t).split('Keywords:')[0]
        judge = ''
        for splitter in splitters:
            judge_parts = t.rsplit(splitter)
            if len(judge_parts) > 1:
                judge = judge_parts[1]
                break
        # Often the text looks like: 'Judge Susan G. Braden. (jt1) Copy to
        # parties.' In that case we only want the name, not the rest.
        length_of_match = 2
        # Two lower case letters followed by a period mark the name's end.
        m = re.search(r'[a-z]{%s}\.' % length_of_match, judge)
        if m:
            judge = judge[:m.start() + length_of_match]
        else:
            judge = ''
        # Bug fix: str.strip() returns a new string; the original called
        # judge.strip('.') and discarded the result.
        judge = judge.strip('.')
        judges.append(judge)
    return judges
def _get_date_object_from_string(self, date_string):
    """Normalize a raw date string and convert it to a date object.

    Repairs a stray ' ,' and a known '2104'-for-'2014' typo in the
    source data before delegating to convert_date_string.
    """
    cleaned = clean_if_py3(date_string).strip()
    cleaned = cleaned.replace(" ,", ", ").replace("2104", "2014")
    return convert_date_string(cleaned)
def _get_case_dates(self):
    """Parse the '%m/%d/%Y' date after 'about ' in each item description."""
    case_dates = []
    for item in self.items:
        raw = item.xpath('./description/text()')[0]
        # Invisible trailing whitespace/newlines must be stripped or
        # strptime raises on the date string.
        date_part = clean_if_py3(raw).split('about ', 1)[1].strip()
        case_dates.append(datetime.strptime(date_part, '%m/%d/%Y').date())
    return case_dates
def _get_case_dates(self):
    """Parse 'Release Date: m/d/Y' list items into date objects."""
    results = []
    for raw in self.html.xpath('//li[@class="releaseDate"]/text()'):
        # 'Release Date: 2/22/2013' --> take the third token, '2/22/2013'.
        token = clean_if_py3(raw).strip().split(' ')[2]
        parsed = time.strptime(token.strip(), '%m/%d/%Y')
        results.append(date.fromtimestamp(time.mktime(parsed)))
    return results
def _return_case_names(html_tree):
    """Collect non-empty case names from header anchors containing '/'."""
    xpath = "//th//a[contains(., '/')]/text()"
    return [
        cleaned
        for cleaned in (
            clean_if_py3(raw).strip() for raw in html_tree.xpath(xpath))
        if cleaned
    ]
def _get_case_dates(self):
    """Parse '%Y/%m/%d' dates from the 3rd column, skipping blanks."""
    return [
        datetime.strptime(text, '%Y/%m/%d').date()
        for text in (
            clean_if_py3(raw).strip()
            for raw in self.html.xpath('//tr/td[3]/text()'))
        if text
    ]
def _get_docket_numbers(self):
    """Prefix each ASBCA-number cell with 'ASBCA No. '.

    Returns None when the table has no 'ASBCA Number' column.
    """
    if "ASBCA Number" not in self.columns:
        return None
    column_index = self.columns["ASBCA Number"]
    path = "//table/tr[td/a]/td[%d]/text()" % column_index
    dockets = []
    for raw in self.html.xpath(path):
        dockets.append("ASBCA No. " + clean_if_py3(raw).strip())
    return dockets
def _get_docket_numbers(self):
    """Return non-blank docket strings from the 2nd table column."""
    dockets = []
    for raw in self.html.xpath('//tr/td[2]//text()'):
        cleaned = clean_if_py3(raw).strip()
        if not cleaned:
            continue
        dockets.append(cleaned)
    return dockets
def _return_case_names(html_tree):
    """Collect non-empty case names from header anchors containing '/'."""
    case_names = []
    for raw in html_tree.xpath("//th//a[contains(., '/')]/text()"):
        cleaned = clean_if_py3(raw).strip()
        if not cleaned:
            continue
        case_names.append(cleaned)
    return case_names
def _get_case_dates(self):
    """Parse '%b %d, %Y' strings found at self.date_path, skipping blanks."""
    return [
        datetime.strptime(text, '%b %d, %Y').date()
        for text in (
            clean_if_py3(str(node)).strip()
            for node in self.html.xpath(self.date_path))
        if text
    ]
def _sanitize_docket_name_text(self, text):
    """Fix an en-dash typo in the first word so downstream regexes match."""
    text = clean_if_py3(text).strip()
    first_word = text.split()[0]
    # Replace en dash typo with proper hyphen so regex parses properly
    en_dash = b'\xe2\x80\x93'.decode('utf-8')
    sanitized = first_word.replace(en_dash, '-')
    return text.replace(first_word, sanitized)
def _get_case_dates(self):
    """Parse '%m-%d-%Y' dates from bracketed item descriptions.

    Input looks like '[Argued:9-1-2015]'.
    """
    dates = []
    for raw in self.html.xpath('//item/description/b/text()'):
        # Drop the brackets and any whitespace.
        compact = re.sub(r'[\[\]\s]', '', raw)
        # Keep only the date portion after the colon.
        date_part = clean_if_py3(compact).split(':', 1)[1].strip()
        dates.append(datetime.strptime(date_part, '%m-%d-%Y').date())
    return dates
def _get_docket_numbers(self):
    """Return non-blank docket text from the first cell of sc-opinion rows."""
    path = '//tr[contains(concat(" ", @class, " "), " sc-opinion ")]/td[1]//text()[normalize-space() != ""]'
    return [
        text
        for text in (
            clean_if_py3(str(node)).strip()
            for node in self.html.xpath(path))
        if text
    ]
def _extract_case_names_from_sub_page(self, html_tree):
    """Search each non-blank anchor text node with base_anchor_regex and
    return the first captured group for every match."""
    pattern = re.compile(r'(?:%s)' % self.base_anchor_regex)
    node_path = "{base}//text()".format(base=self.base_anchor_path)
    names = []
    for node in html_tree.xpath(node_path):
        text = clean_if_py3(node).strip()
        if text:
            names.append(pattern.search(text).group(1))
    return names
def _extract_docket_numbers_from_sub_page(self, html_tree):
    """Search each non-blank anchor text node with base_anchor_regex
    (wrapped in a capturing group) and return group(1) of every match."""
    regex = '(%s)' % self.base_anchor_regex
    node_path = "{base}//text()".format(base=self.base_anchor_path)
    return [
        re.search(regex, text).group(1)
        for text in (
            clean_if_py3(node).strip() for node in html_tree.xpath(node_path))
        if text
    ]
def _get_case_dates(self):
    """Parse '%b %d, %Y' strings found at self.date_path, skipping blanks."""
    case_dates = []
    for node in self.html.xpath(self.date_path):
        text = clean_if_py3(str(node)).strip()
        if not text:
            continue
        case_dates.append(datetime.strptime(text, '%b %d, %Y').date())
    return case_dates
def _get_dispositions(self):
    """Return the disposition text for each odd/even case <ul>.

    Bug fix: the inner XPath began with '//', which lxml evaluates from
    the DOCUMENT root rather than the current element, so every row got
    the first caseNature value on the page. A relative './/' search
    scopes the lookup to the current <ul>.
    """
    disps = []
    for el in self.html.xpath(
            "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
        try:
            s = el.xpath('.//li[@class="caseNature"]/text()')
            # split(' ', 2)[2] drops the leading two label words.
            disps.append(clean_if_py3(s[0]).strip().split(' ', 2)[2])
        except IndexError:
            # No caseNature entry (or too few words): record a blank.
            disps.append('')
    return disps
def _get_case_dates(self):
    """Parse 'Release Date: m/d/Y' list items into date objects."""
    dates = []
    for node in self.html.xpath('//li[@class="releaseDate"]/text()'):
        # 'Release Date: 2/22/2013' --> take the third token, '2/22/2013'.
        token = clean_if_py3(node).strip().split(" ")[2]
        struct = time.strptime(token.strip(), "%m/%d/%Y")
        dates.append(date.fromtimestamp(time.mktime(struct)))
    return dates
def _get_case_names(self):
    """Return the 4th-cell text of each row, skipping whitespace-only cells.

    NOTE(review): the value is appended UNstripped even though the
    emptiness check uses strip() — presumably to preserve original
    spacing; confirm before changing.
    """
    names = []
    for raw in self.html.xpath('//tr/td[4]/text()'):
        cleaned = clean_if_py3(raw)
        if cleaned.strip():
            names.append(cleaned)
    # Log how many names were collected.
    logger.info(str(len(names)))
    return names
def _get_summaries(self):
    """Return feed item body text, truncated before any 'Keywords:' tail."""
    path = '//div[@class="feed-item-body"]'
    return [
        clean_if_py3(
            html.tostring(node, method='text', encoding='unicode')
        ).split('Keywords:')[0]
        for node in self.html.xpath(path)
    ]
def _get_case_names(self):
    """Return non-blank case names from the 3rd cell of each row."""
    names = []
    for cell in self.html.xpath('%s/td[3]' % self.base_path):
        text = html.tostring(cell, method='text', encoding='unicode')
        text = clean_if_py3(text).strip()
        if not text:
            continue
        names.append(text)
    return names
def _get_summaries(self):
    """Return each feed item's body text, dropping any 'Keywords:' tail."""
    summaries = []
    for body in self.html.xpath('//div[@class="feed-item-body"]'):
        text = html.tostring(body, method='text', encoding='unicode')
        summaries.append(clean_if_py3(text).split('Keywords:')[0])
    return summaries
def parse_column_names(self):
    """Map each header-row cell's text to its 1-based column index.

    The mapping is stored on self.columns for later lookups and also
    returned.
    """
    self.columns = {}
    header_cells = self.html.xpath("//table/tr[1]/td")
    for index, cell in enumerate(header_cells, start=1):
        header = clean_if_py3(''.join(cell.itertext())).strip()
        self.columns[header] = index
    return self.columns
def _get_dispositions(self):
    """Return the disposition text for each odd/even case <ul>.

    Bug fix: the inner XPath began with '//', which lxml evaluates from
    the DOCUMENT root rather than the current element, so every row got
    the first caseNature value on the page. A relative './/' search
    scopes the lookup to the current <ul>.
    """
    disps = []
    for el in self.html.xpath(
            "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
        try:
            s = el.xpath('.//li[@class="caseNature"]/text()')
            # split(" ", 2)[2] drops the leading two label words.
            disps.append(clean_if_py3(s[0]).strip().split(" ", 2)[2])
        except IndexError:
            # No caseNature entry (or too few words): record a blank.
            disps.append("")
    return disps
def _get_lower_courts(self):
    """Return the titlecased lower-court name for each odd/even case <ul>.

    Bug fix: the inner XPath began with '//', which lxml evaluates from
    the DOCUMENT root rather than the current element, so every row got
    the first lowerCourt value on the page. A relative './/' search
    scopes the lookup to the current <ul>.
    """
    lower_courts = []
    for el in self.html.xpath(
            "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
        try:
            s = el.xpath('.//li[@class="casedetailsleft"]'
                         '//li[@class="lowerCourt"]/text()')
            # split(' ', 2)[2] drops the leading two label words.
            lower_courts.append(
                titlecase(clean_if_py3(s[0]).strip().split(' ', 2)[2]))
        except IndexError:
            # No lowerCourt entry (or too few words): record a blank.
            lower_courts.append('')
    return lower_courts
def _return_docket_numbers(html_tree):
    """Extract dockets shaped like 'ABC-123' from header anchor text."""
    path = "//th//a[contains(., '-')]/*/text() | //th//a[contains(text(),'-')]/text()"
    dockets = []
    for raw in html_tree.xpath(path):
        # Keep only the part before any '/', then squash all whitespace.
        prefix = clean_if_py3(raw).split('/')[0].strip()
        candidate = ''.join(prefix.split())
        if re.match(r'^\w+-\d+$', candidate):
            dockets.append(candidate)
    return dockets
def _get_lower_court_numbers(self):
    """Return the lower-court number (text after 'No. ') for each case <ul>.

    Bug fix: the inner XPath began with '//', which lxml evaluates from
    the DOCUMENT root rather than the current element, so every row got
    the first lowerCourt value on the page. A relative './/' search
    scopes the lookup to the current <ul>.
    """
    nums = []
    for el in self.html.xpath(
            "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
        try:
            s = el.xpath('.//li[@class = "casedetailsright"]'
                         '//li[@class = "lowerCourt"]/text()')
            nums.append(clean_if_py3(s[0]).strip().split('No. ')[1])
        except IndexError:
            # No lowerCourt entry (or no 'No. ' label): record a blank.
            nums.append('')
    return nums
def _get_case_dates(self):
    """Parse '%m/%d/%y' dates from nested <dd> text; blanks yield ''."""
    dates = []
    for raw in self.html.xpath('//dl/dd/dd/dd/text()'):
        cleaned = clean_if_py3(raw).strip()
        if not cleaned:
            # Preserve positional alignment with other columns.
            dates.append('')
        else:
            parsed = time.strptime(cleaned, '%m/%d/%y')
            dates.append(date.fromtimestamp(time.mktime(parsed)))
    return dates
def _return_docket_numbers(html_tree):
    """Extract dockets shaped like 'ABC-123' from header anchor text."""
    path = "//th//a[contains(., '-')]/*/text() | //th//a[contains(text(),'-')]/text()"
    pattern = re.compile(r'^\w+-\d+$')
    dockets = []
    for raw in html_tree.xpath(path):
        # Keep only the part before any '/', then squash all whitespace.
        candidate = ''.join(clean_if_py3(raw).split('/')[0].strip().split())
        if pattern.match(candidate):
            dockets.append(candidate)
    return dockets
def _get_case_dates(self):
    """Parse '%m/%d/%y' dates from nested <dd> text; blanks yield ''."""
    dates = []
    for raw in self.html.xpath('//dl/dd/dd/dd/text()'):
        text = clean_if_py3(raw).strip()
        if text == '':
            # Preserve positional alignment with other columns.
            dates.append('')
            continue
        dates.append(
            date.fromtimestamp(time.mktime(time.strptime(text, '%m/%d/%y'))))
    return dates
def _get_case_names(self):
    """Return non-blank case names from the 3rd cell of each row."""
    names = []
    for cell in self.html.xpath("%s/td[3]" % self.path_base):
        text = html.tostring(cell, method="text", encoding="unicode")
        text = clean_if_py3(text).strip()
        if text:
            names.append(text)
    return names
def _get_lower_court_numbers(self):
    """Return the lower-court number (text after 'No. ') for each case <ul>.

    Bug fix: the inner XPath began with '//', which lxml evaluates from
    the DOCUMENT root rather than the current element, so every row got
    the first lowerCourt value on the page. A relative './/' search
    scopes the lookup to the current <ul>.
    """
    nums = []
    for el in self.html.xpath(
            "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
        try:
            s = el.xpath('.//li[@class = "casedetailsright"]'
                         '//li[@class = "lowerCourt"]/text()')
            nums.append(clean_if_py3(s[0]).strip().split("No. ")[1])
        except IndexError:
            # No lowerCourt entry (or no 'No. ' label): record a blank.
            nums.append("")
    return nums
def _get_case_dates(self):
    """Parse '%m/%d/%y' dates from nested <dd> text; blanks yield ''."""
    results = []
    for raw in self.html.xpath("//dl/dd/dd/dd/text()"):
        text = clean_if_py3(raw).strip()
        if text == "":
            # Preserve positional alignment with other columns.
            results.append("")
        else:
            struct = time.strptime(text, "%m/%d/%y")
            results.append(date.fromtimestamp(time.mktime(struct)))
    return results
def _get_lower_courts(self):
    """Return the titlecased lower-court name for each odd/even case <ul>.

    Bug fix: the inner XPath began with '//', which lxml evaluates from
    the DOCUMENT root rather than the current element, so every row got
    the first lowerCourt value on the page. A relative './/' search
    scopes the lookup to the current <ul>.
    """
    lower_courts = []
    for el in self.html.xpath(
            "//ul[contains(@class, 'odd') or contains(@class, 'even')]"):
        try:
            s = el.xpath('.//li[@class="casedetailsleft"]'
                         '//li[@class="lowerCourt"]/text()')
            # split(" ", 2)[2] drops the leading two label words.
            lower_courts.append(
                titlecase(clean_if_py3(s[0]).strip().split(" ", 2)[2]))
        except IndexError:
            # No lowerCourt entry (or too few words): record a blank.
            lower_courts.append("")
    return lower_courts
def _get_docket_numbers(self):
    """Return non-blank docket text from the docket column of the table."""
    column_base = self.base_path.format(
        table_number=self.table_number,
        i=2 + self.xpath_adjustment,
    )
    path = '{base}//text()[normalize-space(.)]'.format(base=column_base)
    dockets = []
    for node in self.html.xpath(path):
        text = clean_if_py3(str(node)).strip()
        if not text:
            continue
        dockets.append(text)
    return dockets
def _get_case_dates(self):
    """Parse the date text from the first cell of each row."""
    case_dates = []
    for cell in self.html.xpath('%s/td[1]' % self.base_path):
        raw = html.tostring(cell, method='text', encoding='unicode')
        raw = clean_if_py3(raw).strip()
        if not raw:
            continue
        if six.PY2:
            # Drop non-ASCII bytes under Python 2 before parsing.
            raw = raw.encode('ascii', 'ignore')
        # GIGO! (+1 by arderyp)
        raw = raw.replace('Sept ', 'Sep ')
        case_dates.append(convert_date_string(raw))
    return case_dates
def _get_case_dates(self):
    """Parse dates from bracketed item descriptions like '[Argued:9-1-2015]'."""
    dates = []
    for raw in self.html.xpath("//item/description/b/text()"):
        # Drop the brackets and any whitespace.
        compact = re.sub(r"[\[\]\s]", "", raw)
        date_part = clean_if_py3(compact).split(":", 1)[1].strip()
        # Occasionally a filetype leaks in, e.g. '[Argued:9-1-2015mp3]'
        # (see ca1_example_2.xml); strip it before parsing.
        date_part = date_part.replace("mp3", "")
        dates.append(convert_date_string(date_part))
    return dates
def _get_docket_numbers(self):
    """Return the docket (text before the ' • ' bullet) from each title."""
    dockets = []
    for raw in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
        raw = clean_if_py3(raw)
        if not raw.strip():
            continue  # Skip whitespace-only text nodes.
        if not isinstance(raw, six.string_types):
            raw = str(raw, encoding='utf-8')
        if u' • ' in raw:
            raw = raw.split(u' • ')[0].strip()
        dockets.append(raw)
    return dockets
def _get_docket_numbers(self):
    """Return non-blank docket text from the docket column of the table."""
    path = '{base}//text()[normalize-space(.)]'.format(
        base=self.base_path.format(
            table_number=self.table_number,
            i=2 + self.xpath_adjustment,
        ),
    )
    return [
        text
        for text in (
            clean_if_py3(str(node)).strip()
            for node in self.html.xpath(path))
        if text
    ]
def _get_case_names(self):
    """Return the titlecased case name (after the ' • ' bullet) per title."""
    case_names = []
    for raw in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
        raw = ' '.join(clean_if_py3(raw).split())  # Normalize whitespace
        if not raw.strip():
            continue  # Skip whitespace-only text nodes.
        if not isinstance(raw, six.string_types):
            raw = str(raw, encoding='utf-8')
        if u' • ' in raw:
            raw = raw.split(u' • ')[1].strip()
        case_names.append(titlecase(raw.lower()))
    return case_names
def get_nth_table_cell_data(self, n, href=False, link_text=False):
    """Return stripped, non-empty values from column n of every table row.

    By default the cell's own text is returned; href=True returns the
    anchor's @href, link_text=True returns the anchor's text.
    """
    base = '//table/tr/td[%d]' % n
    if href:
        suffix = '/a/@href'
    elif link_text:
        suffix = '/a/text()'
    else:
        suffix = '/text()'
    results = []
    for raw in self.html.xpath(base + suffix):
        cleaned = clean_if_py3(raw).strip()
        if cleaned:
            results.append(cleaned)
    return results
def _get_download_urls(self):
    """Build one download URL per row, handling two href formats.

    Newer rows contain a direct link matched by self.href_standard;
    older rows embed path pieces in JavaScript hrefs matched by
    self.href_js, which are concatenated across the row's anchors.
    Rows yielding no URL are skipped.
    """
    download_urls = []
    for row in self.html.xpath(self.base_path):
        url = ''
        for href in row.xpath('./td[5]//@href'):
            href = clean_if_py3(href)
            # Check for newer standard href
            standard = self.href_standard.match(href)
            if standard:
                url = standard.group(0)
                break
            # Check for presence of legacy JavaScript href
            js_pieces = self.href_js.findall(href)
            if js_pieces:
                url += js_pieces[0]
        if url:
            download_urls.append(url)
    return download_urls
def _get_neutral_citations(self):
    """Build '<year> NV <number>' citations.

    Pairs each value in the neutral-number column with the year parsed
    from the corresponding non-blank date string at self.date_path.
    """
    neutral_path = '{base}//text()'.format(
        base=self.base_path.format(
            table_number=self.table_number,
            i=1 + self.xpath_adjustment,
        ),
    )
    date_strings = []
    for node in self.html.xpath(self.date_path):
        text = clean_if_py3(str(node)).strip()
        if text:
            date_strings.append(text)
    citations = []
    for number, date_string in zip(
            self.html.xpath(neutral_path), date_strings):
        year = datetime.strptime(date_string.strip(), '%b %d, %Y').year
        citations.append('{year} NV {num}'.format(year=year, num=number))
    return citations
def _return_docket_numbers(html_tree):
    """Strip a 'No.'/'Nos.' label from each docket anchor's text.

    Bug fix: the original pattern r'Nos?.' left the dot unescaped, so
    it matched 'No' or 'Nos' followed by ANY character (e.g. 'Not',
    'Nov'), not just a literal period. The dot is now escaped.
    """
    path = "//*[contains(concat(' ',@id,' '),' wfLabel')]/preceding::tr[2]/td[1]/a/text()"
    return [
        clean_if_py3(re.sub(r'Nos?\.', '', e).strip())
        for e in html_tree.xpath(path)
    ]
def _get_case_dates(self):
    """Parse '%m/%d/%Y' dates from the 4th following cell of each row."""
    path = "{base}/following::td[4]//text()".format(base=self.base_path)
    parsed = []
    for raw in self.html.xpath(path):
        cleaned = clean_if_py3(raw).strip()
        parsed.append(datetime.strptime(cleaned, '%m/%d/%Y').date())
    return parsed
def _get_case_names(self):
    """Return the portion of each item title after the first ': '."""
    names = []
    for item in self.items:
        title = item.xpath('./title/text()')[0]
        names.append(clean_if_py3(title).split(': ', 1)[1])
    return names
def _get_docket_numbers(self):
    """Return the portion of each item title before the first ': '."""
    dockets = []
    for item in self.items:
        title = item.xpath('./title/text()')[0]
        dockets.append(clean_if_py3(title).split(': ', 1)[0])
    return dockets