def disambiguate_reporters(
    citations: List[Union[Citation, NonopinionCitation]]
) -> List[Union[Citation, NonopinionCitation]]:
    """Return only the citations whose reporter can be pinned down.

    For every citation that carries a reporter, two attributes are worked
    out:

    - ``citation.canonical_reporter``: the key into ``REPORTERS``.
    - ``citation.lookup_index``: which entry under that key is meant.

    Sources of ambiguity, which may also occur in combination:

    - the reporter string is a variation (alternate spelling) rather than
      an edition;
    - the variation maps to more than one edition;
    - the canonical key holds more than one reporter.

    Ambiguity is resolved by the citation year where one is available, and
    for a single-edition variation also by checking which reporter entry
    lists that exact spelling. A citation that still cannot be resolved is
    dropped from the result. Citations are updated in place.
    """
    resolved = []
    for citation in citations:
        # Only disambiguate citations with a reporter; others pass through.
        if not isinstance(citation, (FullCitation, ShortformCitation)):
            resolved.append(citation)
            continue

        canonical = EDITIONS.get(citation.reporter)
        books = REPORTERS.get(canonical)
        if books is not None:
            # Non-variant items (P.R.R., A.2d, Wash., etc.)
            citation.canonical_reporter = canonical
            if len(books) == 1:
                # Single reporter under the key: nothing to decide.
                citation.lookup_index = 0
                resolved.append(citation)
            elif citation.year:
                # Several books share the key; keep those covering the year.
                hits = [
                    (citation.reporter, index)
                    for index, book in enumerate(books)
                    if is_date_in_reporter(book["editions"], citation.year)
                ]
                if len(hits) == 1:
                    citation.reporter, citation.lookup_index = hits[0]
                    resolved.append(citation)
            continue

        variants = VARIATIONS_ONLY.get(citation.reporter)
        if variants is None:
            # Neither an edition nor a known variation: drop it.
            continue

        if len(variants) != 1:
            # Multiple candidate editions for this spelling; the year is
            # the only discriminator. The loop works regardless of how
            # many reporters sit under each candidate key.
            hits = []
            for reporter_key in variants:
                candidates = REPORTERS[EDITIONS[reporter_key]]
                for index, book in enumerate(candidates):
                    if citation.year and is_date_in_reporter(
                        book["editions"], citation.year
                    ):
                        hits.append((reporter_key, index))
            if len(hits) == 1:
                reporter_key, index = hits[0]
                citation.canonical_reporter = EDITIONS[reporter_key]
                citation.reporter = reporter_key
                citation.lookup_index = index
                resolved.append(citation)
            continue

        # Exactly one edition for this spelling: adopt it, but remember the
        # spelling as written for the variation lookup further down.
        spelling_as_cited = citation.reporter
        citation.canonical_reporter = EDITIONS[variants[0]]
        citation.reporter = variants[0]
        books = REPORTERS[citation.canonical_reporter]
        if len(books) == 1:
            # It's a single reporter under a misspelled key.
            citation.lookup_index = 0
            resolved.append(citation)
            continue

        # Multiple reporters under a single misspelled key
        # (e.g. Wn.2d --> Wash --> Va Reports, Wash or Washington Reports).
        if citation.year:
            by_year = [
                index
                for index, book in enumerate(books)
                if is_date_in_reporter(book["editions"], citation.year)
            ]
            if len(by_year) == 1:
                citation.lookup_index = by_year[0]
                resolved.append(citation)
                continue

        # Last resort: the spelling may be listed by only one of the
        # reporters (e.g. Cr. can only be Cranch[0]).
        by_spelling = [
            index
            for index, book in enumerate(books)
            for alias in book["variations"]
            if alias == spelling_as_cited
        ]
        if len(by_spelling) == 1:
            citation.lookup_index = by_spelling[0]
            resolved.append(citation)

    return resolved
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    """Determine the filing date of a case from its HTML and its citations.

    Strategies are tried in order; each later one runs only when the
    earlier ones produced no date:

    1. Parse dates from the text of the ``<center>`` elements matched by
       the XPath below, constrained to a window of 20 years (20 * 52
       weeks) either side of the edition dates of the cited reporters.
    2. When ``court == 'scotus'``, look each citation up in
       ``scotus_dates``.
    3. When all citations that carry a year agree on it, use January 1st
       of that year.
    4. Use a stored entry from ``fixes``; failing that, prompt for a date
       when ``'input_dates'`` is in ``DEBUG``.

    Args:
        clean_html_tree: Parsed document; only its ``xpath()`` method is
            used here.
        citations: Citation objects. Their ``reporter``, ``year``,
            ``volume`` and ``page`` attributes are read.
        case_path: Key used for ``fixes`` / ``add_fix`` and shown in the
            prompt and the missing-dates log.
        court: Court identifier; only ``'scotus'`` is treated specially.

    Returns:
        ``max()`` of the dates found, or an empty list when none was found.
    """
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    range_dates = []
    for reporter_key in [citation.reporter for citation in citations]:
        # The chained .get() yields None for a reporter that is not a known
        # edition; such a reporter contributes nothing to the range.
        for reporter in REPORTERS.get(EDITIONS.get(reporter_key)) or []:
            try:
                range_dates.extend(reporter['editions'][reporter_key])
            except KeyError:
                # Fails when a reporter_key points to more than one
                # reporter, one of which doesn't have the edition queried.
                # For example, Wash. 2d isn't in
                # REPORTERS['Wash.']['editions'][0].
                pass
    if range_dates:
        margin = timedelta(weeks=20 * 52)
        start = min(range_dates) - margin
        end = max(range_dates) + margin
        current = now()
        if end > current:
            end = current

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method='text', encoding='unicode')

        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in
        # the date parser. Consequently, we purge the word at, and anything
        # after it.
        text = re.sub(' at .*', '', text)

        # The parser recognizes numbers like 121118 as a date. This corpus
        # does not have dates in that format.
        text = re.sub(r'\d{5,}', '', text)

        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace('Sept.', 'Sep.')

        # The parser recognizes dates like December 3, 4, 1908 as
        # 2004-12-3 19:08. These are always date argued, thus skip.
        if re.search(r'\d{1,2}, \d{1,2}, \d{4}', text):
            continue

        # The parser recognizes dates like October 12-13, 1948 as
        # 2013-10-12, not as 1948-10-12. These are always date argued.
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        if re.search(r'\d{1,2}-\d{1,2}, \d{4}', text):
            continue

        # Sometimes there's a string like: "Review Denied July 26, 2006.
        # Skip this.
        if 'denied' in text.lower():
            continue

        try:
            if range_dates:
                found = parse_dates.parse_dates(text, sane_start=start, sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # If it has unicode it crashes dateutil's parser, but is
            # unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    if not dates and court == 'scotus':
        scotus_dates_found = []
        for citation in citations:
            cite_key = "%s %s %s" % (citation.volume, citation.reporter, citation.page)
            try:
                # Scotus dates are in the form of a list, since a single
                # citation can refer to several dates.
                found = scotus_dates[cite_key]
            except KeyError:
                continue
            if len(found) == 1:
                scotus_dates_found.extend(found)
        # Only trust the table when exactly one unambiguous date came back.
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all
        # of them.
        years = {citation.year for citation in citations if citation.year}
        if len(years) == 1:
            dates.append(datetime.datetime(years.pop(), 1, 1))

    if not dates:
        try:
            dates = fixes[case_path]['dates']
        except KeyError:
            if 'input_dates' in DEBUG:
                # raw_input only exists on Python 2; input is its Python 3
                # equivalent. Resolve whichever is available.
                try:
                    read_line = raw_input  # noqa: F821
                except NameError:
                    read_line = input
                print(' No date found for: file://%s' % case_path)
                input_date = read_line(' What should be here (YYYY-MM-DD)? ')
                manual_date = datetime.datetime.strptime(input_date, '%Y-%m-%d')
                add_fix(case_path, {'dates': [manual_date]})
                dates = [manual_date]
            # NOTE(review): placed as a sibling of the 'input_dates' check,
            # so a case is logged whenever no stored fix exists -- confirm
            # this nesting is the intended one.
            if 'log_bad_dates' in DEBUG:
                # Write the failed case out to file.
                with open('missing_dates.txt', 'a') as out:
                    out.write('%s\n' % case_path)

    if dates:
        latest = max(dates)
        if 'date' in DEBUG:
            log_print(" Using date: %s of dates found: %s" % (latest, dates))
        return latest

    if 'date' in DEBUG:
        log_print(" No dates found")
    # Callers receive an empty list, not None, when nothing was found.
    return []
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    """Determine the filing date of a case from its HTML and its citations.

    Strategies are tried in order; each later one runs only when the
    earlier ones produced no date:

    1. Parse dates from the text of the ``<center>`` elements matched by
       the XPath below, constrained to a window of 20 years (20 * 52
       weeks) either side of the edition dates of the cited reporters.
    2. When ``court == "scotus"``, look each citation up in
       ``scotus_dates``.
    3. When all citations that carry a year agree on it, use January 1st
       of that year.
    4. Use a stored entry from ``fixes``; failing that, prompt for a date
       when ``"input_dates"`` is in ``DEBUG``.

    Args:
        clean_html_tree: Parsed document; only its ``xpath()`` method is
            used here.
        citations: Citation objects. Their ``reporter``, ``year``,
            ``volume`` and ``page`` attributes are read.
        case_path: Key used for ``fixes`` / ``add_fix`` and shown in the
            prompt and the missing-dates log.
        court: Court identifier; only ``"scotus"`` is treated specially.

    Returns:
        ``max()`` of the dates found, or an empty list when none was found.
    """
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    range_dates = []
    for reporter_key in [citation.reporter for citation in citations]:
        # The chained .get() yields None for a reporter that is not a known
        # edition; such a reporter contributes nothing to the range.
        for reporter in REPORTERS.get(EDITIONS.get(reporter_key)) or []:
            try:
                range_dates.extend(reporter["editions"][reporter_key])
            except KeyError:
                # Fails when a reporter_key points to more than one
                # reporter, one of which doesn't have the edition queried.
                # For example, Wash. 2d isn't in
                # REPORTERS['Wash.']['editions'][0].
                pass
    if range_dates:
        margin = timedelta(weeks=20 * 52)
        start = min(range_dates) - margin
        end = max(range_dates) + margin
        current = now()
        if end > current:
            end = current

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method="text", encoding="unicode")

        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in
        # the date parser. Consequently, we purge the word at, and anything
        # after it.
        text = re.sub(" at .*", "", text)

        # The parser recognizes numbers like 121118 as a date. This corpus
        # does not have dates in that format.
        text = re.sub(r"\d{5,}", "", text)

        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace("Sept.", "Sep.")

        # The parser recognizes dates like December 3, 4, 1908 as
        # 2004-12-3 19:08. These are always date argued, thus skip.
        if re.search(r"\d{1,2}, \d{1,2}, \d{4}", text):
            continue

        # The parser recognizes dates like October 12-13, 1948 as
        # 2013-10-12, not as 1948-10-12. These are always date argued.
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        if re.search(r"\d{1,2}-\d{1,2}, \d{4}", text):
            continue

        # Sometimes there's a string like: "Review Denied July 26, 2006.
        # Skip this.
        if "denied" in text.lower():
            continue

        try:
            if range_dates:
                found = parse_dates.parse_dates(text, sane_start=start, sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # If it has unicode it crashes dateutil's parser, but is
            # unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    if not dates and court == "scotus":
        scotus_dates_found = []
        for citation in citations:
            cite_key = "%s %s %s" % (citation.volume, citation.reporter, citation.page)
            try:
                # Scotus dates are in the form of a list, since a single
                # citation can refer to several dates.
                found = scotus_dates[cite_key]
            except KeyError:
                continue
            if len(found) == 1:
                scotus_dates_found.extend(found)
        # Only trust the table when exactly one unambiguous date came back.
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all
        # of them.
        years = {citation.year for citation in citations if citation.year}
        if len(years) == 1:
            dates.append(datetime.datetime(years.pop(), 1, 1))

    if not dates:
        try:
            dates = fixes[case_path]["dates"]
        except KeyError:
            if "input_dates" in DEBUG:
                # raw_input only exists on Python 2; input is its Python 3
                # equivalent. Resolve whichever is available.
                try:
                    read_line = raw_input  # noqa: F821
                except NameError:
                    read_line = input
                print(" No date found for: file://%s" % case_path)
                input_date = read_line(" What should be here (YYYY-MM-DD)? ")
                manual_date = datetime.datetime.strptime(input_date, "%Y-%m-%d")
                add_fix(case_path, {"dates": [manual_date]})
                dates = [manual_date]
            # NOTE(review): placed as a sibling of the "input_dates" check,
            # so a case is logged whenever no stored fix exists -- confirm
            # this nesting is the intended one.
            if "log_bad_dates" in DEBUG:
                # Write the failed case out to file.
                with open("missing_dates.txt", "a") as out:
                    out.write("%s\n" % case_path)

    if dates:
        latest = max(dates)
        if "date" in DEBUG:
            log_print(" Using date: %s of dates found: %s" % (latest, dates))
        return latest

    if "date" in DEBUG:
        log_print(" No dates found")
    # Callers receive an empty list, not None, when nothing was found.
    return []
def disambiguate_reporters(citations):
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:

    - ``citation.canonical_reporter``
    - ``citation.lookup_index``

    And there are a few things that can be ambiguous:

    - More than one variation.
    - More than one reporter for the key.
    - Could be an edition (or not).
    - All combinations of the above.

    For variants, we just need to sort out the canonical_reporter.

    Resolution uses the citation year where one is present and, for a
    variation with a single edition, the reporter entry that lists the
    spelling as cited. Date filtering is never attempted without a year.

    Args:
        citations: Citation objects exposing ``reporter`` and ``year``.
            Resolved citations are updated in place.

    Returns:
        A new list holding the citations that could be resolved. If it's
        not possible to disambiguate the reporter, the citation is dropped.
    """
    unambiguous_citations = []
    for citation in citations:
        canonical = EDITIONS.get(citation.reporter)
        reporters = REPORTERS.get(canonical)

        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if reporters is not None:
            citation.canonical_reporter = canonical
            if len(reporters) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
            elif citation.year:
                # Multiple books under this key; attempt resolution by date.
                matches = [
                    index
                    for index, reporter in enumerate(reporters)
                    if is_date_in_reporter(reporter["editions"], citation.year)
                ]
                if len(matches) == 1:
                    # Only one hit after filtering by year.
                    citation.lookup_index = matches[0]
                    unambiguous_citations.append(citation)
            continue

        # Try doing a variation of an edition.
        editions_for_variation = VARIATIONS_ONLY.get(citation.reporter)
        if editions_for_variation is None:
            # Not an edition and not a known variation: drop it.
            continue

        if len(editions_for_variation) == 1:
            # Only one variation -- great, use it. Keep the spelling as
            # cited for the unique-variation lookup below.
            cached_variation = citation.reporter
            citation.canonical_reporter = EDITIONS[editions_for_variation[0]]
            citation.reporter = editions_for_variation[0]
            reporters = REPORTERS[citation.canonical_reporter]
            if len(reporters) == 1:
                # It's a single reporter under a misspelled key.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue

            # Multiple reporters under a single misspelled key
            # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
            # Washington Reports).
            if citation.year:
                # Attempt resolution by date.
                matches = [
                    index
                    for index, reporter in enumerate(reporters)
                    if is_date_in_reporter(reporter["editions"], citation.year)
                ]
                if len(matches) == 1:
                    citation.lookup_index = matches[0]
                    unambiguous_citations.append(citation)
                    continue

            # Attempt resolution by unique variation
            # (e.g. Cr. can only be Cranch[0]).
            matches = [
                index
                for index, reporter in enumerate(reporters)
                if cached_variation in reporter["variations"]
            ]
            if len(matches) == 1:
                citation.lookup_index = matches[0]
                unambiguous_citations.append(citation)
            continue

        # Multiple variations, deal with them. The year is the only
        # discriminator here, so without one there is nothing to filter on;
        # is_date_in_reporter is only ever given a truthy year in this
        # function.
        if not citation.year:
            continue
        matches = []
        for reporter_key in editions_for_variation:
            # This inner loop works regardless of the number of reporters
            # under the key.
            for index, reporter in enumerate(REPORTERS[EDITIONS[reporter_key]]):
                if is_date_in_reporter(reporter["editions"], citation.year):
                    matches.append((reporter_key, index))
        if len(matches) == 1:
            # Only one hit after filtering by year.
            reporter_key, index = matches[0]
            citation.canonical_reporter = EDITIONS[reporter_key]
            citation.reporter = reporter_key
            citation.lookup_index = index
            unambiguous_citations.append(citation)

    return unambiguous_citations