def test_that_all_dates_are_converted_to_dates_not_strings(self):
    """Do we properly make the ISO-8601 date strings into Python dates?"""
    # .items() instead of the Python-2-only .iteritems(); the sibling
    # tests in this file already iterate with .items().
    for reporter_name, reporter_list in REPORTERS.items():
        # reporter_name == "A."
        # reporter_list == [
        #     {'name': 'Atlantic Reporter', 'editions': ...},
        #     {'name': "Aldo's Reporter", 'editions': ...}
        # ]
        for reporter_dict in reporter_list:
            # reporter_dict == {'name': 'Atlantic Reporter'}
            for e_name, e_dates in reporter_dict['editions'].items():
                # e_name == "A. 2d"
                # e_dates == {
                #     "end": "1938-12-31T00:00:00",
                #     "start": "1885-01-01T00:00:00"
                # }
                for key in ['start', 'end']:
                    is_date_or_none = (
                        isinstance(e_dates[key], datetime.datetime)
                        or e_dates[key] is None
                    )
                    self.assertTrue(
                        is_date_or_none,
                        msg=("%s dates in the reporter '%s' appear to be "
                             "coming through as '%s'" %
                             (key, e_name, type(e_dates[key])))
                    )
def test_all_required_keys_no_extra_keys(self):
    """Are all required keys present? Are there any keys present that
    shouldn't be?
    """
    required_fields = [
        'cite_type', 'editions', 'mlz_jurisdiction', 'name', 'variations'
    ]
    optional_fields = ['publisher', 'notes', 'href']
    all_fields = required_fields + optional_fields
    for reporter_abbv, reporter_list in REPORTERS.items():
        for reporter_data in reporter_list:
            # Every required field must be present.
            for field in required_fields:
                if field not in reporter_data:
                    self.fail("Reporter '%s' lacks required field '%s'" %
                              (reporter_abbv, field))
            # Nothing beyond the known fields may be present.
            for field in reporter_data:
                self.assertIn(
                    field, all_fields,
                    "Reporter '%s' has an unknown field '%s'" %
                    (reporter_abbv, field))
            # String values must not be empty.
            for field, value in reporter_data.items():
                if isinstance(value, str):
                    self.assertTrue(
                        value != "",
                        msg="Field '%s' is empty in reporter '%s'" %
                            (field, reporter_abbv))
def test_all_required_keys_no_extra_keys(self):
    """Are all required keys present? Are there any keys present that
    shouldn't be?
    """
    required_fields = ['cite_type', 'editions', 'mlz_jurisdiction', 'name',
                       'variations']
    optional_fields = ['publisher', 'notes', 'href']
    all_fields = required_fields + optional_fields
    for reporter_abbv, reporter_list in REPORTERS.items():
        for reporter_data in reporter_list:
            # Check presence via membership instead of a key access
            # wrapped in try/except -- same outcome for a dict.
            for field in required_fields:
                if field not in reporter_data:
                    self.fail("Reporter '%s' lacks required field '%s'" % (
                        reporter_abbv, field
                    ))
            # Flag any field that is neither required nor optional.
            for field in reporter_data:
                self.assertIn(
                    field,
                    all_fields,
                    "Reporter '%s' has an unknown field '%s'" % (
                        reporter_abbv, field
                    ),
                )
def test_all_reporters_have_valid_cite_type(self):
    """Do all reporters have valid cite_type values?"""
    # Walk every reporter entry under every abbreviation and check its
    # cite_type against the known-good set.
    for abbreviation, entries in REPORTERS.items():
        for entry in entries:
            self.assertIn(
                entry['cite_type'],
                VALID_CITE_TYPES,
                "%s did not have a valid cite_type value" % abbreviation,
            )
def test_any_keys_missing_editions(self):
    """Have we added any new reporters that lack a matching edition?"""
    for key, reporters in REPORTERS.items():
        # Every reporter filed under a key must list that key among its
        # own editions.
        for reporter in reporters:
            self.assertIn(
                key,
                reporter['editions'],
                msg="Could not find edition for key: %s" % key,
            )
def test_any_keys_missing_editions(self):
    """Have we added any new reporters that lack a matching edition?"""
    # For each reporter key, each book under it must carry an edition
    # named after the key itself.
    for abbreviation, books in REPORTERS.items():
        for book in books:
            self.assertIn(abbreviation, book['editions'],
                          msg="Could not find edition for key: %s" %
                              abbreviation)
def test_for_variations_mapping_to_bad_keys(self):
    """Do we have a variation that maps to a key that doesn't exist in the
    first place?
    """
    for variation_list in VARIATIONS_ONLY.values():
        for variation in variation_list:
            # Resolve the variation through EDITIONS and make sure the
            # result is a real reporter key.
            resolved = EDITIONS[variation]
            self.assertIn(
                resolved,
                REPORTERS.keys(),
                msg="Could not map variation to a valid reporter: %s"
                    % variation
            )
def test_for_variations_mapping_to_bad_keys(self):
    """Do we have a variation that maps to a key that doesn't exist in the
    first place?
    """
    # Each variation must resolve (via EDITIONS) to an existing
    # top-level reporter key.
    for variation_group in VARIATIONS_ONLY.values():
        for var in variation_group:
            self.assertIn(
                EDITIONS[var],
                REPORTERS.keys(),
                msg="Could not map variation to a valid reporter: %s" % var)
def test_nothing_ends_before_it_starts(self):
    """Do any editions have end dates before their start dates?"""
    for reporter_group in REPORTERS.values():
        # Each value is a list of reporter dictionaries
        for reporter in reporter_group:
            # Each edition is a dict of keys that go to more dicts!
            for name, dates in reporter['editions'].items():
                # Only compare when both endpoints are known.
                if not (dates['start'] and dates['end']):
                    continue
                self.assertLessEqual(
                    dates['start'],
                    dates['end'],
                    msg="It appears that edition %s ends before it "
                        "starts." % name)
def test_nothing_ends_before_it_starts(self):
    """Do any editions have end dates before their start dates?"""
    # REPORTERS maps abbreviation -> list of reporter dicts; each
    # reporter carries an 'editions' dict of date ranges.
    for cluster in REPORTERS.values():
        for rep in cluster:
            for edition_name, span in rep['editions'].items():
                if span['start'] and span['end']:
                    self.assertLessEqual(
                        span['start'],
                        span['end'],
                        msg="It appears that edition %s ends before it "
                            "starts." % edition_name
                    )
def make_csv():
    """Export REPORTERS to reporters.csv, one row per reporter."""
    # newline='' is required when handing a file to the csv module;
    # without it, '\r\n' endings are doubled on Windows.
    with open('reporters.csv', 'w', newline='') as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()
        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                # Start from the per-edition columns, then fill the rest.
                d = make_editions_dict(reporter['editions'])
                d['citation'] = cite
                d['name'] = reporter['name']
                d['publisher'] = reporter.get('publisher', '')
                d['cite_type'] = reporter['cite_type']
                d['mlz_jurisdictions'] = ", ".join(reporter['mlz_jurisdiction'])
                d['variations'] = ", ".join(reporter['variations'].keys())
                d['href'] = reporter.get('href', '')
                d['notes'] = reporter.get('notes', '')
                out.writerow(d)
def make_csv():
    """Export REPORTERS to reporters.csv, one row per reporter."""
    # newline='' per the csv module docs -- avoids blank rows on Windows.
    with open('reporters.csv', 'w', newline='') as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()
        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter['editions'])
                d['citation'] = cite
                d['name'] = reporter['name']
                d['publisher'] = reporter.get('publisher', '')
                d['cite_type'] = reporter['cite_type']
                d['mlz_jurisdictions'] = ", ".join(
                    reporter['mlz_jurisdiction'])
                d['variations'] = ", ".join(reporter['variations'].keys())
                d['href'] = reporter.get('href', '')
                d['notes'] = reporter.get('notes', '')
                out.writerow(d)
def make_csv():
    """Export REPORTERS to reporters.csv, one row per reporter."""
    # newline="" is required by the csv module; without it line endings
    # are doubled on Windows.
    with open("reporters.csv", "w", newline="") as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()
        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter["editions"])
                d["citation"] = cite
                d["name"] = reporter["name"]
                d["publisher"] = reporter.get("publisher", "")
                d["cite_type"] = reporter["cite_type"]
                d["mlz_jurisdictions"] = ", ".join(
                    reporter["mlz_jurisdiction"])
                d["variations"] = ", ".join(reporter["variations"].keys())
                d["href"] = reporter.get("href", "")
                d["notes"] = reporter.get("notes", "")
                out.writerow(d)
def all_series():
    """Yield every edition key and variation key across all reporters.

    Fixed for Python 3: ``REPORTERS.itervalues()`` no longer exists and
    dict views cannot be concatenated with ``+``.
    """
    for reporter_list in REPORTERS.values():
        for reporter in reporter_list:
            # Editions first, then variations -- same order as the old
            # keys() + keys() concatenation.
            for k in reporter["editions"]:
                yield k
            for k in reporter["variations"]:
                yield k
def iter_reporters():
    """Yield (abbreviation, cluster, reporter_dict) for every reporter."""
    # Note: the second element is the whole cluster (list) the reporter
    # belongs to, not the reporter itself.
    for abbv, cluster in REPORTERS.items():
        for data in cluster:
            yield abbv, cluster, data
def aggregate_reporters():
    """Aggregate collected citation counts into dest_dir/aggregate.csv.

    Builds three lookup tables (FLP reporters-db, Juris-M abbreviations,
    CAP reporters.json) keyed by a normalized citation key (via
    cite_to_key), sums per-key counts from every CSV in source_dir, and
    writes rows with Count >= 100 sorted by count descending.

    NOTE(review): Python 2 only -- uses .itervalues()/.iteritems(),
    unicode(), and opens the CSV in 'wb' mode.
    """
    makedirs(dest_dir)
    aggregate = {}

    # get map of reporter key to canonical name in FLP db
    flp_keys = {}
    for reporter_list in REPORTERS.itervalues():
        for reporter in reporter_list:
            fields = [reporter['cite_type'], reporter['name']]
            # Editions map to themselves; variations map to their
            # canonical edition string.
            for k in reporter["editions"].keys():
                flp_keys[cite_to_key(k)] = fields + [k]
            for k, v in reporter["variations"].items():
                flp_keys[cite_to_key(k)] = fields + [v]

    # get map of reporter key to name in Juris-M db
    juris_keys = {}
    for json_file, label in [[
        '../lib/jurism-abbreviations/primary-us.json', 'primary'
    ], ['../lib/jurism-abbreviations/secondary-us-bluebook.json',
            'secondary']]:
        data = json.load(
            open(os.path.join(os.path.dirname(__file__), json_file)))
        for juris in data["xdata"].itervalues():
            for full_name, short_name in juris["container-title"].iteritems():
                key = cite_to_key(short_name)
                # First hit wins: primary entries are loaded before
                # secondary and are not overwritten.
                if key not in juris_keys:
                    juris_keys[key] = [label, short_name, full_name]

    # get map of reporter key to CAP reporter
    cap_keys = {}
    for reporter in json.load(
            open(
                os.path.join(os.path.dirname(__file__),
                             '../lib/reporter-list/reporters.json'))):
        key = cite_to_key(reporter['short'])
        if key not in cap_keys:
            cap_keys[key] = [reporter['reporter'], reporter['short']]

    # aggregate rows in our collected citations
    for csv_path in tqdm(sorted(glob(os.path.join(source_dir, "*.csv")))):
        csvreader = csv.DictReader(open(csv_path))
        for row in csvreader:
            key = cite_to_key(row['Series'])
            if key in aggregate:
                # Seen before: just add this file's count.
                aggregate[key]['Count'] += int(row['Count'])
            else:
                # First sighting: normalize the row and attach the three
                # lookup results (empty placeholders when unknown).
                row['Examples'] = ['', '', '']
                row['Count'] = int(row['Count'])
                row['Series'] = key
                row['FLP'] = flp_keys.get(key, ['', '', ''])
                row['juris'] = juris_keys.get(key, ['', '', ''])
                row['CAP'] = cap_keys.get(key, ['', ''])
                aggregate[key] = row
            # Prepend any non-empty examples from this row; only the
            # first three are kept at write time.
            aggregate[key]['Examples'] = [
                row['Example %s' % i] for i in [1, 2, 3]
                if row.get('Example %s' % i)
            ] + aggregate[key]['Examples']

    # write to CSV
    out = [[k, v['Count']] + v['Examples'][:3] + v['CAP'] + v['FLP'] +
           v['juris'] for k, v in aggregate.iteritems() if v['Count'] >= 100]
    out.sort(key=lambda x: x[1], reverse=True)
    with open(os.path.join(dest_dir, 'aggregate.csv'), 'wb') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([
            'Series', 'Count', 'Example 1', 'Example 2', 'Example 3',
            'CAP Cite', 'CAP Full', 'FLP Type', 'FLP Name', 'FLP Cite',
            'Juris-M Type', 'Juris-M Cite', 'Juris-M Full',
        ])
        for row in out:
            csvwriter.writerow([unicode(s).encode("utf-8") for s in row])
def all_series():
    """Yield every edition key and variation key across all reporters.

    Fixed for Python 3: ``itervalues()`` is gone and dict views do not
    support ``+`` concatenation, so the keys are yielded directly
    (editions first, then variations -- same order as before).
    """
    for reporter_list in REPORTERS.values():
        for reporter in reporter_list:
            yield from reporter["editions"]
            yield from reporter["variations"]
def _populate_reporter_extractors():
    """Populate EXTRACTORS and EDITIONS_LOOKUP.

    Builds one TokenExtractor per distinct citation regex, collected from
    reporters.json, laws.json and journals.json, plus a handful of
    one-off extractors for Id./supra/paragraph/stop-word/section tokens.
    """
    # Set up regex replacement variables from reporters-db
    raw_regex_variables = deepcopy(RAW_REGEX_VARIABLES)
    raw_regex_variables["full_cite"][""] = "$volume $reporter,? $page"
    raw_regex_variables["page"][""] = rf"(?P<page>{PAGE_NUMBER_REGEX})"
    regex_variables = process_variables(raw_regex_variables)

    def _substitute_edition(template, *edition_names):
        """Helper to replace $edition in template with edition_names."""
        edition = "|".join(re.escape(e) for e in edition_names)
        return Template(template).safe_substitute(edition=edition)

    # Extractors step one: add an extractor for each reporter string
    # Build a lookup of regex -> edition.
    # Keys in this dict will be regular expressions to handle a
    # particular reporter string, like (simplified)
    # r"(?P<volume>\d+) (?P<reporter>U\.S\.) (?P<page>\d+)"
    editions_by_regex = defaultdict(
        # Values in this dict will be:
        lambda: {
            # Exact matches. If the regex is "\d+ U.S. \d+",
            # this will be [Edition("U.S.")]
            "editions": [],
            # Variants. If the regex matches "\d+ U. S. \d+",
            # this will be [Edition("U.S.")]
            "variations": [],
            # Strings a text must contain for this regex to match.
            # If the regex is "\d+ S.E. 2d \d+",
            # this will be {"S.E. 2d"}
            "strings": set(),
            # Whether this regex results in a short cite:
            "short": False,
        }
    )

    def _add_regex(
        kind: str,
        reporters: List[str],
        edition: Edition,
        regex: str,
    ):
        """Helper to generate citations for a reporter
        and insert into editions_by_regex."""
        for reporter in reporters:
            EDITIONS_LOOKUP[reporter].append(edition)
        editions_by_regex[regex][kind].append(edition)

        # add strings
        # Only register "must contain" strings when the reporter text
        # appears literally (escaped) inside the regex.
        have_strings = re.escape(reporters[0]) in regex
        if have_strings:
            editions_by_regex[regex]["strings"].update(reporters)

        # add short cite
        # If a distinct short-cite form exists, register it under the
        # same edition and flag it as short.
        short_cite_regex = short_cite_re(regex)
        if short_cite_regex != regex:
            editions_by_regex[short_cite_regex][kind].append(edition)
            editions_by_regex[short_cite_regex]["short"] = True
            if have_strings:
                editions_by_regex[short_cite_regex]["strings"].update(
                    reporters
                )

    def _add_regexes(
        regex_templates: List[str],
        edition_name: str,
        edition: Edition,
        variations: List[str],
    ):
        """Expand regex_templates and add to editions_by_regex."""
        for regex_template in regex_templates:
            regex_template = recursive_substitute(
                regex_template, regex_variables
            )
            # One regex for the canonical edition name...
            regex = _substitute_edition(regex_template, edition_name)
            _add_regex("editions", [edition_name], edition, regex)
            # ...and one combined regex for all of its variations.
            if variations:
                regex = _substitute_edition(regex_template, *variations)
                _add_regex(
                    "variations",
                    variations,
                    edition,
                    regex,
                )

    # add reporters.json:
    for source_key, source_cluster in REPORTERS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="reporters",
            )
            variations = source["variations"]
            for edition_name, edition_data in source["editions"].items():
                edition = Edition(
                    short_name=edition_name,
                    reporter=reporter_obj,
                    start=edition_data["start"],
                    end=edition_data["end"],
                )
                regex_templates = edition_data.get("regexes") or ["$full_cite"]
                # Variations that resolve to this particular edition.
                edition_variations = [
                    k for k, v in variations.items() if v == edition_name
                ]
                _add_regexes(
                    regex_templates, edition_name, edition, edition_variations
                )

    # add laws.json
    for source_key, source_cluster in LAWS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="laws",
            )
            edition = Edition(
                short_name=source_key,
                reporter=reporter_obj,
                start=source["start"],
                end=source["end"],
            )
            regex_templates = source.get("regexes") or ["$full_cite"]
            # handle citation to multiple sections, like
            # "Mass. Gen. Laws ch. 1, §§ 2-3":
            regex_templates = [
                r.replace(r"§ ", r"§§? ?") for r in regex_templates
            ]
            _add_regexes(
                regex_templates,
                source_key,
                edition,
                source.get("variations", []),
            )

    # add journals.json
    for source_key, source_cluster in JOURNALS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="journals",
            )
            edition = Edition(
                short_name=source_key,
                reporter=reporter_obj,
                start=source["start"],
                end=source["end"],
            )
            regex_templates = source.get("regexes") or ["$full_cite"]
            _add_regexes(
                regex_templates,
                source_key,
                edition,
                source.get("variations", []),
            )

    # Add each regex to EXTRACTORS:
    for regex, cluster in editions_by_regex.items():
        EXTRACTORS.append(
            TokenExtractor(
                nonalphanum_boundaries_re(regex),
                CitationToken.from_match,
                extra={
                    "exact_editions": cluster["editions"],
                    "variation_editions": cluster["variations"],
                    "short": cluster["short"],
                },
                strings=list(cluster["strings"]),
            )
        )

    # Extractors step two:
    # Add a few one-off extractors to handle special token types
    # other than citations:
    EXTRACTORS.extend(
        [
            # Id.
            TokenExtractor(
                ID_REGEX,
                IdToken.from_match,
                flags=re.I,
                strings=["id.", "ibid."],
            ),
            # supra
            TokenExtractor(
                SUPRA_REGEX,
                SupraToken.from_match,
                flags=re.I,
                strings=["supra"],
            ),
            # paragraph
            TokenExtractor(
                PARAGRAPH_REGEX,
                ParagraphToken.from_match,
            ),
            # case name stopwords
            TokenExtractor(
                STOP_WORD_REGEX,
                StopWordToken.from_match,
                flags=re.I,
                strings=STOP_WORDS,
            ),
            # tokens containing section symbols
            TokenExtractor(
                SECTION_REGEX, SectionToken.from_match, strings=["§"]
            ),
        ]
    )
def disambiguate_reporters(
    citations: List[Union[Citation, NonopinionCitation]]
) -> List[Union[Citation, NonopinionCitation]]:
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:
     - citation.canonical_reporter
     - citation.lookup_index

    And there are a few things that can be ambiguous:
     - More than one variation.
     - More than one reporter for the key.
     - Could be an edition (or not)
     - All combinations of the above:
        - More than one variation.
        - More than one variation, with more than one reporter for the
          key.
        - More than one variation, with more than one reporter for the
          key, which is an edition.
        - More than one variation, which is an edition
        - ...

    For variants, we just need to sort out the canonical_reporter.

    If it's not possible to disambiguate the reporter, we simply have to
    drop it.
    """
    unambiguous_citations = []
    for citation in citations:
        # Only disambiguate citations with a reporter
        # (Id./supra/etc. pass through untouched).
        if not isinstance(citation, (FullCitation, ShortformCitation)):
            unambiguous_citations.append(citation)
            continue

        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            if len(REPORTERS[EDITIONS[citation.reporter]]) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue

            # Multiple books under this key, but which is correct?
            if citation.year:
                # attempt resolution by date
                possible_citations = []
                rep_len = len(REPORTERS[EDITIONS[citation.reporter]])
                for i in range(0, rep_len):
                    if is_date_in_reporter(
                        REPORTERS[EDITIONS[citation.reporter]][i]
                        ["editions"],
                        citation.year,
                    ):
                        possible_citations.append((citation.reporter, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit
                    # after filtering by year.
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue

        # Try doing a variation of an edition.
        elif VARIATIONS_ONLY.get(citation.reporter) is not None:
            if len(VARIATIONS_ONLY[citation.reporter]) == 1:
                # Only one variation -- great, use it.
                citation.canonical_reporter = EDITIONS[VARIATIONS_ONLY[
                    citation.reporter][0]]
                cached_variation = citation.reporter
                citation.reporter = VARIATIONS_ONLY[citation.reporter][0]
                if len(REPORTERS[citation.canonical_reporter]) == 1:
                    # It's a single reporter under a misspelled key.
                    citation.lookup_index = 0
                    unambiguous_citations.append(citation)
                    continue

                # Multiple reporters under a single misspelled key
                # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
                # Washington Reports).
                if citation.year:
                    # attempt resolution by date
                    possible_citations = []
                    rep_can = len(REPORTERS[citation.canonical_reporter])
                    for i in range(0, rep_can):
                        if is_date_in_reporter(
                            REPORTERS[citation.canonical_reporter][i]
                            ["editions"],
                            citation.year,
                        ):
                            possible_citations.append(
                                (citation.reporter, i))
                    if len(possible_citations) == 1:
                        # We were able to identify only one hit after
                        # filtering by year.
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue

                # Attempt resolution by unique variation
                # (e.g. Cr. can only be Cranch[0])
                possible_citations = []
                reps = REPORTERS[citation.canonical_reporter]
                for i in range(0, len(reps)):
                    for variation in REPORTERS[citation.canonical_reporter][i][
                        "variations"].items():
                        if variation[0] == cached_variation:
                            possible_citations.append((variation[1], i))
                if len(possible_citations) == 1:
                    # We were able to find a single match after filtering
                    # by variation.
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue
            else:
                # Multiple variations, deal with them.
                possible_citations = []
                for reporter_key in VARIATIONS_ONLY[citation.reporter]:
                    for i in range(0, len(REPORTERS[EDITIONS[reporter_key]])):
                        # This inner loop works regardless of the number of
                        # reporters under the key.
                        key = REPORTERS[EDITIONS[reporter_key]]
                        if citation.year:
                            cite_year = citation.year
                            if is_date_in_reporter(key[i]["editions"],
                                                   cite_year):
                                possible_citations.append((reporter_key, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit after filtering by
                    # year.
                    citation.canonical_reporter = EDITIONS[
                        possible_citations[0][0]]
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue
        # Citations that survive none of the filters above fall through
        # here and are dropped from the result.
    return unambiguous_citations
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    """Extract the filing date from a cleaned opinion HTML tree.

    Looks for dates inside <center> elements (skipping docket-number
    lines), constrained to a sane range derived from the citations'
    reporter editions; falls back to the SCOTUS date table, then the
    citations' common year, then the manual `fixes` table. Returns the
    latest date found, or [] when nothing is found.

    NOTE(review): Python 2 only -- uses the print statement and
    raw_input().
    """
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    reporter_keys = [citation.reporter for citation in citations]
    range_dates = []
    for reporter_key in reporter_keys:
        for reporter in REPORTERS.get(EDITIONS.get(reporter_key)):
            try:
                range_dates.extend(reporter["editions"][reporter_key])
            except KeyError:
                # Fails when a reporter_key points to more than one reporter, one of which doesn't have the edition
                # queried. For example, Wash. 2d isn't in REPORTERS['Wash.']['editions'][0].
                pass
    if range_dates:
        # Pad the edition range by ~20 years on each side, capped at now.
        start, end = min(range_dates) - timedelta(weeks=(20 * 52)), max(range_dates) + timedelta(weeks=20 * 52)
        if end > now():
            end = now()

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method="text", encoding="unicode")
        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in the date parser. Consequently, we purge
        # the word at, and anything after it.
        text = re.sub(" at .*", "", text)
        # The parser recognizes numbers like 121118 as a date. This corpus does not have dates in that format.
        text = re.sub("\d{5,}", "", text)
        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace("Sept.", "Sep.")
        # The parser recognizes dates like December 3, 4, 1908 as 2004-12-3 19:08.
        re_match = re.search("\d{1,2}, \d{1,2}, \d{4}", text)
        if re_match:
            # These are always date argued, thus continue.
            continue
        # The parser recognizes dates like October 12-13, 1948 as 2013-10-12, not as 1948-10-12
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        re_match = re.search("\d{1,2}-\d{1,2}, \d{4}", text)
        if re_match:
            # These are always date argued, thus continue.
            continue
        # Sometimes there's a string like: "Review Denied July 26, 2006. Skip this.
        if "denied" in text.lower():
            continue
        try:
            if range_dates:
                found = parse_dates.parse_dates(text, sane_start=start, sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # If it has unicode is crashes dateutil's parser, but is unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    scotus_dates_found = []
    if not dates and court == "scotus":
        for citation in citations:
            try:
                # Scotus dates are in the form of a list, since a single citation can refer to several dates.
                found = scotus_dates["%s %s %s" % (citation.volume, citation.reporter, citation.page)]
                if len(found) == 1:
                    scotus_dates_found.extend(found)
            except KeyError:
                pass
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all of them.
        years = set([citation.year for citation in citations if citation.year])
        if len(years) == 1:
            dates.append(datetime.datetime(list(years)[0], 1, 1))

    if not dates:
        # Last resort: the manual fixes table, optionally prompting a
        # human for the date when DEBUG asks for it.
        try:
            dates = fixes[case_path]["dates"]
        except KeyError:
            if "input_dates" in DEBUG:
                # subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                print " No date found for: file://%s" % case_path
                input_date = raw_input(" What should be here (YYYY-MM-DD)? ")
                add_fix(case_path, {"dates": [datetime.datetime.strptime(input_date, "%Y-%m-%d")]})
                dates = [datetime.datetime.strptime(input_date, "%Y-%m-%d")]
            if "log_bad_dates" in DEBUG:
                # Write the failed case out to file.
                with open("missing_dates.txt", "a") as out:
                    out.write("%s\n" % case_path)

    if dates:
        if "date" in DEBUG:
            log_print(" Using date: %s of dates found: %s" % (max(dates), dates))
        return max(dates)
    else:
        if "date" in DEBUG:
            log_print(" No dates found")
        return []
def get_editions(REPORTERS=None):
    """
    Process REPORTERS into a lookup dict from reporter string or normalized reporter
    string to edition data.

    Given:
    >>> REPORTERS = {
    ...     'Foo': [
    ...         {
    ...             'editions': {
    ...                 'Foo': {"start": datetime(1900, 1, 1), "end": datetime(1910, 1, 1)},
    ...                 'Foo 2d.': {"start": datetime(1910, 1, 1), "end": None},
    ...             },
    ...             'variations': {
    ...                 'foo': 'Foo',
    ...                 'Food': 'Foo',
    ...             }
    ...         },
    ...         {
    ...             'editions': {
    ...                 'Foo': {"start": datetime(1800, 1, 1), "end": datetime(1810, 1, 1)},
    ...             },
    ...             'variations': {
    ...                 'foo': 'Foo',
    ...                 'Fool': 'Foo',
    ...             }
    ...         },
    ...     ]
    ... }

    Each reporter string maps to its possible resolutions, sorted in reverse-end-date order:
    >>> editions = get_editions(REPORTERS)
    >>> foo1 = {'reporter': 'Foo', 'start_year': 1900, 'end': datetime(1910, 1, 1)}
    >>> foo2d = {'reporter': 'Foo 2d.', 'start_year': 1910, 'end': datetime(9999, 1, 1)}
    >>> foo2 = {'reporter': 'Foo', 'start_year': 1800, 'end': datetime(1810, 1, 1)}
    >>> assert editions == {
    ...     'Foo': [foo1, foo2],
    ...     'foo': [foo1, foo2],
    ...     'Foo 2d.': [foo2d],
    ...     'foo2d': [foo2d],
    ...     'Food': [foo1],
    ...     'food': [foo1],
    ...     'Fool': [foo2],
    ...     'fool': [foo2],
    ... }
    """
    if REPORTERS is None:
        from reporters_db import REPORTERS
    editions = defaultdict(list)
    # Sentinel dates: an edition with no end date sorts as still-running
    # (9999); a start of 1750-01-01 is treated as "start unknown" and
    # recorded as year 0.
    not_ended_date = datetime(9999, 1, 1)
    unknown_start_date = datetime(1750, 1, 1)

    def append(k, v):
        # Index the edition under both the raw string and its normalized
        # form, skipping duplicates.
        for key in (k, normalize_cite(k)):
            if v not in editions[key]:
                editions[key].append(v)

    for reporter_cluster in REPORTERS.values():
        for reporter in reporter_cluster:
            local_editions = {}
            for k, v in reporter["editions"].items():
                local_editions[k] = edition = {
                    'reporter': k,
                    'start_year': 0 if v['start'] == unknown_start_date else v['start'].year,
                    'end': v['end'] or not_ended_date,
                }
                append(k, edition)
            # Variations point at one of this reporter's own editions.
            for k, v in reporter["variations"].items():
                append(k, local_editions[v])

    # sort candidates for each string: first prefer exact matches, then editions that ended more recently
    for edition_key, candidates in editions.items():
        candidates.sort(reverse=True, key=lambda c: (c['reporter'] == edition_key, c['end']))
    return editions
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    """Extract the filing date from a cleaned opinion HTML tree.

    Searches <center> elements (skipping docket-number lines) for dates
    within a sane range derived from the citations' reporter editions;
    falls back to the SCOTUS date table, then the citations' common
    year, then the manual `fixes` table. Returns the latest date found,
    or [] when nothing is found.

    NOTE(review): Python 2 only -- print statement and raw_input().
    """
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    reporter_keys = [citation.reporter for citation in citations]
    range_dates = []
    for reporter_key in reporter_keys:
        for reporter in REPORTERS.get(EDITIONS.get(reporter_key)):
            try:
                range_dates.extend(reporter['editions'][reporter_key])
            except KeyError:
                # Fails when a reporter_key points to more than one reporter, one of which doesn't have the edition
                # queried. For example, Wash. 2d isn't in REPORTERS['Wash.']['editions'][0].
                pass
    if range_dates:
        # Pad the edition range by ~20 years on each side, capped at now.
        start, end = min(range_dates) - timedelta(
            weeks=(20 * 52)), max(range_dates) + timedelta(weeks=20 * 52)
        if end > now():
            end = now()

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method='text', encoding='unicode')
        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in the date parser. Consequently, we purge
        # the word at, and anything after it.
        text = re.sub(' at .*', '', text)
        # The parser recognizes numbers like 121118 as a date. This corpus does not have dates in that format.
        text = re.sub('\d{5,}', '', text)
        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace('Sept.', 'Sep.')
        # The parser recognizes dates like December 3, 4, 1908 as 2004-12-3 19:08.
        re_match = re.search('\d{1,2}, \d{1,2}, \d{4}', text)
        if re_match:
            # These are always date argued, thus continue.
            continue
        # The parser recognizes dates like October 12-13, 1948 as 2013-10-12, not as 1948-10-12
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        re_match = re.search('\d{1,2}-\d{1,2}, \d{4}', text)
        if re_match:
            # These are always date argued, thus continue.
            continue
        # Sometimes there's a string like: "Review Denied July 26, 2006. Skip this.
        if 'denied' in text.lower():
            continue
        try:
            if range_dates:
                found = parse_dates.parse_dates(text, sane_start=start,
                                                sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # If it has unicode is crashes dateutil's parser, but is unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    scotus_dates_found = []
    if not dates and court == 'scotus':
        for citation in citations:
            try:
                # Scotus dates are in the form of a list, since a single citation can refer to several dates.
                found = scotus_dates["%s %s %s" % (citation.volume, citation.reporter, citation.page)]
                if len(found) == 1:
                    scotus_dates_found.extend(found)
            except KeyError:
                pass
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all of them.
        years = set([citation.year for citation in citations if citation.year])
        if len(years) == 1:
            dates.append(datetime.datetime(list(years)[0], 1, 1))

    if not dates:
        # Last resort: the manual fixes table, optionally prompting a
        # human for the date when DEBUG asks for it.
        try:
            dates = fixes[case_path]['dates']
        except KeyError:
            if 'input_dates' in DEBUG:
                #subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                print ' No date found for: file://%s' % case_path
                input_date = raw_input(' What should be here (YYYY-MM-DD)? ')
                add_fix(case_path, {
                    'dates': [datetime.datetime.strptime(input_date, '%Y-%m-%d')]
                })
                dates = [datetime.datetime.strptime(input_date, '%Y-%m-%d')]
            if 'log_bad_dates' in DEBUG:
                # Write the failed case out to file.
                with open('missing_dates.txt', 'a') as out:
                    out.write('%s\n' % case_path)

    if dates:
        if 'date' in DEBUG:
            log_print(" Using date: %s of dates found: %s" % (max(dates), dates))
        return max(dates)
    else:
        if 'date' in DEBUG:
            log_print(" No dates found")
        return []
def disambiguate_reporters(citations):
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:
     - citation.canonical_reporter
     - citation.lookup_index

    And there are a few things that can be ambiguous:
     - More than one variation.
     - More than one reporter for the key.
     - Could be an edition (or not)
     - All combinations of the above:
        - More than one variation.
        - More than one variation, with more than one reporter for the
          key.
        - More than one variation, with more than one reporter for the
          key, which is an edition.
        - More than one variation, which is an edition
        - ...

    For variants, we just need to sort out the canonical_reporter.

    If it's not possible to disambiguate the reporter, we simply have to
    drop it.
    """
    unambiguous_citations = []
    for citation in citations:
        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            if len(REPORTERS[EDITIONS[citation.reporter]]) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue
            else:
                # Multiple books under this key, but which is correct?
                if citation.year:
                    # attempt resolution by date
                    possible_citations = []
                    for i in range(0, len(REPORTERS[EDITIONS[citation.reporter]])):
                        if is_date_in_reporter(REPORTERS[EDITIONS[citation.reporter]][i]["editions"], citation.year):
                            possible_citations.append((citation.reporter, i))
                    if len(possible_citations) == 1:
                        # We were able to identify only one hit after filtering by year.
                        citation.reporter = possible_citations[0][0]
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
        # Try doing a variation of an edition.
        elif VARIATIONS_ONLY.get(citation.reporter) is not None:
            if len(VARIATIONS_ONLY[citation.reporter]) == 1:
                # Only one variation -- great, use it.
                citation.canonical_reporter = EDITIONS[VARIATIONS_ONLY[citation.reporter][0]]
                cached_variation = citation.reporter
                citation.reporter = VARIATIONS_ONLY[citation.reporter][0]
                if len(REPORTERS[citation.canonical_reporter]) == 1:
                    # It's a single reporter under a misspelled key.
                    citation.lookup_index = 0
                    unambiguous_citations.append(citation)
                    continue
                else:
                    # Multiple reporters under a single misspelled key
                    # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
                    # Washington Reports).
                    if citation.year:
                        # attempt resolution by date
                        possible_citations = []
                        for i in range(0, len(REPORTERS[citation.canonical_reporter])):
                            if is_date_in_reporter(
                                    REPORTERS[citation.canonical_reporter][i]["editions"],
                                    citation.year
                            ):
                                possible_citations.append((citation.reporter, i))
                        if len(possible_citations) == 1:
                            # We were able to identify only one hit after filtering by year.
                            citation.lookup_index = possible_citations[0][1]
                            unambiguous_citations.append(citation)
                            continue
                    # Attempt resolution by unique variation (e.g. Cr. can only be Cranch[0])
                    possible_citations = []
                    for i in range(0, len(REPORTERS[citation.canonical_reporter])):
                        for variation in REPORTERS[citation.canonical_reporter][i]["variations"].items():
                            if variation[0] == cached_variation:
                                possible_citations.append((variation[1], i))
                    if len(possible_citations) == 1:
                        # We were able to find a single match after filtering by variation.
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
            else:
                # Multiple variations, deal with them.
                possible_citations = []
                for reporter_key in VARIATIONS_ONLY[citation.reporter]:
                    for i in range(0, len(REPORTERS[EDITIONS[reporter_key]])):
                        # This inner loop works regardless of the number of reporters under the key.
                        if is_date_in_reporter(REPORTERS[EDITIONS[reporter_key]][i]["editions"], citation.year):
                            possible_citations.append((reporter_key, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit after filtering by year.
                    citation.canonical_reporter = EDITIONS[possible_citations[0][0]]
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue
        # Citations that survive none of the filters above fall through
        # here and are dropped from the result.
    return unambiguous_citations