def test_find_tc_citations(self):
    """Check that Tax Court style citations are extracted as expected.

    Each pair is (input text, expected citations); the pairs are handed to
    ``self.run_test_pairs`` for evaluation.
    """
    # Expected citation objects, built in the same order as the inputs below.
    tc_no = case_citation(page='233', reporter='T.C. No.')
    tc_memo = case_citation(
        'T.C. Memo. 2019-233',
        page='233',
        reporter='T.C. Memo.',
        volume='2019',
    )
    tc_summary_2019 = case_citation(
        'T.C. Summary Opinion 2019-233',
        page='233',
        reporter='T.C. Summary Opinion',
        volume='2019',
    )
    tc_summary_2018 = case_citation(
        'T.C. Summary Opinion 2018-133',
        page='133',
        reporter='T.C. Summary Opinion',
        volume='2018',
    )
    us_plain = case_citation(volume='1', reporter='U.S.', page='1')

    test_pairs = (
        # Atypical formatting used by Tax Court memos and opinions.
        ('the 1 T.C. No. 233', [tc_no]),
        ('word T.C. Memo. 2019-233', [tc_memo]),
        ('something T.C. Summary Opinion 2019-233', [tc_summary_2019]),
        ('T.C. Summary Opinion 2018-133', [tc_summary_2018]),
        # Only the well-formed '1 U.S. 1' is expected to be reported here.
        ('U.S. 1234 1 U.S. 1', [us_plain]),
    )
    self.run_test_pairs(test_pairs, "Tax court citation extraction")
def test_comparison(self):
    """Verify that citations built from identical attributes are equal.

    Both ``==`` and ``hash()`` must agree for the two objects.
    """
    first = case_citation(2, volume="2", reporter="U.S.", page="2")
    second = case_citation(2, volume="2", reporter="U.S.", page="2")

    print("Testing citation comparison...", end=" ")
    self.assertEqual(first, second)
    self.assertEqual(hash(first), hash(second))
    print("✓")
def test_custom_tokenizer(self):
    """Run extraction with a Tokenizer built from modified extractors.

    Every entry of ``EXTRACTORS`` is copied and its regex rewritten so that
    each escaped period also accepts a comma; the resulting tokenizer is then
    expected to find a citation in ``'1 U,S, 1'``.
    """

    def _accept_commas(extractor):
        # Work on a copy so the shared EXTRACTORS entries are left untouched.
        relaxed = copy(extractor)
        relaxed.regex = relaxed.regex.replace(r"\.", r"[.,]")
        # Presumably a cached compiled pattern; drop it so it cannot reflect
        # the old regex -- TODO confirm against the extractor class.
        if hasattr(relaxed, "_compiled_regex"):
            del relaxed._compiled_regex
        return relaxed

    tokenizer = Tokenizer([_accept_commas(item) for item in EXTRACTORS])

    test_pairs = [
        ('1 U,S, 1', [case_citation(reporter_found='U,S,')]),
    ]
    self.run_test_pairs(
        test_pairs, "Custom tokenizer", tokenizers=[tokenizer]
    )
def test_citation_resolution(self) -> None:
    """Tests whether different types of citations (i.e., full, short form,
    supra, id) resolve correctly to opinion matches.

    Each entry of ``test_pairs`` is (citations to resolve, expected mapping
    of resolved resource -> citations). Every entry runs in its own
    ``subTest`` so one failure does not hide the others.
    """
    # Opinion fixture info:
    # pk=7 is mocked with name 'Foo v. Bar' and citation '1 U.S. 1'
    # pk=8 is mocked with name 'Qwerty v. Uiop' and citation '2 F.3d 2'
    # pk=9 is mocked with name 'Lorem v. Ipsum' and citation '1 U.S. 50'
    # pk=11 is mocked with name 'Abcdef v. Ipsum' and citation '1 U.S. 999'
    opinion7 = Opinion.objects.get(pk=7)
    opinion8 = Opinion.objects.get(pk=8)
    opinion9 = Opinion.objects.get(pk=9)
    opinion11 = Opinion.objects.get(pk=11)

    full7 = case_citation(
        volume="1",
        reporter="U.S.",
        page="1",
        canonical_reporter="U.S.",
        court="scotus",
        index=1,
        reporter_found="U.S.",
    )
    full8 = case_citation(
        volume="2",
        reporter="F.3d",
        page="2",
        canonical_reporter="F.",
        court="ca1",
        index=1,
        reporter_found="F.3d",
    )
    full9 = case_citation(
        volume="1",
        reporter="U.S.",
        page="50",
        canonical_reporter="U.S.",
        court="scotus",
        index=1,
        reporter_found="U.S.",
    )
    full11 = case_citation(
        volume="1",
        reporter="U.S.",
        page="999",
        canonical_reporter="U.S.",
        court="scotus",
        index=1,
        reporter_found="U.S.",
    )
    # A full citation that corresponds to none of the fixtures listed above.
    full_na = case_citation(
        volume="1",
        reporter="U.S.",
        page="99",
        canonical_reporter="U.S.",
        court="scotus",
        index=1,
        reporter_found="U.S.",
    )

    supra7 = supra_citation(
        index=1, antecedent_guess="Bar", pin_cite="99", volume="1"
    )
    supra9_or_11 = supra_citation(
        index=1, antecedent_guess="Ipsum", pin_cite="99", volume="1"
    )

    short7 = case_citation(
        reporter="U.S.",
        page="99",
        volume="1",
        index=1,
        antecedent_guess="Bar,",
        short=True,
    )
    short7_or_9_tiebreaker = case_citation(
        reporter="U.S.",
        page="99",
        volume="1",
        index=1,
        antecedent_guess="Bar",
        short=True,
    )
    short7_or_9_bad_antecedent = case_citation(
        reporter="U.S.",
        page="99",
        volume="1",
        index=1,
        antecedent_guess="somethingwrong",
        short=True,
    )
    short9_or_11_common_antecedent = case_citation(
        reporter="U.S.",
        page="99",
        volume="1",
        index=1,
        antecedent_guess="Ipsum",
        short=True,
    )
    short_na = case_citation(
        reporter="F.3d",
        page="99",
        volume="1",
        index=1,
        antecedent_guess="somethingwrong",
        short=True,
    )

    # Named ``id_cite`` rather than ``id`` so the builtin id() is not shadowed.
    id_cite = id_citation(index=1)
    non = nonopinion_citation(index=1, source_text="§99")

    # fmt: off
    test_pairs = [
        # Simple test for matching a single, full citation
        ([full7], {opinion7: [full7]}),
        # Test matching multiple full citations to different documents
        ([full7, full8], {opinion7: [full7], opinion8: [full8]}),
        # Test matching an unmatchable full citation
        ([full_na], {NO_MATCH_RESOURCE: [full_na]}),
        # Test resolving a supra citation
        ([full7, supra7], {opinion7: [full7, supra7]}),
        # Test resolving a supra citation when its antecedent guess matches
        # two possible candidates. We expect the supra citation to not
        # be matched.
        (
            [full9, full11, supra9_or_11],
            {opinion9: [full9], opinion11: [full11]},
        ),
        # Test resolving a supra citation when the previous citation
        # match failed.
        # We expect the supra citation to not be matched.
        ([full_na, supra7], {NO_MATCH_RESOURCE: [full_na]}),
        # Test resolving a short form citation with a meaningful antecedent
        ([full7, short7], {opinion7: [full7, short7]}),
        # Test resolving a short form citation when its reporter and
        # volume match two possible candidates. We expect its antecedent
        # guess to provide the correct tiebreaker.
        (
            [full7, full9, short7_or_9_tiebreaker],
            {opinion7: [full7, short7_or_9_tiebreaker], opinion9: [full9]},
        ),
        # Test resolving a short form citation when its reporter and
        # volume match two possible candidates, and when it lacks a
        # meaningful antecedent.
        # We expect the short form citation to not be matched.
        (
            [full7, full9, short7_or_9_bad_antecedent],
            {opinion7: [full7], opinion9: [full9]},
        ),
        # Test resolving a short form citation when its reporter and
        # volume match two possible candidates, and when its antecedent
        # guess also matches multiple possibilities.
        # We expect the short form citation to not be matched.
        (
            [full9, full11, short9_or_11_common_antecedent],
            {opinion9: [full9], opinion11: [full11]},
        ),
        # Test resolving a short form citation when its reporter and
        # volume are erroneous.
        # We expect the short form citation to not be matched.
        ([full7, short_na], {opinion7: [full7]}),
        # Test resolving a short form citation when the previous citation
        # match failed.
        # We expect the short form citation to not be matched.
        ([full_na, short7], {NO_MATCH_RESOURCE: [full_na]}),
        # Test resolving an Id. citation
        ([full7, id_cite], {opinion7: [full7, id_cite]}),
        # Test resolving an Id. citation when the previous citation match
        # failed because there is no clear antecedent. We expect the Id.
        # citation to also not be matched.
        (
            [full7, short_na, id_cite],
            {opinion7: [full7]},
        ),
        # Test resolving an Id. citation when the previous citation match
        # failed because a normal full citation lookup returned nothing.
        # We expect the Id. citation to be matched to the
        # NO_MATCH_RESOURCE placeholder object.
        (
            [full7, full_na, id_cite],
            {opinion7: [full7], NO_MATCH_RESOURCE: [full_na, id_cite]},
        ),
        # Test resolving an Id. citation when the previous citation is to a
        # non-opinion document. Since we can't match those documents (yet),
        # we expect the Id. citation to also not be matched.
        (
            [full7, non, id_cite],
            {opinion7: [full7]},
        ),
        # Test resolving an Id. citation when it is the first citation
        # found. Since there is nothing before it, we expect no matches to
        # be returned.
        ([id_cite], {}),
    ]
    # fmt: on

    for citations, expected_resolutions in test_pairs:
        with self.subTest(
            f"Testing citation matching for {citations}...",
            citations=citations,
            expected_resolutions=expected_resolutions,
        ):
            # The citing opinion does not matter for this test
            citing_opinion = Opinion.objects.get(pk=1)

            citation_resolutions = do_resolve_citations(
                citations, citing_opinion
            )
            self.assertEqual(
                citation_resolutions,
                expected_resolutions,
                msg=f"\n{citation_resolutions}\n\n    !=\n\n{expected_resolutions}",
            )
from scorched.response import SolrResponse

from cl.custom_filters.templatetags.text_filters import best_case_name
from cl.lib.scorched_utils import ExtraSolrInterface, ExtraSolrSearch
from cl.lib.types import (
    MatchedResourceType,
    SearchParam,
    SupportedCitationType,
)
from cl.search.models import Opinion

DEBUG = True

QUERY_LENGTH = 10

# Placeholder resource built around a dummy citation whose source text is
# "UNMATCHED_CITATION".
NO_MATCH_RESOURCE = Resource(case_citation(0, source_text="UNMATCHED_CITATION"))


def build_date_range(start_year: int, end_year: int) -> str:
    """Build a date range to be handed off to a solr query.

    :param start_year: Year in which the range opens (on 1 January).
    :param end_year: Year in which the range closes (on 31 December).
    :return: A string of the form ``[<start>Z TO <end>Z]`` where each endpoint
        is the ``isoformat()`` rendering of the corresponding date.
    """
    start = datetime(start_year, 1, 1)
    end = datetime(end_year, 12, 31)
    date_range = f"[{start.isoformat()}Z TO {end.isoformat()}Z]"
    return date_range


def make_name_param(defendant: str, plaintiff: str = None) -> Tuple[str, int]:
    """Remove punctuation and return cleaned string plus its length in tokens."""
    token_list = defendant.split()
    if plaintiff:
        token_list.extend(plaintiff.split())
def test_disambiguate_citations(self):
    """Check how ambiguous or variant reporter abbreviations are resolved.

    Every pair below is run with the ``remove_ambiguous`` option set to
    True, which is appended to each pair before the run.
    """
    # fmt: off
    test_pairs = [
        # 1. P.R.R --> Correct abbreviation for a reporter.
        ('1 P.R.R. 1',
         [case_citation(reporter='P.R.R.')]),
        # 2. U. S. --> A simple variant to resolve.
        ('1 U. S. 1',
         [case_citation(reporter_found='U. S.')]),
        # 3. A.2d --> Not a variant, but needs to be looked up in the
        #    EDITIONS variable.
        ('1 A.2d 1',
         [case_citation(reporter='A.2d')]),
        # 4. A. 2d --> An unambiguous variant of an edition
        ('1 A. 2d 1',
         [case_citation(reporter='A.2d', reporter_found='A. 2d')]),
        # 5. P.R. --> A variant of 'Pen. & W.', 'P.R.R.', or 'P.' that's
        #    resolvable by year
        (
            '1 P.R. 1 (1831)',
            # Of the three, only Pen & W. was being published this year.
            [
                case_citation(reporter='Pen. & W.', year=1831,
                              reporter_found='P.R.')
            ]),
        # 5.1: W.2d --> A variant of an edition that either resolves to
        #      'Wis. 2d' or 'Wash. 2d' and is resolvable by year.
        (
            '1 W.2d 1 (1854)',
            # Of the two, only Wis. 2d was being published this year.
            [
                case_citation(reporter='Wis. 2d', year=1854,
                              reporter_found='W.2d')
            ]),
        # 5.2: Wash. --> A non-variant that has more than one reporter for
        #      the key, but is resolvable by year
        ('1 Wash. 1 (1890)',
         [case_citation(reporter='Wash.', year=1890)]),
        # 6. Cra. --> A variant of Cranch, which is ambiguous, except when
        #    paired with this variation.
        ('1 Cra. 1', [
            case_citation(reporter='Cranch', reporter_found='Cra.',
                          metadata={'court': 'scotus'})
        ]),
        # 7. Cranch. --> Not a variant, but could refer to either Cranch's
        #    Supreme Court cases or his DC ones. In this case, we cannot
        #    disambiguate. Years are not known, and we have no further
        #    clues. We must simply drop Cranch from the results.
        ('1 Cranch 1 1 U.S. 23',
         [case_citation(page='23')]),
        # 8. Unsolved problem. In theory, we could use parallel citations
        #    to resolve this, because Rob is getting cited next to La., but
        #    we don't currently know the proximity of citations to each
        #    other, so can't use this.
        #  - Rob. --> Either:
        #                8.1: A variant of Robards (1862-1865) or
        #                8.2: Robinson's Louisiana Reports (1841-1846) or
        #                8.3: Robinson's Virginia Reports (1842-1865)
        # ('1 Rob. 1 1 La. 1',
        # [case_citation(volume='1', reporter='Rob.', page='1'),
        #  case_citation(volume='1', reporter='La.', page='1')]),
        # 9. Johnson #1 should pass and identify the citation
        ('1 Johnson 1 (1890)', [
            case_citation(
                reporter='N.M. (J.)', reporter_found='Johnson', year=1890,
            )
        ]),
        # 10. Johnson #2 should fail to disambiguate with year alone
        ('1 Johnson 1 (1806)', []),
    ]
    # fmt: on
    # all tests in this suite require disambiguation:
    test_pairs = [
        pair + ({
            "remove_ambiguous": True
        }, ) for pair in test_pairs
    ]
    self.run_test_pairs(test_pairs, "Disambiguation")
def test_find_citations(self):
    """Can we find and make citation objects from strings?

    Each entry of ``test_pairs`` is (input text, expected citations) with an
    optional third element holding extra options (here only ``'clean'``
    steps). The whole table is handed to ``self.run_test_pairs``.
    """
    # fmt: off
    test_pairs = (
        # Basic test
        ('1 U.S. 1',
         [case_citation()]),
        # Basic test with a line break
        ('1 U.S.\n1',
         [case_citation()],
         {
             'clean': ['all_whitespace']
         }),
        # Basic test with a line break within a reporter
        ('1 U.\nS. 1',
         [case_citation(reporter_found='U. S.')],
         {
             'clean': ['all_whitespace']
         }),
        # Basic test of non-case name before citation (should not be found)
        ('lissner test 1 U.S. 1',
         [case_citation()]),
        # Test with plaintiff and defendant
        ('lissner v. test 1 U.S. 1', [
            case_citation(metadata={
                'plaintiff': 'lissner',
                'defendant': 'test'
            })
        ]),
        # Test with plaintiff, defendant and year
        ('lissner v. test 1 U.S. 1 (1982)', [
            case_citation(metadata={
                'plaintiff': 'lissner',
                'defendant': 'test'
            }, year=1982)
        ]),
        # Don't choke on misformatted year
        ('lissner v. test 1 U.S. 1 (198⁴)', [
            case_citation(metadata={
                'plaintiff': 'lissner',
                'defendant': 'test'
            })
        ]),
        # Test with different reporter than all of above.
        ('bob lissner v. test 1 F.2d 1 (1982)', [
            case_citation(reporter='F.2d', year=1982, metadata={
                'plaintiff': 'lissner',
                'defendant': 'test'
            })
        ]),
        # Test with comma after defendant's name
        ('lissner v. test, 1 U.S. 1 (1982)', [
            case_citation(metadata={
                'plaintiff': 'lissner',
                'defendant': 'test'
            }, year=1982)
        ]),
        # Test with court and extra information
        ('bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)', [
            case_citation(page='12', year=1982, metadata={
                'plaintiff': 'lissner',
                'defendant': 'test',
                'court': 'ca4',
                'pin_cite': '347-348'
            })
        ]),
        # Parallel cite with parenthetical
        ('bob lissner v. test 1 U.S. 12, 347-348, 1 S. Ct. 2, 358 (4th Cir. 1982) (overruling foo)', [
            case_citation(page='12', year=1982, metadata={
                'plaintiff': 'lissner',
                'defendant': 'test',
                'court': 'ca4',
                'pin_cite': '347-348',
                'extra': "1 S. Ct. 2, 358",
                'parenthetical': 'overruling foo'
            }),
            case_citation(page='2', reporter='S. Ct.', year=1982, metadata={
                'plaintiff': 'lissner',
                'defendant': 'test 1 U.S. 12, 347-348',
                'court': 'ca4',
                'pin_cite': '358',
                'parenthetical': 'overruling foo'
            }),
        ]),
        # Test full citation with nested parenthetical
        ('lissner v. test 1 U.S. 1 (1982) (discussing abc (Holmes, J., concurring))', [
            case_citation(metadata={
                'plaintiff': 'lissner',
                'defendant': 'test',
                'parenthetical': 'discussing abc (Holmes, J., concurring)'
            }, year=1982)
        ]),
        # Test full citation with parenthetical and subsequent unrelated parenthetical
        ('lissner v. test 1 U.S. 1 (1982) (discussing abc); blah (something).', [
            case_citation(metadata={
                'plaintiff': 'lissner',
                'defendant': 'test',
                'parenthetical': 'discussing abc'
            }, year=1982)
        ]),
        # Test with text before and after and a variant reporter
        ('asfd 22 U. S. 332 (1975) asdf', [
            case_citation(page='332', volume='22', reporter_found='U. S.',
                          year=1975)
        ]),
        # Test with finding reporter when it's a second edition
        ('asdf 22 A.2d 332 asdf',
         [case_citation(page='332', reporter='A.2d', volume='22')]),
        # Test if reporter in string will find proper citation string
        ('A.2d 332 11 A.2d 333',
         [case_citation(page='333', reporter='A.2d', volume='11')]),
        # Test finding a variant second edition reporter
        ('asdf 22 A. 2d 332 asdf', [
            case_citation(page='332', reporter='A.2d', volume='22',
                          reporter_found='A. 2d')
        ]),
        # Test finding a variant of an edition resolvable by variant alone.
        ('171 Wn.2d 1016', [
            case_citation(page='1016', reporter='Wash. 2d', volume='171',
                          reporter_found='Wn.2d')
        ]),
        # Test finding two citations where one of them has abutting
        # punctuation.
        ('2 U.S. 3, 4-5 (3 Atl. 33)', [
            case_citation(page='3', volume='2', metadata={'pin_cite': '4-5'}),
            case_citation(page='33', reporter="A.", volume='3',
                          reporter_found="Atl.")
        ]),
        # Test with the page number as a Roman numeral
        ('12 Neb. App. lxiv (2004)', [
            case_citation(page='lxiv', reporter='Neb. Ct. App.', volume='12',
                          reporter_found='Neb. App.', year=2004)
        ]),
        # Test with page range with a weird suffix
        ('559 N.W.2d 826|N.D.',
         [case_citation(page='826', reporter='N.W.2d', volume='559')]),
        # Test with malformed/missing page number
        ('1 U.S. f24601', []),
        # Test with the 'digit-REPORTER-digit' corner-case formatting
        ('2007-NMCERT-008', [
            case_citation(source_text='2007-NMCERT-008', page='008',
                          reporter='NMCERT', volume='2007')
        ]),
        ('2006-Ohio-2095', [
            case_citation(source_text='2006-Ohio-2095', page='2095',
                          reporter='Ohio', volume='2006')
        ]),
        ('2017 IL App (4th) 160407', [
            case_citation(page='160407', reporter='IL App (4th)',
                          volume='2017')
        ]),
        ('2017 IL App (1st) 143684-B', [
            case_citation(page='143684-B', reporter='IL App (1st)',
                          volume='2017')
        ]),
        # Test first kind of short form citation (meaningless antecedent)
        ('before asdf 1 U. S., at 2', [
            case_citation(page='2', reporter_found='U. S.', short=True,
                          metadata={'antecedent_guess': 'asdf'})
        ]),
        # Test second kind of short form citation (meaningful antecedent)
        ('before asdf, 1 U. S., at 2', [
            case_citation(page='2', reporter='U.S.', reporter_found='U. S.',
                          short=True, metadata={'antecedent_guess': 'asdf'})
        ]),
        # Test short form citation with a preceding quotation mark (the
        # input uses a curly, non-ASCII closing quote)
        ('before asdf,” 1 U. S., at 2',
         [case_citation(page='2', reporter_found='U. S.', short=True)]),
        # Test short form citation when case name looks like a reporter
        ('before Johnson, 1 U. S., at 2', [
            case_citation(page='2', reporter_found='U. S.', short=True,
                          metadata={'antecedent_guess': 'Johnson'})
        ]),
        # Test short form citation with no comma after reporter
        ('before asdf, 1 U. S. at 2', [
            case_citation(page='2', reporter='U.S.', reporter_found='U. S.',
                          short=True, metadata={'antecedent_guess': 'asdf'})
        ]),
        # Test short form citation at end of document (issue #1171)
        ('before asdf, 1 U. S. end', []),
        # Test supra citation across line break
        ('before asdf, supra,\nat 2', [
            supra_citation("supra,", metadata={
                'pin_cite': 'at 2',
                'antecedent_guess': 'asdf'
            })
        ], {
            'clean': ['all_whitespace']
        }),
        # Test short form citation with a page range
        ('before asdf, 1 U. S., at 20-25', [
            case_citation(page='20', reporter_found='U. S.', short=True,
                          metadata={
                              'pin_cite': '20-25',
                              'antecedent_guess': 'asdf'
                          })
        ]),
        # Test short form citation with a page range with weird suffix
        ('before asdf, 1 U. S., at 20-25\\& n. 4', [
            case_citation(page='20', reporter_found='U. S.', short=True,
                          metadata={
                              'pin_cite': '20-25',
                              'antecedent_guess': 'asdf'
                          })
        ]),
        # Test short form citation with a parenthetical
        ('before asdf, 1 U. S., at 2 (overruling xyz)', [
            case_citation(page='2', reporter='U.S.', reporter_found='U. S.',
                          short=True, metadata={
                              'antecedent_guess': 'asdf',
                              'parenthetical': 'overruling xyz'
                          })
        ]),
        # Test short form citation with no space before parenthetical
        ('before asdf, 1 U. S., at 2(overruling xyz)', [
            case_citation(page='2', reporter='U.S.', reporter_found='U. S.',
                          short=True, metadata={
                              'antecedent_guess': 'asdf',
                              'parenthetical': 'overruling xyz'
                          })
        ]),
        # Test short form citation with nested parentheticals
        ('before asdf, 1 U. S., at 2 (discussing xyz (Holmes, J., concurring))', [
            case_citation(page='2', reporter='U.S.', reporter_found='U. S.',
                          short=True, metadata={
                              'antecedent_guess': 'asdf',
                              'parenthetical': 'discussing xyz (Holmes, J., concurring)'
                          })
        ]),
        # Test that short form citation doesn't treat year as parenthetical
        ('before asdf, 1 U. S., at 2 (2016)', [
            case_citation(page='2', reporter='U.S.', reporter_found='U. S.',
                          short=True, metadata={'antecedent_guess': 'asdf'})
        ]),
        # Test short form citation with page range and parenthetical
        ('before asdf, 1 U. S., at 20-25 (overruling xyz)', [
            case_citation(page='20', reporter='U.S.', reporter_found='U. S.',
                          short=True, metadata={
                              'antecedent_guess': 'asdf',
                              'pin_cite': '20-25',
                              'parenthetical': 'overruling xyz'
                          })
        ]),
        # Test short form citation with subsequent unrelated parenthetical
        ('asdf, 1 U. S., at 4 (discussing abc). Some other nonsense (clarifying nonsense)', [
            case_citation(page='4', reporter='U.S.', reporter_found='U. S.',
                          short=True, metadata={
                              'antecedent_guess': 'asdf',
                              'parenthetical': 'discussing abc'
                          })
        ]),
        # Test short form citation generated from non-standard regex for full cite
        ('1 Mich. at 1',
         [case_citation(reporter='Mich.', short=True)]),
        # Test parenthetical matching with multiple citations
        ('1 U. S., at 2. foo v. bar 3 U. S. 4 (2010) (overruling xyz).', [
            case_citation(page='2', reporter='U.S.', reporter_found='U. S.',
                          short=True, volume='1',
                          metadata={'pin_cite': '2'}),
            case_citation(page='4', reporter='U.S.', reporter_found='U. S.',
                          short=False, year=2010, volume='3', metadata={
                              'parenthetical': 'overruling xyz',
                              'plaintiff': 'foo',
                              'defendant': 'bar'
                          })
        ]),
        # Test with multiple citations and parentheticals
        ('1 U. S., at 2 (criticizing xyz). foo v. bar 3 U. S. 4 (2010) (overruling xyz).', [
            case_citation(page='2', reporter='U.S.', reporter_found='U. S.',
                          short=True, volume='1', metadata={
                              'pin_cite': '2',
                              'parenthetical': 'criticizing xyz'
                          }),
            case_citation(page='4', reporter='U.S.', reporter_found='U. S.',
                          short=False, year=2010, volume='3', metadata={
                              'parenthetical': 'overruling xyz',
                              'plaintiff': 'foo',
                              'defendant': 'bar'
                          })
        ]),
        # Test first kind of supra citation (standard kind)
        ('before asdf, supra, at 2', [
            supra_citation("supra,", metadata={
                'pin_cite': 'at 2',
                'antecedent_guess': 'asdf'
            })
        ]),
        # Test second kind of supra citation (with volume)
        ('before asdf, 123 supra, at 2', [
            supra_citation("supra,", metadata={
                'pin_cite': 'at 2',
                'volume': '123',
                'antecedent_guess': 'asdf'
            })
        ]),
        # Test third kind of supra citation (sans page)
        ('before asdf, supra, foo bar',
         [supra_citation("supra,", metadata={'antecedent_guess': 'asdf'})]),
        # Test third kind of supra citation (with period)
        ('before asdf, supra. foo bar',
         [supra_citation("supra,", metadata={'antecedent_guess': 'asdf'})]),
        # Test supra citation at end of document (issue #1171)
        ('before asdf, supra end',
         [supra_citation("supra,", metadata={'antecedent_guess': 'asdf'})]),
        # Supra with parenthetical
        ('Foo, supra (overruling ...) (ignore this)', [
            supra_citation("supra", metadata={
                'antecedent_guess': 'Foo',
                'parenthetical': 'overruling ...'
            })
        ]),
        ('Foo, supra, at 2 (overruling ...)', [
            supra_citation("supra", metadata={
                'antecedent_guess': 'Foo',
                'pin_cite': 'at 2',
                'parenthetical': 'overruling ...'
            })
        ]),
        # Test Ibid. citation
        ('foo v. bar 1 U.S. 12. asdf. Ibid. foo bar lorem ipsum.', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar'
            }),
            id_citation('Ibid.')
        ]),
        # Test italicized Ibid. citation
        ('<p>before asdf. <i>Ibid.</i></p> <p>foo bar lorem</p>',
         [id_citation('Ibid.')],
         {
             'clean': ['html', 'inline_whitespace']
         }),
        # Test Id. citation
        ('foo v. bar 1 U.S. 12, 347-348. asdf. Id., at 123. foo bar', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar',
                'pin_cite': '347-348'
            }),
            id_citation('Id.,', metadata={'pin_cite': 'at 123'})
        ]),
        # Test Id. citation across line break
        ('foo v. bar 1 U.S. 12, 347-348. asdf. Id.,\nat 123. foo bar', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar',
                'pin_cite': '347-348'
            }),
            id_citation('Id.,', metadata={'pin_cite': 'at 123'})
        ], {
            'clean': ['all_whitespace']
        }),
        # Test italicized Id. citation
        ('<p>before asdf. <i>Id.,</i> at 123.</p> <p>foo bar</p>',
         [id_citation('Id.,', metadata={'pin_cite': 'at 123'})],
         {
             'clean': ['html', 'inline_whitespace']
         }),
        # Test italicized Id. citation with another HTML tag in the way
        ('<p>before asdf. <i>Id.,</i> at <b>123.</b></p> <p>foo bar</p>',
         [id_citation('Id.,', metadata={'pin_cite': 'at 123'})],
         {
             'clean': ['html', 'inline_whitespace']
         }),
        # Test weirder Id. citations (#1344)
        ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. ¶ 34. foo bar', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar',
                'pin_cite': '347-348'
            }),
            id_citation('Id.', metadata={'pin_cite': '¶ 34'})
        ]),
        ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. at 62-63, 67-68. f b', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar',
                'pin_cite': '347-348'
            }),
            id_citation('Id.', metadata={'pin_cite': 'at 62-63, 67-68'})
        ]),
        ('foo v. bar 1 U.S. 12, 347-348. asdf. Id., at *10. foo bar', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar',
                'pin_cite': '347-348'
            }),
            id_citation('Id.,', metadata={'pin_cite': 'at *10'})
        ]),
        ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. at 7-9, ¶¶ 38-53. f b', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar',
                'pin_cite': '347-348'
            }),
            id_citation('Id.', metadata={'pin_cite': 'at 7-9, ¶¶ 38-53'})
        ]),
        ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. at pp. 45, 64. foo bar', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar',
                'pin_cite': '347-348'
            }),
            id_citation('Id.', metadata={'pin_cite': 'at pp. 45, 64'})
        ]),
        ('foo v. bar 1 U.S. 12, 347-348. asdf. id. 119:12-14. foo bar', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar',
                'pin_cite': '347-348'
            }),
            id_citation('id.', metadata={'pin_cite': '119:12-14'})
        ]),
        # Test Id. citation without page number
        ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. No page number.', [
            case_citation(page='12', metadata={
                'plaintiff': 'foo',
                'defendant': 'bar',
                'pin_cite': '347-348'
            }),
            id_citation('Id.')
        ]),
        # Id. with parenthetical
        ('Id. (overruling ...) (ignore this)', [
            id_citation("Id.", metadata={'parenthetical': 'overruling ...'})
        ]),
        ('Id. at 2 (overruling ...)', [
            id_citation("Id.", metadata={
                'pin_cite': 'at 2',
                'parenthetical': 'overruling ...'
            })
        ]),
        # Test non-opinion citation
        ('lorem ipsum see §99 of the U.S. code.',
         [nonopinion_citation('§99')]),
        # Test address that's not a citation (#1338)
        (
            'lorem 111 S.W. 12th St.',
            [],
        ),
        (
            'lorem 111 N. W. 12th St.',
            [],
        ),
        # Test Conn. Super. Ct. regex variation.
        ('Failed to recognize 1993 Conn. Super. Ct. 5243-P', [
            case_citation(volume='1993', reporter='Conn. Super. Ct.',
                          page='5243-P')
        ]),
        # Test that the tokenizer handles commas after a reporter. In the
        # past, " U. S. " would match but not " U. S., "
        ('foo 1 U.S., 1 bar',
         [case_citation()]),
        # Test reporter with custom regex
        ('blah blah Bankr. L. Rep. (CCH) P12,345. blah blah', [
            case_citation(volume=None, reporter='Bankr. L. Rep.',
                          reporter_found='Bankr. L. Rep. (CCH)',
                          page='12,345')
        ]),
        ('blah blah, 2009 12345 (La.App. 1 Cir. 05/10/10). blah blah', [
            case_citation(volume='2009', reporter='La.App. 1 Cir.',
                          page='12345',
                          groups={'date_filed': '05/10/10'})
        ]),
        # Token scanning edge case -- incomplete paren at end of input
        ('1 U.S. 1 (',
         [case_citation()]),
        # Token scanning edge case -- missing plaintiff name at start of input
        ('v. Bar, 1 U.S. 1',
         [case_citation(metadata={'defendant': 'Bar'})]),
        # Token scanning edge case -- short form start of input
        ('1 U.S., at 1',
         [case_citation(short=True)]),
        (', 1 U.S., at 1',
         [case_citation(short=True)]),
        # Token scanning edge case -- supra at start of input
        ('supra.',
         [supra_citation("supra.")]),
        (', supra.',
         [supra_citation("supra.")]),
        ('123 supra.',
         [supra_citation("supra.", metadata={'volume': "123"})]),
        # Token scanning edge case -- Id. at end of input
        ('Id.',
         [id_citation('Id.,')]),
        ('Id. at 1.',
         [id_citation('Id.,', metadata={'pin_cite': 'at 1'})]),
        ('Id. foo',
         [id_citation('Id.,')]),
        # Reject citations that are part of larger words
        (
            'foo1 U.S. 1, 1. U.S. 1foo',
            [],
        ),
        # Long pin cite -- make sure no catastrophic backtracking in regex
        ('1 U.S. 1, 2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2286, 2287, 2288, 2289, 2290, 2291', [
            case_citation(
                metadata={
                    'pin_cite': '2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2286, 2287, 2288, 2289, 2290, 2291'
                })
        ]),
    )
    # fmt: on
    self.run_test_pairs(test_pairs, "Citation extraction")
from cl.custom_filters.templatetags.text_filters import best_case_name from cl.lib.scorched_utils import ExtraSolrInterface, ExtraSolrSearch from cl.lib.types import ( MatchedResourceType, SearchParam, SupportedCitationType, ) from cl.search.models import Opinion DEBUG = True QUERY_LENGTH = 10 NO_MATCH_RESOURCE = Resource( case_citation(0, source_text="UNMATCHED_CITATION") ) def build_date_range(start_year: int, end_year: int) -> str: """Build a date range to be handed off to a solr query.""" start = datetime(start_year, 1, 1) end = datetime(end_year, 12, 31) date_range = "[%sZ TO %sZ]" % (start.isoformat(), end.isoformat()) return date_range def make_name_param(defendant: str, plaintiff: str = None) -> Tuple[str, int]: """Remove punctuation and return cleaned string plus its length in tokens.""" token_list = defendant.split() if plaintiff: