示例#1
0
 def test_find_tc_citations(self):
     """Check that Tax Court style citations are extracted as expected.

     Each entry pairs an input string with the list of citations that
     should be found in it; the pairs are handed to ``run_test_pairs``.
     """
     # fmt: off
     tc_cases = (
         # Tax Court cites with atypical formatting (T.C. No., Memos,
         # Summary Opinions)
         ('the 1 T.C. No. 233', [
             case_citation(page='233', reporter='T.C. No.'),
         ]),
         ('word T.C. Memo. 2019-233', [
             case_citation(
                 'T.C. Memo. 2019-233',
                 page='233',
                 reporter='T.C. Memo.',
                 volume='2019',
             ),
         ]),
         ('something T.C. Summary Opinion 2019-233', [
             case_citation(
                 'T.C. Summary Opinion 2019-233',
                 page='233',
                 reporter='T.C. Summary Opinion',
                 volume='2019',
             ),
         ]),
         ('T.C. Summary Opinion 2018-133', [
             case_citation(
                 'T.C. Summary Opinion 2018-133',
                 page='133',
                 reporter='T.C. Summary Opinion',
                 volume='2018',
             ),
         ]),
         # A leading reporter-like fragment followed by a regular U.S. cite;
         # only the complete citation is expected.
         ('U.S. 1234 1 U.S. 1', [
             case_citation(volume='1', reporter='U.S.', page='1'),
         ]),
     )
     # fmt: on
     self.run_test_pairs(tc_cases, "Tax court citation extraction")
示例#2
0
 def test_comparison(self):
     """Citations built from identical attributes must compare equal and
     produce the same hash."""
     first = case_citation(2, volume="2", reporter="U.S.", page="2")
     second = case_citation(2, volume="2", reporter="U.S.", page="2")

     print("Testing citation comparison...", end=" ")
     self.assertEqual(first, second)
     self.assertEqual(hash(first), hash(second))
     print("✓")
示例#3
0
    def test_custom_tokenizer(self):
        """Extract citations using a tokenizer built from modified extractors.

        Every extractor in ``EXTRACTORS`` is copied and its regex rewritten
        so that each escaped period also accepts a comma; the resulting
        tokenizer is then expected to find '1 U,S, 1'.
        """

        def accept_commas(extractor):
            # Work on a copy so the shared extractor is left alone.
            relaxed = copy(extractor)
            relaxed.regex = relaxed.regex.replace(r"\.", r"[.,]")
            # Drop a previously compiled pattern if the copy carries one --
            # presumably so it is rebuilt from the edited regex.
            if hasattr(relaxed, "_compiled_regex"):
                del relaxed._compiled_regex
            return relaxed

        tokenizer = Tokenizer([accept_commas(item) for item in EXTRACTORS])

        # fmt: off
        test_pairs = [
            ('1 U,S, 1', [case_citation(reporter_found='U,S,')]),
        ]
        # fmt: on
        self.run_test_pairs(
            test_pairs,
            "Custom tokenizer",
            tokenizers=[tokenizer],
        )
示例#4
0
    def test_citation_resolution(self) -> None:
        """Tests whether different types of citations (i.e., full, short form,
        supra, id) resolve correctly to opinion matches.

        Each test pair holds a list of citations and the mapping that
        ``do_resolve_citations`` is expected to return for them: matched
        resource (an opinion fixture or ``NO_MATCH_RESOURCE``) -> list of
        citations resolved to that resource. Citations that cannot be
        resolved at all are expected to be absent from the mapping.
        """
        # Opinion fixture info:
        # pk=7 is mocked with name 'Foo v. Bar' and citation '1 U.S. 1'
        # pk=8 is mocked with name 'Qwerty v. Uiop' and citation '2 F.3d 2'
        # pk=9 is mocked with name 'Lorem v. Ipsum' and citation '1 U.S. 50'
        # pk=11 is mocked with name 'Abcdef v. Ipsum' and citation '1 U.S. 999'
        opinion7 = Opinion.objects.get(pk=7)
        opinion8 = Opinion.objects.get(pk=8)
        opinion9 = Opinion.objects.get(pk=9)
        opinion11 = Opinion.objects.get(pk=11)

        # The citing opinion does not matter for this test, so look it up
        # once instead of once per test pair.
        citing_opinion = Opinion.objects.get(pk=1)

        full7 = case_citation(
            volume="1",
            reporter="U.S.",
            page="1",
            canonical_reporter="U.S.",
            court="scotus",
            index=1,
            reporter_found="U.S.",
        )
        full8 = case_citation(
            volume="2",
            reporter="F.3d",
            page="2",
            canonical_reporter="F.",
            court="ca1",
            index=1,
            reporter_found="F.3d",
        )
        full9 = case_citation(
            volume="1",
            reporter="U.S.",
            page="50",
            canonical_reporter="U.S.",
            court="scotus",
            index=1,
            reporter_found="U.S.",
        )
        full11 = case_citation(
            volume="1",
            reporter="U.S.",
            page="999",
            canonical_reporter="U.S.",
            court="scotus",
            index=1,
            reporter_found="U.S.",
        )
        # Full citation that corresponds to none of the fixtures above.
        full_na = case_citation(
            volume="1",
            reporter="U.S.",
            page="99",
            canonical_reporter="U.S.",
            court="scotus",
            index=1,
            reporter_found="U.S.",
        )

        supra7 = supra_citation(index=1,
                                antecedent_guess="Bar",
                                pin_cite="99",
                                volume="1")
        supra9_or_11 = supra_citation(index=1,
                                      antecedent_guess="Ipsum",
                                      pin_cite="99",
                                      volume="1")

        short7 = case_citation(
            reporter="U.S.",
            page="99",
            volume="1",
            index=1,
            antecedent_guess="Bar,",
            short=True,
        )
        short7_or_9_tiebreaker = case_citation(
            reporter="U.S.",
            page="99",
            volume="1",
            index=1,
            antecedent_guess="Bar",
            short=True,
        )
        short7_or_9_bad_antecedent = case_citation(
            reporter="U.S.",
            page="99",
            volume="1",
            index=1,
            antecedent_guess="somethingwrong",
            short=True,
        )
        short9_or_11_common_antecedent = case_citation(
            reporter="U.S.",
            page="99",
            volume="1",
            index=1,
            antecedent_guess="Ipsum",
            short=True,
        )
        short_na = case_citation(
            reporter="F.3d",
            page="99",
            volume="1",
            index=1,
            antecedent_guess="somethingwrong",
            short=True,
        )

        # Named ``id_cite`` rather than ``id`` to avoid shadowing the builtin.
        id_cite = id_citation(index=1)
        non = nonopinion_citation(index=1, source_text="§99")

        # fmt: off
        test_pairs = [
            # Simple test for matching a single, full citation
            ([full7], {
                opinion7: [full7]
            }),
            # Test matching multiple full citations to different documents
            ([full7, full8], {
                opinion7: [full7],
                opinion8: [full8]
            }),
            # Test matching an unmatchable full citation
            ([full_na], {
                NO_MATCH_RESOURCE: [full_na]
            }),
            # Test resolving a supra citation
            ([full7, supra7], {
                opinion7: [full7, supra7]
            }),
            # Test resolving a supra citation when its antecedent guess matches
            # two possible candidates. We expect the supra citation to not
            # be matched.
            (
                [full9, full11, supra9_or_11],
                {
                    opinion9: [full9],
                    opinion11: [full11]
                },
            ),
            # Test resolving a supra citation when the previous citation
            # match failed.
            # We expect the supra citation to not be matched.
            ([full_na, supra7], {
                NO_MATCH_RESOURCE: [full_na]
            }),
            # Test resolving a short form citation with a meaningful antecedent
            ([full7, short7], {
                opinion7: [full7, short7]
            }),
            # Test resolving a short form citation when its reporter and
            # volume match two possible candidates. We expect its antecedent
            # guess to provide the correct tiebreaker.
            (
                [full7, full9, short7_or_9_tiebreaker],
                {
                    opinion7: [full7, short7_or_9_tiebreaker],
                    opinion9: [full9]
                },
            ),
            # Test resolving a short form citation when its reporter and
            # volume match two possible candidates, and when it lacks a
            # meaningful antecedent.
            # We expect the short form citation to not be matched.
            (
                [full7, full9, short7_or_9_bad_antecedent],
                {
                    opinion7: [full7],
                    opinion9: [full9]
                },
            ),
            # Test resolving a short form citation when its reporter and
            # volume match two possible candidates, and when its antecedent
            # guess also matches multiple possibilities.
            # We expect the short form citation to not be matched.
            (
                [full9, full11, short9_or_11_common_antecedent],
                {
                    opinion9: [full9],
                    opinion11: [full11]
                },
            ),
            # Test resolving a short form citation when its reporter and
            # volume are erroneous.
            # We expect the short form citation to not be matched.
            ([full7, short_na], {
                opinion7: [full7]
            }),
            # Test resolving a short form citation when the previous citation
            # match failed.
            # We expect the short form citation to not be matched.
            ([full_na, short7], {
                NO_MATCH_RESOURCE: [full_na]
            }),
            # Test resolving an Id. citation
            ([full7, id_cite], {
                opinion7: [full7, id_cite]
            }),
            # Test resolving an Id. citation when the previous citation match
            # failed because there is no clear antecedent. We expect the Id.
            # citation to also not be matched.
            (
                [full7, short_na, id_cite],
                {
                    opinion7: [full7]
                },
            ),
            # Test resolving an Id. citation when the previous citation match
            # failed because a normal full citation lookup returned nothing.
            # We expect the Id. citation to be matched to the
            # NO_MATCH_RESOURCE placeholder object.
            (
                [full7, full_na, id_cite],
                {
                    opinion7: [full7],
                    NO_MATCH_RESOURCE: [full_na, id_cite]
                },
            ),
            # Test resolving an Id. citation when the previous citation is to a
            # non-opinion document. Since we can't match those documents (yet),
            # we expect the Id. citation to also not be matched.
            (
                [full7, non, id_cite],
                {
                    opinion7: [full7]
                },
            ),
            # Test resolving an Id. citation when it is the first citation
            # found. Since there is nothing before it, we expect no matches to
            # be returned.
            ([id_cite], {}),
        ]
        # fmt: on

        for citations, expected_resolutions in test_pairs:
            # Each pair runs as its own subtest so one failing scenario does
            # not hide the results of the others.
            with self.subTest(
                    f"Testing citation matching for {citations}...",
                    citations=citations,
                    expected_resolutions=expected_resolutions,
            ):
                citation_resolutions = do_resolve_citations(
                    citations, citing_opinion)

                self.assertEqual(
                    citation_resolutions,
                    expected_resolutions,
                    msg=
                    f"\n{citation_resolutions}\n\n    !=\n\n{expected_resolutions}",
                )
示例#5
0
from scorched.response import SolrResponse

from cl.custom_filters.templatetags.text_filters import best_case_name
from cl.lib.scorched_utils import ExtraSolrInterface, ExtraSolrSearch
from cl.lib.types import (
    MatchedResourceType,
    SearchParam,
    SupportedCitationType,
)
from cl.search.models import Opinion

# NOTE(review): module-level debug switch; the code that reads it is not
# visible in this section -- confirm what it enables before changing it.
DEBUG = True

# NOTE(review): presumably a cap on the size of a search query (tokens or
# results); verify against the code that consumes it.
QUERY_LENGTH = 10

# Sentinel resource wrapping a dummy citation whose source text is
# "UNMATCHED_CITATION". Presumably the key under which citations that could
# not be matched to any opinion are grouped -- confirm against the resolver.
NO_MATCH_RESOURCE = Resource(case_citation(0,
                                           source_text="UNMATCHED_CITATION"))


def build_date_range(start_year: int, end_year: int) -> str:
    """Return a bracketed ``[<start>Z TO <end>Z]`` range string for a solr query.

    The range starts at January 1 of ``start_year`` and ends at December 31
    of ``end_year``; both bounds are midnight timestamps in ISO format with
    a literal ``Z`` appended.
    """
    lower_bound = datetime(start_year, 1, 1).isoformat()
    upper_bound = datetime(end_year, 12, 31).isoformat()
    return "[" + lower_bound + "Z TO " + upper_bound + "Z]"


def make_name_param(defendant: str, plaintiff: str = None) -> Tuple[str, int]:
    """Remove punctuation and return cleaned string plus its length in tokens."""
    token_list = defendant.split()
    if plaintiff:
        token_list.extend(plaintiff.split())
示例#6
0
 def test_disambiguate_citations(self):
     """Check extraction results for reporter strings that need
     disambiguation.

     Every case is run with the ``remove_ambiguous`` option set to True.
     """
     # fmt: off
     cases = [
         # 1. P.R.R. --> Correct abbreviation for a reporter.
         ('1 P.R.R. 1', [
             case_citation(reporter='P.R.R.'),
         ]),
         # 2. U. S. --> A simple variant to resolve.
         ('1 U. S. 1', [
             case_citation(reporter_found='U. S.'),
         ]),
         # 3. A.2d --> Not a variant, but needs to be looked up in the
         #    EDITIONS variable.
         ('1 A.2d 1', [
             case_citation(reporter='A.2d'),
         ]),
         # 4. A. 2d --> An unambiguous variant of an edition.
         ('1 A. 2d 1', [
             case_citation(reporter='A.2d', reporter_found='A. 2d'),
         ]),
         # 5. P.R. --> A variant of 'Pen. & W.', 'P.R.R.', or 'P.' that is
         #    resolvable by year. Of the three, only Pen. & W. was being
         #    published this year.
         ('1 P.R. 1 (1831)', [
             case_citation(
                 reporter='Pen. & W.',
                 year=1831,
                 reporter_found='P.R.',
             ),
         ]),
         # 5.1: W.2d --> A variant of an edition that resolves to either
         #      'Wis. 2d' or 'Wash. 2d' and is resolvable by year. Of the
         #      two, only Wis. 2d was being published this year.
         ('1 W.2d 1 (1854)', [
             case_citation(
                 reporter='Wis. 2d',
                 year=1854,
                 reporter_found='W.2d',
             ),
         ]),
         # 5.2: Wash. --> A non-variant that has more than one reporter for
         #      the key, but is resolvable by year.
         ('1 Wash. 1 (1890)', [
             case_citation(reporter='Wash.', year=1890),
         ]),
         # 6. Cra. --> A variant of Cranch. Cranch is ambiguous, except
         #    when paired with this variation.
         ('1 Cra. 1', [
             case_citation(
                 reporter='Cranch',
                 reporter_found='Cra.',
                 metadata={'court': 'scotus'},
             ),
         ]),
         # 7. Cranch. --> Not a variant, but could refer to either Cranch's
         #    Supreme Court cases or his DC ones. In this case, we cannot
         #    disambiguate. Years are not known, and we have no further
         #    clues. We must simply drop Cranch from the results.
         ('1 Cranch 1 1 U.S. 23', [
             case_citation(page='23'),
         ]),
         # 8. Unsolved problem. In theory, we could use parallel citations
         #    to resolve this, because Rob is getting cited next to La., but
         #    we don't currently know the proximity of citations to each
         #    other, so can't use this.
         #  - Rob. --> Either:
         #                8.1: A variant of Robards (1862-1865) or
         #                8.2: Robinson's Louisiana Reports (1841-1846) or
         #                8.3: Robinson's Virginia Reports (1842-1865)
         # ('1 Rob. 1 1 La. 1',
         # [case_citation(volume='1', reporter='Rob.', page='1'),
         #  case_citation(volume='1', reporter='La.', page='1')]),
         # 9. Johnson #1 should pass and identify the citation.
         ('1 Johnson 1 (1890)', [
             case_citation(
                 reporter='N.M. (J.)',
                 reporter_found='Johnson',
                 year=1890,
             ),
         ]),
         # 10. Johnson #2 should fail to disambiguate with year alone.
         ('1 Johnson 1 (1806)', []),
     ]
     # fmt: on
     # All cases in this suite require disambiguation, so append the option
     # dict (a fresh one per case) as the third element of each tuple.
     test_pairs = [(*case, {"remove_ambiguous": True}) for case in cases]
     self.run_test_pairs(test_pairs, "Disambiguation")
示例#7
0
 def test_find_citations(self):
     """Can we find and make citation objects from strings?"""
     # fmt: off
     test_pairs = (
         # Basic test
         ('1 U.S. 1', [case_citation()]),
         # Basic test with a line break
         ('1 U.S.\n1', [case_citation()], {
             'clean': ['all_whitespace']
         }),
         # Basic test with a line break within a reporter
         ('1 U.\nS. 1', [case_citation(reporter_found='U. S.')], {
             'clean': ['all_whitespace']
         }),
         # Basic test of non-case name before citation (should not be found)
         ('lissner test 1 U.S. 1', [case_citation()]),
         # Test with plaintiff and defendant
         ('lissner v. test 1 U.S. 1', [
             case_citation(metadata={
                 'plaintiff': 'lissner',
                 'defendant': 'test'
             })
         ]),
         # Test with plaintiff, defendant and year
         ('lissner v. test 1 U.S. 1 (1982)', [
             case_citation(metadata={
                 'plaintiff': 'lissner',
                 'defendant': 'test'
             },
                           year=1982)
         ]),
         # Don't choke on misformatted year
         ('lissner v. test 1 U.S. 1 (198⁴)', [
             case_citation(metadata={
                 'plaintiff': 'lissner',
                 'defendant': 'test'
             })
         ]),
         # Test with different reporter than all of above.
         ('bob lissner v. test 1 F.2d 1 (1982)', [
             case_citation(reporter='F.2d',
                           year=1982,
                           metadata={
                               'plaintiff': 'lissner',
                               'defendant': 'test'
                           })
         ]),
         # Test with comma after defendant's name
         ('lissner v. test, 1 U.S. 1 (1982)', [
             case_citation(metadata={
                 'plaintiff': 'lissner',
                 'defendant': 'test'
             },
                           year=1982)
         ]),
         # Test with court and extra information
         ('bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)', [
             case_citation(page='12',
                           year=1982,
                           metadata={
                               'plaintiff': 'lissner',
                               'defendant': 'test',
                               'court': 'ca4',
                               'pin_cite': '347-348'
                           })
         ]),
         # Parallel cite with parenthetical
         ('bob lissner v. test 1 U.S. 12, 347-348, 1 S. Ct. 2, 358 (4th Cir. 1982) (overruling foo)',
          [
              case_citation(page='12',
                            year=1982,
                            metadata={
                                'plaintiff': 'lissner',
                                'defendant': 'test',
                                'court': 'ca4',
                                'pin_cite': '347-348',
                                'extra': "1 S. Ct. 2, 358",
                                'parenthetical': 'overruling foo'
                            }),
              case_citation(page='2',
                            reporter='S. Ct.',
                            year=1982,
                            metadata={
                                'plaintiff': 'lissner',
                                'defendant': 'test 1 U.S. 12, 347-348',
                                'court': 'ca4',
                                'pin_cite': '358',
                                'parenthetical': 'overruling foo'
                            }),
          ]),
         # Test full citation with nested parenthetical
         ('lissner v. test 1 U.S. 1 (1982) (discussing abc (Holmes, J., concurring))',
          [
              case_citation(metadata={
                  'plaintiff':
                  'lissner',
                  'defendant':
                  'test',
                  'parenthetical':
                  'discussing abc (Holmes, J., concurring)'
              },
                            year=1982)
          ]),
         # Test full citation with parenthetical and subsequent unrelated parenthetical
         ('lissner v. test 1 U.S. 1 (1982) (discussing abc); blah (something).',
          [
              case_citation(metadata={
                  'plaintiff': 'lissner',
                  'defendant': 'test',
                  'parenthetical': 'discussing abc'
              },
                            year=1982)
          ]),
         # Test with text before and after and a variant reporter
         ('asfd 22 U. S. 332 (1975) asdf', [
             case_citation(page='332',
                           volume='22',
                           reporter_found='U. S.',
                           year=1975)
         ]),
         # Test with finding reporter when it's a second edition
         ('asdf 22 A.2d 332 asdf',
          [case_citation(page='332', reporter='A.2d', volume='22')]),
         # Test if reporter in string will find proper citation string
         ('A.2d 332 11 A.2d 333',
          [case_citation(page='333', reporter='A.2d', volume='11')]),
         # Test finding a variant second edition reporter
         ('asdf 22 A. 2d 332 asdf', [
             case_citation(page='332',
                           reporter='A.2d',
                           volume='22',
                           reporter_found='A. 2d')
         ]),
         # Test finding a variant of an edition resolvable by variant alone.
         ('171 Wn.2d 1016', [
             case_citation(page='1016',
                           reporter='Wash. 2d',
                           volume='171',
                           reporter_found='Wn.2d')
         ]),
         # Test finding two citations where one of them has abutting
         # punctuation.
         ('2 U.S. 3, 4-5 (3 Atl. 33)', [
             case_citation(page='3',
                           volume='2',
                           metadata={'pin_cite': '4-5'}),
             case_citation(page='33',
                           reporter="A.",
                           volume='3',
                           reporter_found="Atl.")
         ]),
         # Test with the page number as a Roman numeral
         ('12 Neb. App. lxiv (2004)', [
             case_citation(page='lxiv',
                           reporter='Neb. Ct. App.',
                           volume='12',
                           reporter_found='Neb. App.',
                           year=2004)
         ]),
         # Test with page range with a weird suffix
         ('559 N.W.2d 826|N.D.',
          [case_citation(page='826', reporter='N.W.2d', volume='559')]),
         # Test with malformed/missing page number
         ('1 U.S. f24601', []),
         # Test with the 'digit-REPORTER-digit' corner-case formatting
         ('2007-NMCERT-008', [
             case_citation(source_text='2007-NMCERT-008',
                           page='008',
                           reporter='NMCERT',
                           volume='2007')
         ]),
         ('2006-Ohio-2095', [
             case_citation(source_text='2006-Ohio-2095',
                           page='2095',
                           reporter='Ohio',
                           volume='2006')
         ]),
         ('2017 IL App (4th) 160407', [
             case_citation(page='160407',
                           reporter='IL App (4th)',
                           volume='2017')
         ]),
         ('2017 IL App (1st) 143684-B', [
             case_citation(page='143684-B',
                           reporter='IL App (1st)',
                           volume='2017')
         ]),
         # Test first kind of short form citation (meaningless antecedent)
         ('before asdf 1 U. S., at 2', [
             case_citation(page='2',
                           reporter_found='U. S.',
                           short=True,
                           metadata={'antecedent_guess': 'asdf'})
         ]),
         # Test second kind of short form citation (meaningful antecedent)
         ('before asdf, 1 U. S., at 2', [
             case_citation(page='2',
                           reporter='U.S.',
                           reporter_found='U. S.',
                           short=True,
                           metadata={'antecedent_guess': 'asdf'})
         ]),
         # Test short form citation with preceding ASCII quotation
         ('before asdf,” 1 U. S., at 2',
          [case_citation(page='2', reporter_found='U. S.', short=True)]),
         # Test short form citation when case name looks like a reporter
         ('before Johnson, 1 U. S., at 2', [
             case_citation(page='2',
                           reporter_found='U. S.',
                           short=True,
                           metadata={'antecedent_guess': 'Johnson'})
         ]),
         # Test short form citation with no comma after reporter
         ('before asdf, 1 U. S. at 2', [
             case_citation(page='2',
                           reporter='U.S.',
                           reporter_found='U. S.',
                           short=True,
                           metadata={'antecedent_guess': 'asdf'})
         ]),
         # Test short form citation at end of document (issue #1171)
         ('before asdf, 1 U. S. end', []),
         # Test supra citation across line break
         ('before asdf, supra,\nat 2', [
             supra_citation("supra,",
                            metadata={
                                'pin_cite': 'at 2',
                                'antecedent_guess': 'asdf'
                            })
         ], {
             'clean': ['all_whitespace']
         }),
         # Test short form citation with a page range
         ('before asdf, 1 U. S., at 20-25', [
             case_citation(page='20',
                           reporter_found='U. S.',
                           short=True,
                           metadata={
                               'pin_cite': '20-25',
                               'antecedent_guess': 'asdf'
                           })
         ]),
         # Test short form citation with a page range with weird suffix
         ('before asdf, 1 U. S., at 20-25\\& n. 4', [
             case_citation(page='20',
                           reporter_found='U. S.',
                           short=True,
                           metadata={
                               'pin_cite': '20-25',
                               'antecedent_guess': 'asdf'
                           })
         ]),
         # Test short form citation with a parenthetical
         ('before asdf, 1 U. S., at 2 (overruling xyz)', [
             case_citation(page='2',
                           reporter='U.S.',
                           reporter_found='U. S.',
                           short=True,
                           metadata={
                               'antecedent_guess': 'asdf',
                               'parenthetical': 'overruling xyz'
                           })
         ]),
         # Test short form citation with no space before parenthetical
         ('before asdf, 1 U. S., at 2(overruling xyz)', [
             case_citation(page='2',
                           reporter='U.S.',
                           reporter_found='U. S.',
                           short=True,
                           metadata={
                               'antecedent_guess': 'asdf',
                               'parenthetical': 'overruling xyz'
                           })
         ]),
         # Test short form citation with nested parentheticals
         ('before asdf, 1 U. S., at 2 (discussing xyz (Holmes, J., concurring))',
          [
              case_citation(page='2',
                            reporter='U.S.',
                            reporter_found='U. S.',
                            short=True,
                            metadata={
                                'antecedent_guess':
                                'asdf',
                                'parenthetical':
                                'discussing xyz (Holmes, J., concurring)'
                            })
          ]),
         # Test that short form citation doesn't treat year as parenthetical
         ('before asdf, 1 U. S., at 2 (2016)', [
             case_citation(page='2',
                           reporter='U.S.',
                           reporter_found='U. S.',
                           short=True,
                           metadata={'antecedent_guess': 'asdf'})
         ]),
         # Test short form citation with page range and parenthetical
         ('before asdf, 1 U. S., at 20-25 (overruling xyz)', [
             case_citation(page='20',
                           reporter='U.S.',
                           reporter_found='U. S.',
                           short=True,
                           metadata={
                               'antecedent_guess': 'asdf',
                               'pin_cite': '20-25',
                               'parenthetical': 'overruling xyz'
                           })
         ]),
         # Test short form citation with subsequent unrelated parenthetical
         ('asdf, 1 U. S., at 4 (discussing abc). Some other nonsense (clarifying nonsense)',
          [
              case_citation(page='4',
                            reporter='U.S.',
                            reporter_found='U. S.',
                            short=True,
                            metadata={
                                'antecedent_guess': 'asdf',
                                'parenthetical': 'discussing abc'
                            })
          ]),
         # Test short form citation generated from non-standard regex for full cite
         ('1 Mich. at 1', [case_citation(reporter='Mich.', short=True)]),
         # Test parenthetical matching with multiple citations
         ('1 U. S., at 2. foo v. bar 3 U. S. 4 (2010) (overruling xyz).', [
             case_citation(page='2',
                           reporter='U.S.',
                           reporter_found='U. S.',
                           short=True,
                           volume='1',
                           metadata={'pin_cite': '2'}),
             case_citation(page='4',
                           reporter='U.S.',
                           reporter_found='U. S.',
                           short=False,
                           year=2010,
                           volume='3',
                           metadata={
                               'parenthetical': 'overruling xyz',
                               'plaintiff': 'foo',
                               'defendant': 'bar'
                           })
         ]),
         # Test with multiple citations and parentheticals
         ('1 U. S., at 2 (criticizing xyz). foo v. bar 3 U. S. 4 (2010) (overruling xyz).',
          [
              case_citation(page='2',
                            reporter='U.S.',
                            reporter_found='U. S.',
                            short=True,
                            volume='1',
                            metadata={
                                'pin_cite': '2',
                                'parenthetical': 'criticizing xyz'
                            }),
              case_citation(page='4',
                            reporter='U.S.',
                            reporter_found='U. S.',
                            short=False,
                            year=2010,
                            volume='3',
                            metadata={
                                'parenthetical': 'overruling xyz',
                                'plaintiff': 'foo',
                                'defendant': 'bar'
                            })
          ]),
         # Test first kind of supra citation (standard kind)
         ('before asdf, supra, at 2', [
             supra_citation("supra,",
                            metadata={
                                'pin_cite': 'at 2',
                                'antecedent_guess': 'asdf'
                            })
         ]),
         # Test second kind of supra citation (with volume)
         ('before asdf, 123 supra, at 2', [
             supra_citation("supra,",
                            metadata={
                                'pin_cite': 'at 2',
                                'volume': '123',
                                'antecedent_guess': 'asdf'
                            })
         ]),
         # Test third kind of supra citation (sans page)
         ('before asdf, supra, foo bar',
          [supra_citation("supra,", metadata={'antecedent_guess':
                                              'asdf'})]),
         # Test third kind of supra citation (with period)
         ('before asdf, supra. foo bar',
          [supra_citation("supra,", metadata={'antecedent_guess':
                                              'asdf'})]),
         # Test supra citation at end of document (issue #1171)
         ('before asdf, supra end',
          [supra_citation("supra,", metadata={'antecedent_guess':
                                              'asdf'})]),
         # Supra with parenthetical
         ('Foo, supra (overruling ...) (ignore this)', [
             supra_citation("supra",
                            metadata={
                                'antecedent_guess': 'Foo',
                                'parenthetical': 'overruling ...'
                            })
         ]),
         ('Foo, supra, at 2 (overruling ...)', [
             supra_citation("supra",
                            metadata={
                                'antecedent_guess': 'Foo',
                                'pin_cite': 'at 2',
                                'parenthetical': 'overruling ...'
                            })
         ]),
         # Test Ibid. citation
         ('foo v. bar 1 U.S. 12. asdf. Ibid. foo bar lorem ipsum.', [
             case_citation(page='12',
                           metadata={
                               'plaintiff': 'foo',
                               'defendant': 'bar'
                           }),
             id_citation('Ibid.')
         ]),
         # Test italicized Ibid. citation
         ('<p>before asdf. <i>Ibid.</i></p> <p>foo bar lorem</p>',
          [id_citation('Ibid.')], {
              'clean': ['html', 'inline_whitespace']
          }),
         # Test Id. citation
         ('foo v. bar 1 U.S. 12, 347-348. asdf. Id., at 123. foo bar', [
             case_citation(page='12',
                           metadata={
                               'plaintiff': 'foo',
                               'defendant': 'bar',
                               'pin_cite': '347-348'
                           }),
             id_citation('Id.,', metadata={'pin_cite': 'at 123'})
         ]),
         # Test Id. citation across line break
         ('foo v. bar 1 U.S. 12, 347-348. asdf. Id.,\nat 123. foo bar', [
             case_citation(page='12',
                           metadata={
                               'plaintiff': 'foo',
                               'defendant': 'bar',
                               'pin_cite': '347-348'
                           }),
             id_citation('Id.,', metadata={'pin_cite': 'at 123'})
         ], {
             'clean': ['all_whitespace']
         }),
         # Test italicized Id. citation
         ('<p>before asdf. <i>Id.,</i> at 123.</p> <p>foo bar</p>',
          [id_citation('Id.,', metadata={'pin_cite': 'at 123'})], {
              'clean': ['html', 'inline_whitespace']
          }),
         # Test italicized Id. citation with another HTML tag in the way
         ('<p>before asdf. <i>Id.,</i> at <b>123.</b></p> <p>foo bar</p>',
          [id_citation('Id.,', metadata={'pin_cite': 'at 123'})], {
              'clean': ['html', 'inline_whitespace']
          }),
         # Test weirder Id. citations (#1344)
         ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. ¶ 34. foo bar', [
             case_citation(page='12',
                           metadata={
                               'plaintiff': 'foo',
                               'defendant': 'bar',
                               'pin_cite': '347-348'
                           }),
             id_citation('Id.', metadata={'pin_cite': '¶ 34'})
         ]),
         ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. at 62-63, 67-68. f b', [
             case_citation(page='12',
                           metadata={
                               'plaintiff': 'foo',
                               'defendant': 'bar',
                               'pin_cite': '347-348'
                           }),
             id_citation('Id.', metadata={'pin_cite': 'at 62-63, 67-68'})
         ]),
         ('foo v. bar 1 U.S. 12, 347-348. asdf. Id., at *10. foo bar', [
             case_citation(page='12',
                           metadata={
                               'plaintiff': 'foo',
                               'defendant': 'bar',
                               'pin_cite': '347-348'
                           }),
             id_citation('Id.,', metadata={'pin_cite': 'at *10'})
         ]),
         ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. at 7-9, ¶¶ 38-53. f b',
          [
              case_citation(page='12',
                            metadata={
                                'plaintiff': 'foo',
                                'defendant': 'bar',
                                'pin_cite': '347-348'
                            }),
              id_citation('Id.', metadata={'pin_cite': 'at 7-9, ¶¶ 38-53'})
          ]),
         ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. at pp. 45, 64. foo bar',
          [
              case_citation(page='12',
                            metadata={
                                'plaintiff': 'foo',
                                'defendant': 'bar',
                                'pin_cite': '347-348'
                            }),
              id_citation('Id.', metadata={'pin_cite': 'at pp. 45, 64'})
          ]),
         ('foo v. bar 1 U.S. 12, 347-348. asdf. id. 119:12-14. foo bar', [
             case_citation(page='12',
                           metadata={
                               'plaintiff': 'foo',
                               'defendant': 'bar',
                               'pin_cite': '347-348'
                           }),
             id_citation('id.', metadata={'pin_cite': '119:12-14'})
         ]),
         # Test Id. citation without page number
         ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. No page number.', [
             case_citation(page='12',
                           metadata={
                               'plaintiff': 'foo',
                               'defendant': 'bar',
                               'pin_cite': '347-348'
                           }),
             id_citation('Id.')
         ]),
         # Id. with parenthetical
         ('Id. (overruling ...) (ignore this)', [
             id_citation("Id.",
                         metadata={'parenthetical': 'overruling ...'})
         ]),
         ('Id. at 2 (overruling ...)', [
             id_citation("Id.",
                         metadata={
                             'pin_cite': 'at 2',
                             'parenthetical': 'overruling ...'
                         })
         ]),
         # Test non-opinion citation
         ('lorem ipsum see §99 of the U.S. code.',
          [nonopinion_citation('§99')]),
         # Test address that's not a citation (#1338)
         (
             'lorem 111 S.W. 12th St.',
             [],
         ),
         (
             'lorem 111 N. W. 12th St.',
             [],
         ),
         # Test Conn. Super. Ct. regex variation.
         ('Failed to recognize 1993 Conn. Super. Ct. 5243-P', [
             case_citation(volume='1993',
                           reporter='Conn. Super. Ct.',
                           page='5243-P')
         ]),
         # Test that the tokenizer handles commas after a reporter. In the
         # past, " U. S. " would match but not " U. S., "
         ('foo 1 U.S., 1 bar', [case_citation()]),
         # Test reporter with custom regex
         ('blah blah Bankr. L. Rep. (CCH) P12,345. blah blah', [
             case_citation(volume=None,
                           reporter='Bankr. L. Rep.',
                           reporter_found='Bankr. L. Rep. (CCH)',
                           page='12,345')
         ]),
         ('blah blah, 2009 12345 (La.App. 1 Cir. 05/10/10). blah blah', [
             case_citation(volume='2009',
                           reporter='La.App. 1 Cir.',
                           page='12345',
                           groups={'date_filed': '05/10/10'})
         ]),
         # Token scanning edge case -- incomplete paren at end of input
         ('1 U.S. 1 (', [case_citation()]),
         # Token scanning edge case -- missing plaintiff name at start of input
         ('v. Bar, 1 U.S. 1',
          [case_citation(metadata={'defendant': 'Bar'})]),
         # Token scanning edge case -- short form start of input
         ('1 U.S., at 1', [case_citation(short=True)]),
         (', 1 U.S., at 1', [case_citation(short=True)]),
         # Token scanning edge case -- supra at start of input
         ('supra.', [supra_citation("supra.")]),
         (', supra.', [supra_citation("supra.")]),
         ('123 supra.',
          [supra_citation("supra.", metadata={'volume': "123"})]),
         # Token scanning edge case -- Id. at end of input
         ('Id.', [id_citation('Id.,')]),
         ('Id. at 1.', [id_citation('Id.,', metadata={'pin_cite':
                                                      'at 1'})]),
         ('Id. foo', [id_citation('Id.,')]),
         # Reject citations that are part of larger words
         (
             'foo1 U.S. 1, 1. U.S. 1foo',
             [],
         ),
         # Long pin cite -- make sure no catastrophic backtracking in regex
         ('1 U.S. 1, 2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2286, 2287, 2288, 2289, 2290, 2291',
          [
              case_citation(
                  metadata={
                      'pin_cite':
                      '2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2286, 2287, 2288, 2289, 2290, 2291'
                  })
          ]),
     )
     # fmt: on
     self.run_test_pairs(test_pairs, "Citation extraction")
示例#8
0
from cl.custom_filters.templatetags.text_filters import best_case_name
from cl.lib.scorched_utils import ExtraSolrInterface, ExtraSolrSearch
from cl.lib.types import (
    MatchedResourceType,
    SearchParam,
    SupportedCitationType,
)
from cl.search.models import Opinion

# Module-level debug switch, hard-coded on.
# NOTE(review): its consumers are outside this view — confirm that leaving it
# enabled is intended.
DEBUG = True

# NOTE(review): presumably an upper bound on how many tokens go into a
# search query — confirm against the code that reads it.
QUERY_LENGTH = 10

# Shared Resource built around a placeholder citation whose source text is
# "UNMATCHED_CITATION".
# NOTE(review): presumably the bucket for citations that could not be matched
# to any opinion — confirm against callers.
NO_MATCH_RESOURCE = Resource(
    case_citation(0, source_text="UNMATCHED_CITATION")
)


def build_date_range(start_year: int, end_year: int) -> str:
    """Return a bracketed date-range string for use in a solr query.

    The lower bound is January 1 of ``start_year`` and the upper bound is
    December 31 of ``end_year``. Both are rendered in ISO 8601 form with a
    literal ``Z`` appended, joined as ``[<lower>Z TO <upper>Z]``.
    """
    lower_bound = datetime(start_year, 1, 1).isoformat()
    upper_bound = datetime(end_year, 12, 31).isoformat()
    return f"[{lower_bound}Z TO {upper_bound}Z]"


def make_name_param(defendant: str, plaintiff: str = None) -> Tuple[str, int]:
    """Remove punctuation and return cleaned string plus its length in tokens."""
    token_list = defendant.split()
    if plaintiff: