Exemplo n.º 1
0
def test_extract_citations(case_factory, tmpdir, settings, elasticsearch):
    """End-to-end check of the citation-extraction pipeline.

    Builds a single case (decided in 2000) whose body contains both
    legitimate and bogus citations, runs the extract-all task, and
    verifies that exactly the legitimate cites land in ExtractedCitation
    with the expected normalized forms.
    """
    from scripts.extract_cites import EDITIONS as processed_editions
    settings.MISSED_CITATIONS_DIR = str(tmpdir)
    # Reporters/variations whose every edition started publishing after
    # 2000 cannot be cited by a case decided in 2000, so cites to them
    # must NOT be extracted.
    blocked_by_date = set(
        k for k in list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys())
        if all(c['start_year'] > 2000 for c in processed_editions[k]))
    # Entries are either "cite" (expected stored as-is) or a
    # ["raw form", "expected normalized form"] pair.
    legitimate_cites = [
        "225 F. Supp. 552",  # correct
        ["125 f supp 152", "125 F. Supp. 152"],  # normalized
        ["125 Burnett (Wis.) 152", "125 Bur. 152"],  # normalized
        ["1 F. 2d 2", "1 F.2d 2"],  # not matched as "1 F. 2"
        "2 1/2 Mass. 1",  # special volume numbers
        "3 Suppl. Mass. 2",  # special volume numbers
        "1 La.App. 5 Cir. 2",  # not matched as "1 La.App. 5"
        "2000 WL 12345",  # vendor cite
    ]
    # Add one cite per known edition and per variation that isn't
    # blocked by the decision date.
    legitimate_cites += [
        "1 %s 1" % c for c in EDITIONS.keys() if c not in blocked_by_date
    ]
    legitimate_cites += [["1 %s 1" % k, "1 %s 1" % v]
                         for k, vv in VARIATIONS_ONLY.items() for v in vv
                         if k not in blocked_by_date]
    # Expected normalized forms: second element of a pair, else the cite.
    legitimate_cites_normalized = set(
        normalize_cite(c if type(c) is str else c[1])
        for c in legitimate_cites)
    # Raw forms that get embedded in the case text: first element of a pair.
    legitimate_cites = [
        c if type(c) is str else c[0] for c in legitimate_cites
    ]
    illegitimate_cites = [
        "2 Dogs 3",  # unrecognized reporter
        "3 Dogs 4",  # duplicate unrecognized reporter
        "1 or 2",  # not matched as 1 Or. 2
        "word1 Mass. 2word",  # not matched if part of larger word
        "1 Mass.\n 2",  # no match across newlines
        "1 A.3d 1",  # no match to reporter that started publishing in 2010
    ]
    illegitimate_cites += ["1 %s 1" % c for c in blocked_by_date]
    case = case_factory(
        body_cache__text=", some text, ".join(legitimate_cites +
                                              illegitimate_cites),
        decision_date=datetime(2000, 1, 1))
    fabfile.extract_all_citations()
    update_elasticsearch_from_queue()

    # check extracted cites
    cites = list(ExtractedCitation.objects.all())
    cite_set = set(c.cite for c in cites)
    normalized_cite_set = set(c.normalized_cite for c in cites)
    assert cite_set == set(legitimate_cites)
    assert normalized_cite_set == legitimate_cites_normalized
    assert all(c.cited_by_id == case.pk for c in cites)
Exemplo n.º 2
0
def get_citations(text, html=True, do_post_citation=True, do_defendant=True):
    """Extract citation objects from a document.

    :param text: the document text (HTML when ``html`` is True).
    :param html: when True, strip markup via get_visible_text() first.
    :param do_post_citation: also parse the data that follows a citation.
    :param do_defendant: also parse the defendant name before a citation.
    :return: list of citation objects; citations with no court that use a
        SCOTUS reporter get court "scotus".
    """
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Build the reporter lookup once, as a set: O(1) membership tests, no
    # per-iteration list concatenation, and it avoids the Python 3
    # TypeError from adding two dict_keys views.
    reporter_keys = set(EDITIONS) | set(VARIATIONS_ONLY)
    # Exclude first and last tokens when looking for reporters, because valid
    # citations must have a volume before and a page number after the reporter.
    for i in range(1, len(words) - 1):
        # Find reporter
        if words[i] not in reporter_keys:
            continue
        citation = extract_base_citation(words, i)
        if citation is None:
            # Not a valid citation; continue looking
            continue
        if do_post_citation:
            add_post_citation(citation, words, i)
        if do_defendant:
            add_defendant(citation, words, i)
        citations.append(citation)

    # Disambiguate or drop all the reporters
    citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = "scotus"

    return citations
Exemplo n.º 3
0
def tokenize(text):
    """Tokenize text using regular expressions in the following steps:
     - Split the text by the occurrences of patterns which match a federal
       reporter, including the reporter strings as part of the resulting
       list.
     - Perform simple tokenization (whitespace split) on each of the
       non-reporter strings in the list.

    Example:
    >>> tokenize('See Roe v. Wade, 410 U. S. 113 (1973)')
    ['See', 'Roe', 'v.', 'Wade,', '410', 'U. S.', '113', '(1973)']
    """
    # if the text looks likes the corner-case 'digit-REPORTER-digit', splitting
    # by spaces doesn't work
    if re.match(r"\d+\-[A-Za-z]+\-\d+", text):
        return text.split("-")
    # otherwise, we just split on spaces to find words
    strings = REPORTER_RE.split(text)
    # Build the reporter-key set once, outside the loop: O(1) membership per
    # token instead of concatenating both key lists for every string.
    reporter_keys = set(EDITIONS) | set(VARIATIONS_ONLY)
    words = []
    for string in strings:
        if string in reporter_keys:
            words.append(string)
        else:
            # Normalize spaces
            words.extend(_tokenize(string))
    return words
Exemplo n.º 4
0
def get_citations(text,
                  html=True,
                  do_post_citation=True,
                  do_defendant=True,
                  disambiguate=True):
    """Extract citation objects from a document.

    :param text: the document text (HTML when ``html`` is True).
    :param html: when True, strip markup via get_visible_text() first.
    :param do_post_citation: also parse the data that follows a citation.
    :param do_defendant: also parse the defendant name before a citation.
    :param disambiguate: resolve or drop ambiguous reporters at the end.
    :return: list of citation objects; citations with no court that use a
        SCOTUS reporter get court 'scotus'.
    """
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    # Build the reporter lookup once, as a set: O(1) membership tests, no
    # per-iteration list concatenation, and it avoids the Python 3
    # TypeError from adding two dict_keys views.
    reporter_keys = set(EDITIONS) | set(VARIATIONS_ONLY)
    # Stop before the last token: a valid citation needs a page after the
    # reporter.
    # NOTE(review): the original comment also claimed the first token is
    # excluded, but the loop has always started at 0 — behavior kept as-is.
    for i in range(0, len(words) - 1):
        # Find reporter
        if words[i] not in reporter_keys:
            continue
        citation = extract_base_citation(words, i)
        if citation is None:
            # Not a valid citation; continue looking
            continue
        if do_post_citation:
            add_post_citation(citation, words)
        if do_defendant:
            add_defendant(citation, words)
        citations.append(citation)

    if disambiguate:
        # Disambiguate or drop all the reporters
        citations = disambiguate_reporters(citations)

    for citation in citations:
        if not citation.court and is_scotus_reporter(citation):
            citation.court = 'scotus'

    return citations
def tokenize(text):
    """Tokenize text using regular expressions in the following steps:
        - Split the text by the occurrences of patterns which match a federal
          reporter, including the reporter strings as part of the resulting
          list.
        - Perform simple tokenization (whitespace split) on each of the
          non-reporter strings in the list.

       Example:
       >>> tokenize('See Roe v. Wade, 410 U. S. 113 (1973)')
       ['See', 'Roe', 'v.', 'Wade,', '410', 'U.S.', '113', '(1973)']
    """
    # if the text looks likes the corner-case 'digit-REPORTER-digit', splitting
    # by spaces doesn't work
    # Raw string: '\d' in a plain string is an invalid escape sequence
    # (DeprecationWarning on modern Python).
    if re.match(r'\d+\-[A-Za-z]+\-\d+', text):
        return text.split('-')
    # otherwise, we just split on spaces to find words
    strings = REPORTER_RE.split(text)
    # Build the reporter-key set once, outside the loop: O(1) membership per
    # token, and it works on Python 3 where dict.keys() views can't be added.
    reporter_keys = set(EDITIONS) | set(VARIATIONS_ONLY)
    words = []
    for string in strings:
        if string in reporter_keys:
            words.append(string)
        else:
            # Normalize spaces
            words.extend(_tokenize(string))
    return words
Exemplo n.º 6
0
 def test_no_variation_is_same_as_key(self):
     """Are any variations identical to the keys they're supposed to be
     variations of?
     """
     for variation, mapped_keys in VARIATIONS_ONLY.items():
         for mapped in mapped_keys:
             failure_note = (
                 "The variation '%s' is identical to the key it's supposed "
                 "to be a variation of." % variation
             )
             self.assertNotEqual(variation, mapped, failure_note)
Exemplo n.º 7
0
 def test_for_variations_mapping_to_bad_keys(self):
     """Do we have a variation that maps to a key that doesn't exist in the
     first place?
     """
     # REPORTERS doesn't change mid-test, so fetch the valid keys once.
     valid_reporters = REPORTERS.keys()
     for mapped_keys in VARIATIONS_ONLY.values():
         for mapped_key in mapped_keys:
             self.assertIn(
                 EDITIONS[mapped_key],
                 valid_reporters,
                 msg="Could not map variation to a valid reporter: %s" %
                 mapped_key,
             )
Exemplo n.º 8
0
 def test_for_variations_mapping_to_bad_keys(self):
     """Do we have a variation that maps to a key that doesn't exist in the
     first place?
     """
     for key_list in VARIATIONS_ONLY.values():
         for mapped in key_list:
             error_msg = (
                 "Could not map variation to a valid reporter: %s" % mapped
             )
             self.assertIn(EDITIONS[mapped], REPORTERS.keys(), msg=error_msg)
Exemplo n.º 9
0
 def test_no_variation_is_same_as_key(self):
     """Are any variations identical to the keys they're supposed to be
     variations of?
     """
     # Build the failure message once; it only varies by the variation.
     msg_template = ("The variation '%s' is identical to the key it's "
                     "supposed to be a variation of.")
     for variation, keys in VARIATIONS_ONLY.items():
         for key in keys:
             self.assertNotEqual(variation, key, msg_template % variation)
Exemplo n.º 10
0
def tokenize(text):
    """Tokenize text using regular expressions in the following steps:
        - Split the text by the occurrences of patterns which match a federal
          reporter, including the reporter strings as part of the resulting
          list.
        - Perform simple tokenization (whitespace split) on each of the
          non-reporter strings in the list.

       Example:
       >>> tokenize('See Roe v. Wade, 410 U. S. 113 (1973)')
       ['See', 'Roe', 'v.', 'Wade,', '410', 'U.S.', '113', '(1973)']
    """
    strings = REPORTER_RE.split(text)
    # Build the reporter-key set once, outside the loop: O(1) membership per
    # token instead of rebuilding a concatenated key list for every string,
    # and compatible with Python 3 where dict.keys() views can't be added.
    reporter_keys = set(EDITIONS) | set(VARIATIONS_ONLY)
    words = []
    for string in strings:
        if string in reporter_keys:
            words.append(string)
        else:
            # Normalize spaces
            words.extend(_tokenize(string))
    return words
Exemplo n.º 11
0
def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a
    reporter.

    Variations map to lists of one or more result, and we need to figure out
    which is best. Usually, this can be accomplished using the year of the
    item.
    """
    candidates = VARIATIONS_ONLY.get(string)
    if candidates is None:
        # Not a variant
        return string
    # Whether there is one candidate or several, take the first one. The
    # multi-candidate case is a known limitation:
    # TODO: This must be fixed or else all resolutions are resolved the
    # same way --> BAD! Once fixed, it will probably need to be removed
    # from the tokenizer, and moved down the pipeline.
    return candidates[0]
Exemplo n.º 12
0
def tokenize(text):
    """Tokenize text using regular expressions in the following steps:
        - Split the text by the occurrences of patterns which match a federal
          reporter, including the reporter strings as part of the resulting
          list.
        - Perform simple tokenization (whitespace split) on each of the
          non-reporter strings in the list.

       Example:
       >>> tokenize('See Roe v. Wade, 410 U. S. 113 (1973)')
       ['See', 'Roe', 'v.', 'Wade,', '410', 'U.S.', '113', '(1973)']
    """
    strings = REPORTER_RE.split(text)
    # Hoist the reporter-key set out of the loop: O(1) membership per token
    # (the old per-iteration `EDITIONS.keys() + VARIATIONS_ONLY.keys()` is
    # also a TypeError on Python 3).
    reporter_keys = set(EDITIONS) | set(VARIATIONS_ONLY)
    words = []
    for string in strings:
        if string in reporter_keys:
            words.append(string)
        else:
            # Normalize spaces
            words.extend(_tokenize(string))
    return words
def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a
    reporter.

    Variations map to lists of one or more result, and we need to figure out
    which is best. Usually, this can be accomplished using the year of the
    item.
    """
    if string not in VARIATIONS_ONLY:
        # Not a variant; hand it back untouched.
        return string
    candidates = VARIATIONS_ONLY[string]
    # One candidate is the easy, unambiguous case; with several we still
    # just pick the first for now.
    # TODO: This must be fixed or else all resolutions are resolved the
    # same way --> BAD! Once fixed, it will probably need to be removed
    # from the tokenizer, and moved down the pipeline.
    return candidates[0]
Exemplo n.º 14
0
def reporter_or_volume_handler(request, reporter, volume=None):
    """Show all the volumes for a given reporter abbreviation or all the cases
    for a reporter-volume dyad.

    Two things going on here:
    1. We don't know which reporter the user actually wants when they provide
       an ambiguous abbreviation. Just show them all.
    2. We want to also show off that we know all these reporter abbreviations.
    """
    # Resolve the abbreviation to its canonical reporter key; unknown
    # abbreviations 404.
    root_reporter = EDITIONS.get(reporter)
    if not root_reporter:
        return throw_404(request, {
            'no_reporters': True,
            'reporter': reporter,
            'private': True,
        })

    volume_names = [r['name'] for r in REPORTERS[root_reporter]]
    # Reporter names reachable only through variant abbreviations, keyed by
    # name so the template can show each variant alongside its target.
    variation_names = {}
    variation_abbrevs = VARIATIONS_ONLY.get(reporter, [])
    for abbrev in variation_abbrevs:
        for r in REPORTERS[abbrev]:
            if r['name'] not in volume_names:
                variation_names[r['name']] = abbrev

    if volume is None:
        # Show all the volumes for the case
        volumes_in_reporter = list(Citation.objects
                                   .filter(reporter=reporter)
                                   .order_by('reporter', 'volume')
                                   .values_list('volume', flat=True)
                                   .distinct())

        if not volumes_in_reporter:
            return throw_404(request, {
                'no_volumes': True,
                'reporter': reporter,
                'volume_names': volume_names,
                'private': True,
            })

        return render(
            request,
            'volumes_for_reporter.html',
            {
                'reporter': reporter,
                'volume_names': volume_names,
                'volumes': volumes_in_reporter,
                'variation_names': variation_names,
                'private': False,
            },
        )
    else:
        # Show all the cases for a volume-reporter dyad
        cases_in_volume = (OpinionCluster.objects
                           .filter(citations__reporter=reporter,
                                   citations__volume=volume)
                           .order_by('date_filed', 'citations__page'))

        if not cases_in_volume:
            return throw_404(request, {
                'no_cases': True,
                'reporter': reporter,
                'volume_names': volume_names,
                'volume': volume,
                'private': True,
            })

        # 250 cases per page; orphans=5 folds a tiny trailing page into the
        # previous one.
        paginator = Paginator(cases_in_volume, 250, orphans=5)
        page = request.GET.get('page')
        try:
            cases = paginator.page(page)
        except PageNotAnInteger:
            # Non-numeric ?page= falls back to the first page.
            cases = paginator.page(1)
        except EmptyPage:
            # Out-of-range ?page= falls back to the last page.
            cases = paginator.page(paginator.num_pages)

        return render(request, 'volumes_for_reporter.html', {
            'cases': cases,
            'reporter': reporter,
            'variation_names': variation_names,
            'volume': volume,
            'volume_names': volume_names,
            'private': True,
        })
Exemplo n.º 15
0
def get_citations(
    text: str,
    html: bool = True,
    do_post_citation: bool = True,
    do_defendant: bool = True,
    disambiguate: bool = True,
) -> List[Union[NonopinionCitation, Citation]]:
    """Extract all citations (full, short form, id./ibid., supra, and
    section markers) from a document.

    :param text: the document text (HTML when ``html`` is True).
    :param html: when True, strip markup via get_visible_text() first.
    :param do_post_citation: also parse trailing data for full citations.
    :param do_defendant: also parse the defendant name for full citations.
    :param disambiguate: resolve or drop ambiguous reporters at the end.
    :return: citations in document order (the order matters downstream for
        resolving short form, supra, and id. references).
    """
    if html:
        text = get_visible_text(text)
    words = tokenize(text)
    citations: List[Union[Citation, NonopinionCitation]] = []

    # Build the reporter lookup once, as a set: the original concatenated
    # both key lists on every loop iteration and did O(n) membership tests.
    reporter_keys = set(EDITIONS) | set(VARIATIONS_ONLY)

    # Stop before the last token: a citation needs a page after the reporter.
    for i in range(0, len(words) - 1):
        citation_token = words[i]
        citation: Union[Citation, NonopinionCitation, None] = None

        # CASE 1: Citation token is a reporter (e.g., "U. S.").
        # In this case, first try extracting it as a standard, full citation,
        # and if that fails try extracting it as a short form citation.
        if citation_token in reporter_keys:
            citation = extract_full_citation(words, i)
            if citation:
                # CASE 1A: Standard citation found, try to add additional data
                if do_post_citation:
                    add_post_citation(citation, words)
                if do_defendant:
                    add_defendant(citation, words)
            else:
                # CASE 1B: Standard citation not found, so see if this
                # reference to a reporter is a short form citation instead
                citation = extract_shortform_citation(words, i)

                if not citation:
                    # Neither a full nor short form citation
                    continue

        # CASE 2: Citation token is an "Id." or "Ibid." reference.
        # In this case, the citation should simply be to the item cited
        # immediately prior, but for safety we will leave that resolution up
        # to the user.
        elif citation_token.lower() in {"id.", "id.,", "ibid."}:
            citation = extract_id_citation(words, i)

        # CASE 3: Citation token is a "supra" reference.
        # In this case, we're not sure yet what the citation's antecedent is.
        # It could be any of the previous citations above. Thus, like an Id.
        # citation, for safety we won't resolve this reference yet.
        elif strip_punct(citation_token.lower()) == "supra":
            citation = extract_supra_citation(words, i)

        # CASE 4: Citation token is a section marker.
        # In this case, it's likely that this is a reference to a non-
        # opinion document. So we record this marker in order to keep
        # an accurate list of the possible antecedents for id citations.
        elif "§" in citation_token:
            citation = NonopinionCitation(match_token=citation_token)

        # CASE 5: The token is not a citation.
        else:
            continue

        if citation is not None:
            citations.append(citation)

    # Disambiguate each citation's reporter
    if disambiguate:
        citations = disambiguate_reporters(citations)

    citations = remove_address_citations(citations)

    # Set each citation's court property to "scotus" by default
    for citation in citations:
        if (isinstance(citation, Citation) and not citation.court
                and is_scotus_reporter(citation)):
            citation.court = "scotus"

    # Returns a list of citations ordered in the sequence that they appear in
    # the document. The ordering of this list is important for reconstructing
    # the references of the ShortformCitation, SupraCitation, and
    # IdCitation objects.
    return citations
Exemplo n.º 16
0
def disambiguate_reporters(
    citations: List[Union[Citation, NonopinionCitation]]
) -> List[Union[Citation, NonopinionCitation]]:
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:
     - citation.canonical_reporter
     - citation.lookup_index

    And there are a few things that can be ambiguous:
     - More than one variation.
     - More than one reporter for the key.
     - Could be an edition (or not)
     - All combinations of the above:
        - More than one variation.
        - More than one variation, with more than one reporter for the key.
        - More than one variation, with more than one reporter for the key,
          which is an edition.
        - More than one variation, which is an edition
        - ...

    For variants, we just need to sort out the canonical_reporter.

    If it's not possible to disambiguate the reporter, we simply have to drop
    it.
    """
    unambiguous_citations = []
    for citation in citations:
        # Only disambiguate citations with a reporter
        if not isinstance(citation, (FullCitation, ShortformCitation)):
            unambiguous_citations.append(citation)
            continue

        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            if len(REPORTERS[EDITIONS[citation.reporter]]) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue

            # Multiple books under this key, but which is correct?
            if citation.year:
                # attempt resolution by date
                possible_citations = []
                rep_len = len(REPORTERS[EDITIONS[citation.reporter]])
                for i in range(0, rep_len):
                    if is_date_in_reporter(
                            REPORTERS[EDITIONS[citation.reporter]][i]
                        ["editions"],
                            citation.year,
                    ):
                        possible_citations.append((citation.reporter, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit
                    # after filtering by year.
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue
            # NOTE: falling through here (no year, or 0/2+ year matches)
            # drops the citation — nothing appends it below.

        # Try doing a variation of an edition.
        elif VARIATIONS_ONLY.get(citation.reporter) is not None:
            if len(VARIATIONS_ONLY[citation.reporter]) == 1:
                # Only one variation -- great, use it.
                citation.canonical_reporter = EDITIONS[VARIATIONS_ONLY[
                    citation.reporter][0]]
                cached_variation = citation.reporter
                citation.reporter = VARIATIONS_ONLY[citation.reporter][0]
                if len(REPORTERS[citation.canonical_reporter]) == 1:
                    # It's a single reporter under a misspelled key.
                    citation.lookup_index = 0
                    unambiguous_citations.append(citation)
                    continue

                # Multiple reporters under a single misspelled key
                # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
                #                          Washington Reports).
                if citation.year:
                    # attempt resolution by date
                    possible_citations = []
                    rep_can = len(REPORTERS[citation.canonical_reporter])
                    for i in range(0, rep_can):
                        if is_date_in_reporter(
                                REPORTERS[citation.canonical_reporter][i]
                            ["editions"],
                                citation.year,
                        ):
                            possible_citations.append((citation.reporter, i))
                    if len(possible_citations) == 1:
                        # We were able to identify only one hit after
                        # filtering by year.
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
                # Attempt resolution by unique variation
                # (e.g. Cr. can only be Cranch[0])
                possible_citations = []
                reps = REPORTERS[citation.canonical_reporter]
                for i in range(0, len(reps)):
                    for variation in REPORTERS[citation.canonical_reporter][i][
                            "variations"].items():
                        if variation[0] == cached_variation:
                            possible_citations.append((variation[1], i))
                if len(possible_citations) == 1:
                    # We were able to find a single match after filtering
                    # by variation.
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue
            else:
                # Multiple variations, deal with them.
                possible_citations = []
                for reporter_key in VARIATIONS_ONLY[citation.reporter]:
                    for i in range(0, len(REPORTERS[EDITIONS[reporter_key]])):
                        # This inner loop works regardless of the number of
                        # reporters under the key.
                        key = REPORTERS[EDITIONS[reporter_key]]
                        if citation.year:
                            cite_year = citation.year
                            if is_date_in_reporter(key[i]["editions"],
                                                   cite_year):
                                possible_citations.append((reporter_key, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit after filtering by
                    # year.
                    citation.canonical_reporter = EDITIONS[
                        possible_citations[0][0]]
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue

    return unambiguous_citations
Exemplo n.º 17
0
def get_citations(
    text,
    html=True,
    do_post_citation=True,
    do_defendant=True,
    disambiguate=True,
):
    """Extract all citations (full, short form, id./ibid., supra, and
    section markers) from a document, in document order.

    :param text: the document text (HTML when ``html`` is True).
    :param html: when True, strip markup via get_visible_text() first.
    :param do_post_citation: also parse trailing data for full citations.
    :param do_defendant: also parse the defendant name for full citations.
    :param disambiguate: resolve or drop ambiguous reporters at the end.
    """
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []

    # Build the reporter lookup once, as a set: O(1) membership tests, no
    # per-iteration list concatenation, and it avoids the Python 3
    # TypeError from adding two dict_keys views. range() (not xrange())
    # likewise keeps this runnable on Python 3.
    reporter_keys = set(EDITIONS) | set(VARIATIONS_ONLY)

    for i in range(0, len(words) - 1):
        citation_token = words[i]

        # CASE 1: Citation token is a reporter (e.g., "U. S.").
        # In this case, first try extracting it as a standard, full citation,
        # and if that fails try extracting it as a short form citation.
        if citation_token in reporter_keys:
            citation = extract_full_citation(words, i)
            if citation:
                # CASE 1A: Standard citation found, try to add additional data
                if do_post_citation:
                    add_post_citation(citation, words)
                if do_defendant:
                    add_defendant(citation, words)
            else:
                # CASE 1B: Standard citation not found, so see if this
                # reference to a reporter is a short form citation instead
                citation = extract_shortform_citation(words, i)

                if not citation:
                    # Neither a full nor short form citation
                    continue

        # CASE 2: Citation token is an "Id." or "Ibid." reference.
        # In this case, the citation is simply to the immediately previous
        # document, but for safety we won't make that resolution until the
        # previous citation has been successfully matched to an opinion.
        elif citation_token.lower() in {"id.", "id.,", "ibid."}:
            citation = extract_id_citation(words, i)

        # CASE 3: Citation token is a "supra" reference.
        # In this case, we're not sure yet what the citation's antecedent is.
        # It could be any of the previous citations above. Thus, like an Id.
        # citation, we won't be able to resolve this reference until the
        # previous citations are actually matched to opinions.
        elif strip_punct(citation_token.lower()) == "supra":
            citation = extract_supra_citation(words, i)

        # CASE 4: Citation token is a section marker.
        # In this case, it's likely that this is a reference to a non-
        # opinion document. So we record this marker in order to keep
        # an accurate list of the possible antecedents for id citations.
        elif u"§" in citation_token:
            citation = NonopinionCitation(match_token=citation_token)

        # CASE 5: The token is not a citation.
        else:
            continue

        citations.append(citation)

    # Disambiguate each citation's reporter
    if disambiguate:
        citations = disambiguate_reporters(citations)

    citations = remove_address_citations(citations)

    # Set each citation's court property to "scotus" by default
    for citation in citations:
        if (isinstance(citation, Citation) and not citation.court
                and is_scotus_reporter(citation)):
            citation.court = "scotus"

    # Returns a list of citations ordered in the sequence that they appear in
    # the document. The ordering of this list is important because we will
    # later rely on that order to reconstruct the references of the
    # ShortformCitation, SupraCitation, and IdCitation objects.
    return citations
Exemplo n.º 18
0
#!/usr/bin/env python
# encoding: utf-8

# Loosely adapted from the Natural Language Toolkit: Tokenizers
# URL: <http://nltk.sourceforge.net>

import re

from reporters_db import EDITIONS, VARIATIONS_ONLY

# We need to build a REGEX that has all the variations and the reporters in
# order from longest to shortest.
REGEX_LIST = list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys())
# Longest first so longer abbreviations win over their prefixes in the
# alternation below.
REGEX_LIST.sort(key=len, reverse=True)
# re.escape each abbreviation: they contain regex metacharacters like '.'.
REGEX_STR = "|".join(map(re.escape, REGEX_LIST))
# A reporter must be bounded by start-of-string/whitespace on the left and
# whitespace or a comma on the right.
REPORTER_RE = re.compile(r"(^|\s)(%s)(\s|,)" % REGEX_STR)


def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a
    reporter.

    Variations map to lists of one or more result, and we need to figure out
    which is best. Usually, this can be accomplished using the year of the
    item.
    """
    if string in VARIATIONS_ONLY.keys():
        if len(VARIATIONS_ONLY[string]) == 1:
            # Simple case
            return VARIATIONS_ONLY[string][0]
        else:
Exemplo n.º 19
0
def reporter_or_volume_handler(request, reporter, volume=None):
    """Show all the volumes for a given reporter abbreviation or all the cases
    for a reporter-volume dyad.

    Two things going on here:
    1. We don't know which reporter the user actually wants when they provide
       an ambiguous abbreviation. Just show them all.
    2. We want to also show off that we know all these reporter abbreviations.
    """
    # Resolve the abbreviation to its canonical reporter key; unknown
    # abbreviations 404.
    root_reporter = EDITIONS.get(reporter)
    if not root_reporter:
        return throw_404(
            request,
            {
                "no_reporters": True,
                "reporter": reporter,
                "private": True,
            },
        )

    volume_names = [r["name"] for r in REPORTERS[root_reporter]]
    # Reporter names reachable only through variant abbreviations, keyed by
    # name so the template can show each variant alongside its target.
    variation_names = {}
    variation_abbrevs = VARIATIONS_ONLY.get(reporter, [])
    for abbrev in variation_abbrevs:
        for r in REPORTERS[abbrev]:
            if r["name"] not in volume_names:
                variation_names[r["name"]] = abbrev

    if volume is None:
        # Show all the volumes for the case
        volumes_in_reporter = list(
            Citation.objects.filter(reporter=reporter).order_by(
                "reporter", "volume").values_list("volume",
                                                  flat=True).distinct())

        if not volumes_in_reporter:
            return throw_404(
                request,
                {
                    "no_volumes": True,
                    "reporter": reporter,
                    "volume_names": volume_names,
                    "private": True,
                },
            )

        return render(
            request,
            "volumes_for_reporter.html",
            {
                "reporter": reporter,
                "volume_names": volume_names,
                "volumes": volumes_in_reporter,
                "variation_names": variation_names,
                "private": False,
            },
        )
    else:
        # Show all the cases for a volume-reporter dyad
        cases_in_volume = OpinionCluster.objects.filter(
            citations__reporter=reporter,
            citations__volume=volume).order_by("date_filed", "citations__page")

        if not cases_in_volume:
            return throw_404(
                request,
                {
                    "no_cases": True,
                    "reporter": reporter,
                    "volume_names": volume_names,
                    "volume": volume,
                    "private": True,
                },
            )

        # 250 cases per page; orphans=5 folds a tiny trailing page into the
        # previous one.
        paginator = Paginator(cases_in_volume, 250, orphans=5)
        page = request.GET.get("page")
        try:
            cases = paginator.page(page)
        except PageNotAnInteger:
            # Non-numeric ?page= falls back to the first page.
            cases = paginator.page(1)
        except EmptyPage:
            # Out-of-range ?page= falls back to the last page.
            cases = paginator.page(paginator.num_pages)

        return render(
            request,
            "volumes_for_reporter.html",
            {
                "cases": cases,
                "reporter": reporter,
                "variation_names": variation_names,
                "volume": volume,
                "volume_names": volume_names,
                "private": True,
            },
        )
#!/usr/bin/env python
# encoding: utf-8

# Loosely adapted from the Natural Language Toolkit: Tokenizers
# URL: <http://nltk.sourceforge.net>

import re

from reporters_db import EDITIONS, VARIATIONS_ONLY

# We need to build a REGEX that has all the variations and the reporters in
# order from longest to shortest, so that longer abbreviations (e.g.
# "F. Supp. 2d") are tried before any shorter prefix of them ("F. Supp.").
# list() is required: in Python 3, dict.keys() returns a view, which cannot
# be concatenated with "+" and has no .sort() method.
REGEX_LIST = list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys())
REGEX_LIST.sort(key=len, reverse=True)
REGEX_STR = '|'.join(map(re.escape, REGEX_LIST))
# Match a reporter abbreviation preceded by start-of-string or whitespace and
# followed by whitespace. Raw string avoids invalid-escape warnings for \s.
REPORTER_RE = re.compile(r"(^|\s)(%s)\s" % REGEX_STR)


def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a
    reporter.

    Variations map to lists of one or more result, and we need to figure out
    which is best. Usually, this can be accomplished using the year of the
    item.
    """
    if string in VARIATIONS_ONLY.keys():
        if len(VARIATIONS_ONLY[string]) == 1:
            # Simple case
            return VARIATIONS_ONLY[string][0]
        else:
# Exemplo n.º 21
# 0
def disambiguate_reporters(citations):
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:
     - citation.canonical_reporter
     - citation.lookup_index

    And there are a few things that can be ambiguous:
     - More than one variation.
     - More than one reporter for the key.
     - Could be an edition (or not)
     - All combinations of the above:
        - More than one variation.
        - More than one variation, with more than one reporter for the key.
        - More than one variation, with more than one reporter for the key,
          which is an edition.
        - More than one variation, which is an edition
        - ...

    For variants, we just need to sort out the canonical_reporter.

    If it's not possible to disambiguate the reporter, we simply have to drop
    it.

    NOTE(review): citation objects are mutated in place (``reporter``,
    ``canonical_reporter`` and ``lookup_index`` may be assigned); the return
    value is a new list holding only the citations that could be
    disambiguated — ambiguous ones are silently dropped.
    """
    unambiguous_citations = []
    for citation in citations:
        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        # EDITIONS maps an edition abbreviation to its canonical reporter key;
        # a hit in REPORTERS means this spelling is already a real edition.
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            if len(REPORTERS[EDITIONS[citation.reporter]]) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue
            else:
                # Multiple books under this key, but which is correct?
                if citation.year:
                    # attempt resolution by date
                    possible_citations = []
                    for i in range(0, len(REPORTERS[EDITIONS[citation.reporter]])):
                        if is_date_in_reporter(REPORTERS[EDITIONS[citation.reporter]][i]["editions"], citation.year):
                            possible_citations.append((citation.reporter, i))
                    if len(possible_citations) == 1:
                        # We were able to identify only one hit after filtering by year.
                        citation.reporter = possible_citations[0][0]
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
                # NOTE(review): if year filtering yields 0 or >1 hits, the
                # citation falls through and is dropped.

        # Try doing a variation of an edition.
        elif VARIATIONS_ONLY.get(citation.reporter) is not None:
            if len(VARIATIONS_ONLY[citation.reporter]) == 1:
                # Only one variation -- great, use it.
                citation.canonical_reporter = EDITIONS[VARIATIONS_ONLY[citation.reporter][0]]
                # Remember the original (variant) spelling before rewriting
                # citation.reporter to the canonical edition name.
                cached_variation = citation.reporter
                citation.reporter = VARIATIONS_ONLY[citation.reporter][0]
                if len(REPORTERS[citation.canonical_reporter]) == 1:
                    # It's a single reporter under a misspelled key.
                    citation.lookup_index = 0
                    unambiguous_citations.append(citation)
                    continue
                else:
                    # Multiple reporters under a single misspelled key
                    # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
                    #                          Washington Reports).
                    if citation.year:
                        # attempt resolution by date
                        possible_citations = []
                        for i in range(0, len(REPORTERS[citation.canonical_reporter])):
                            if is_date_in_reporter(
                                REPORTERS[citation.canonical_reporter][i]["editions"], citation.year
                            ):
                                possible_citations.append((citation.reporter, i))
                        if len(possible_citations) == 1:
                            # We were able to identify only one hit after filtering by year.
                            citation.lookup_index = possible_citations[0][1]
                            unambiguous_citations.append(citation)
                            continue
                    # Attempt resolution by unique variation (e.g. Cr. can only be Cranch[0])
                    possible_citations = []
                    for i in range(0, len(REPORTERS[citation.canonical_reporter])):
                        for variation in REPORTERS[citation.canonical_reporter][i]["variations"].items():
                            if variation[0] == cached_variation:
                                possible_citations.append((variation[1], i))
                    if len(possible_citations) == 1:
                        # We were able to find a single match after filtering by variation.
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
            else:
                # Multiple variations, deal with them.
                # Here resolution is by date only; NOTE(review):
                # is_date_in_reporter is called even when citation.year is
                # falsy -- presumably it tolerates that; confirm.
                possible_citations = []
                for reporter_key in VARIATIONS_ONLY[citation.reporter]:
                    for i in range(0, len(REPORTERS[EDITIONS[reporter_key]])):
                        # This inner loop works regardless of the number of reporters under the key.
                        if is_date_in_reporter(REPORTERS[EDITIONS[reporter_key]][i]["editions"], citation.year):
                            possible_citations.append((reporter_key, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit after filtering by year.
                    citation.canonical_reporter = EDITIONS[possible_citations[0][0]]
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue

    return unambiguous_citations
#!/usr/bin/env python
# encoding: utf-8

# Loosely adapted from the Natural Language Toolkit: Tokenizers
# URL: <http://nltk.sourceforge.net>

import re

from reporters_db import EDITIONS, VARIATIONS_ONLY

# We need to build a REGEX that has all the variations and the reporters in
# order from longest to shortest, so that longer abbreviations (e.g.
# "F. Supp. 2d") are tried before any shorter prefix of them ("F. Supp.").
# list() is required: in Python 3, dict.keys() returns a view, which cannot
# be concatenated with "+" and has no .sort() method.
REGEX_LIST = list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys())
REGEX_LIST.sort(key=len, reverse=True)
REGEX_STR = '|'.join(map(re.escape, REGEX_LIST))
# Match a reporter abbreviation surrounded by whitespace. Raw string avoids
# invalid-escape warnings for \s on modern Python.
REPORTER_RE = re.compile(r"\s(%s)\s" % REGEX_STR)


def normalize_variation(string):
    """Gets the best possible canonicalization of a variant spelling of a
    reporter.

    Variations map to lists of one or more result, and we need to figure out
    which is best. Usually, this can be accomplished using the year of the
    item.
    """
    if string in VARIATIONS_ONLY.keys():
        if len(VARIATIONS_ONLY[string]) == 1:
            # Simple case
            return VARIATIONS_ONLY[string][0]
        else: