示例#1
0
def disambiguate_reporters(
    citations: List[Union[Citation, NonopinionCitation]]
) -> List[Union[Citation, NonopinionCitation]]:
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:
     - citation.canonical_reporter
     - citation.lookup_index

    And there are a few things that can be ambiguous:
     - More than one variation.
     - More than one reporter for the key.
     - Could be an edition (or not)
     - All combinations of the above:
        - More than one variation.
        - More than one variation, with more than one reporter for the key.
        - More than one variation, with more than one reporter for the key,
          which is an edition.
        - More than one variation, which is an edition
        - ...

    For variants, we just need to sort out the canonical_reporter.

    If it's not possible to disambiguate the reporter, we simply have to drop
    it.
    """
    unambiguous_citations = []
    for citation in citations:
        # Only disambiguate citations with a reporter
        if not isinstance(citation, (FullCitation, ShortformCitation)):
            unambiguous_citations.append(citation)
            continue

        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            if len(REPORTERS[EDITIONS[citation.reporter]]) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue

            # Multiple books under this key, but which is correct?
            if citation.year:
                # attempt resolution by date
                possible_citations = []
                rep_len = len(REPORTERS[EDITIONS[citation.reporter]])
                for i in range(0, rep_len):
                    if is_date_in_reporter(
                            REPORTERS[EDITIONS[citation.reporter]][i]
                        ["editions"],
                            citation.year,
                    ):
                        possible_citations.append((citation.reporter, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit
                    # after filtering by year.
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue

        # Try doing a variation of an edition.
        elif VARIATIONS_ONLY.get(citation.reporter) is not None:
            if len(VARIATIONS_ONLY[citation.reporter]) == 1:
                # Only one variation -- great, use it.
                citation.canonical_reporter = EDITIONS[VARIATIONS_ONLY[
                    citation.reporter][0]]
                cached_variation = citation.reporter
                citation.reporter = VARIATIONS_ONLY[citation.reporter][0]
                if len(REPORTERS[citation.canonical_reporter]) == 1:
                    # It's a single reporter under a misspelled key.
                    citation.lookup_index = 0
                    unambiguous_citations.append(citation)
                    continue

                # Multiple reporters under a single misspelled key
                # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
                #                          Washington Reports).
                if citation.year:
                    # attempt resolution by date
                    possible_citations = []
                    rep_can = len(REPORTERS[citation.canonical_reporter])
                    for i in range(0, rep_can):
                        if is_date_in_reporter(
                                REPORTERS[citation.canonical_reporter][i]
                            ["editions"],
                                citation.year,
                        ):
                            possible_citations.append((citation.reporter, i))
                    if len(possible_citations) == 1:
                        # We were able to identify only one hit after
                        # filtering by year.
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
                # Attempt resolution by unique variation
                # (e.g. Cr. can only be Cranch[0])
                possible_citations = []
                reps = REPORTERS[citation.canonical_reporter]
                for i in range(0, len(reps)):
                    for variation in REPORTERS[citation.canonical_reporter][i][
                            "variations"].items():
                        if variation[0] == cached_variation:
                            possible_citations.append((variation[1], i))
                if len(possible_citations) == 1:
                    # We were able to find a single match after filtering
                    # by variation.
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue
            else:
                # Multiple variations, deal with them.
                possible_citations = []
                for reporter_key in VARIATIONS_ONLY[citation.reporter]:
                    for i in range(0, len(REPORTERS[EDITIONS[reporter_key]])):
                        # This inner loop works regardless of the number of
                        # reporters under the key.
                        key = REPORTERS[EDITIONS[reporter_key]]
                        if citation.year:
                            cite_year = citation.year
                            if is_date_in_reporter(key[i]["editions"],
                                                   cite_year):
                                possible_citations.append((reporter_key, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit after filtering by
                    # year.
                    citation.canonical_reporter = EDITIONS[
                        possible_citations[0][0]]
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue

    return unambiguous_citations
示例#2
0
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    reporter_keys = [citation.reporter for citation in citations]
    range_dates = []
    for reporter_key in reporter_keys:
        for reporter in REPORTERS.get(EDITIONS.get(reporter_key)):
            try:
                range_dates.extend(reporter['editions'][reporter_key])
            except KeyError:
                # Fails when a reporter_key points to more than one reporter, one of which doesn't have the edition
                # queried. For example, Wash. 2d isn't in REPORTERS['Wash.']['editions'][0].
                pass
    if range_dates:
        start, end = min(range_dates) - timedelta(
            weeks=(20 * 52)), max(range_dates) + timedelta(weeks=20 * 52)
        if end > now():
            end = now()

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method='text', encoding='unicode')
        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in the date parser. Consequently, we purge
        # the word at, and anything after it.
        text = re.sub(' at .*', '', text)

        # The parser recognizes numbers like 121118 as a date. This corpus does not have dates in that format.
        text = re.sub('\d{5,}', '', text)

        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace('Sept.', 'Sep.')

        # The parser recognizes dates like December 3, 4, 1908 as 2004-12-3 19:08.
        re_match = re.search('\d{1,2}, \d{1,2}, \d{4}', text)
        if re_match:
            # These are always date argued, thus continue.
            continue

        # The parser recognizes dates like October 12-13, 1948 as 2013-10-12, not as 1948-10-12
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        re_match = re.search('\d{1,2}-\d{1,2}, \d{4}', text)
        if re_match:
            # These are always date argued, thus continue.
            continue

        # Sometimes there's a string like: "Review Denied July 26, 2006. Skip this.
        if 'denied' in text.lower():
            continue

        try:
            if range_dates:
                found = parse_dates.parse_dates(text,
                                                sane_start=start,
                                                sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # If it has unicode is crashes dateutil's parser, but is unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    scotus_dates_found = []
    if not dates and court == 'scotus':
        for citation in citations:
            try:
                # Scotus dates are in the form of a list, since a single citation can refer to several dates.
                found = scotus_dates["%s %s %s" %
                                     (citation.volume, citation.reporter,
                                      citation.page)]
                if len(found) == 1:
                    scotus_dates_found.extend(found)
            except KeyError:
                pass
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all of them.
        years = set([citation.year for citation in citations if citation.year])
        if len(years) == 1:
            dates.append(datetime.datetime(list(years)[0], 1, 1))

    if not dates:
        try:
            dates = fixes[case_path]['dates']
        except KeyError:
            if 'input_dates' in DEBUG:
                #subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                print '  No date found for: file://%s' % case_path
                input_date = raw_input('  What should be here (YYYY-MM-DD)? ')
                add_fix(case_path, {
                    'dates':
                    [datetime.datetime.strptime(input_date, '%Y-%m-%d')]
                })
                dates = [datetime.datetime.strptime(input_date, '%Y-%m-%d')]
            if 'log_bad_dates' in DEBUG:
                # Write the failed case out to file.
                with open('missing_dates.txt', 'a') as out:
                    out.write('%s\n' % case_path)

    if dates:
        if 'date' in DEBUG:
            log_print("  Using date: %s of dates found: %s" %
                      (max(dates), dates))
        return max(dates)
    else:
        if 'date' in DEBUG:
            log_print("  No dates found")
        return []
示例#3
0
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    reporter_keys = [citation.reporter for citation in citations]
    range_dates = []
    for reporter_key in reporter_keys:
        for reporter in REPORTERS.get(EDITIONS.get(reporter_key)):
            try:
                range_dates.extend(reporter["editions"][reporter_key])
            except KeyError:
                # Fails when a reporter_key points to more than one reporter, one of which doesn't have the edition
                # queried. For example, Wash. 2d isn't in REPORTERS['Wash.']['editions'][0].
                pass
    if range_dates:
        start, end = min(range_dates) - timedelta(weeks=(20 * 52)), max(range_dates) + timedelta(weeks=20 * 52)
        if end > now():
            end = now()

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method="text", encoding="unicode")
        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in the date parser. Consequently, we purge
        # the word at, and anything after it.
        text = re.sub(" at .*", "", text)

        # The parser recognizes numbers like 121118 as a date. This corpus does not have dates in that format.
        text = re.sub("\d{5,}", "", text)

        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace("Sept.", "Sep.")

        # The parser recognizes dates like December 3, 4, 1908 as 2004-12-3 19:08.
        re_match = re.search("\d{1,2}, \d{1,2}, \d{4}", text)
        if re_match:
            # These are always date argued, thus continue.
            continue

        # The parser recognizes dates like October 12-13, 1948 as 2013-10-12, not as 1948-10-12
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        re_match = re.search("\d{1,2}-\d{1,2}, \d{4}", text)
        if re_match:
            # These are always date argued, thus continue.
            continue

        # Sometimes there's a string like: "Review Denied July 26, 2006. Skip this.
        if "denied" in text.lower():
            continue

        try:
            if range_dates:
                found = parse_dates.parse_dates(text, sane_start=start, sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # If it has unicode is crashes dateutil's parser, but is unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    scotus_dates_found = []
    if not dates and court == "scotus":
        for citation in citations:
            try:
                # Scotus dates are in the form of a list, since a single citation can refer to several dates.
                found = scotus_dates["%s %s %s" % (citation.volume, citation.reporter, citation.page)]
                if len(found) == 1:
                    scotus_dates_found.extend(found)
            except KeyError:
                pass
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all of them.
        years = set([citation.year for citation in citations if citation.year])
        if len(years) == 1:
            dates.append(datetime.datetime(list(years)[0], 1, 1))

    if not dates:
        try:
            dates = fixes[case_path]["dates"]
        except KeyError:
            if "input_dates" in DEBUG:
                # subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                print "  No date found for: file://%s" % case_path
                input_date = raw_input("  What should be here (YYYY-MM-DD)? ")
                add_fix(case_path, {"dates": [datetime.datetime.strptime(input_date, "%Y-%m-%d")]})
                dates = [datetime.datetime.strptime(input_date, "%Y-%m-%d")]
            if "log_bad_dates" in DEBUG:
                # Write the failed case out to file.
                with open("missing_dates.txt", "a") as out:
                    out.write("%s\n" % case_path)

    if dates:
        if "date" in DEBUG:
            log_print("  Using date: %s of dates found: %s" % (max(dates), dates))
        return max(dates)
    else:
        if "date" in DEBUG:
            log_print("  No dates found")
        return []
示例#4
0
def disambiguate_reporters(citations):
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:
     - citation.canonical_reporter
     - citation.lookup_index

    And there are a few things that can be ambiguous:
     - More than one variation.
     - More than one reporter for the key.
     - Could be an edition (or not)
     - All combinations of the above:
        - More than one variation.
        - More than one variation, with more than one reporter for the key.
        - More than one variation, with more than one reporter for the key,
          which is an edition.
        - More than one variation, which is an edition
        - ...

    For variants, we just need to sort out the canonical_reporter.

    If it's not possible to disambiguate the reporter, we simply have to drop
    it.
    """
    unambiguous_citations = []
    for citation in citations:
        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            if len(REPORTERS[EDITIONS[citation.reporter]]) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue
            else:
                # Multiple books under this key, but which is correct?
                if citation.year:
                    # attempt resolution by date
                    possible_citations = []
                    for i in range(0, len(REPORTERS[EDITIONS[citation.reporter]])):
                        if is_date_in_reporter(REPORTERS[EDITIONS[citation.reporter]][i]["editions"], citation.year):
                            possible_citations.append((citation.reporter, i))
                    if len(possible_citations) == 1:
                        # We were able to identify only one hit after filtering by year.
                        citation.reporter = possible_citations[0][0]
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue

        # Try doing a variation of an edition.
        elif VARIATIONS_ONLY.get(citation.reporter) is not None:
            if len(VARIATIONS_ONLY[citation.reporter]) == 1:
                # Only one variation -- great, use it.
                citation.canonical_reporter = EDITIONS[VARIATIONS_ONLY[citation.reporter][0]]
                cached_variation = citation.reporter
                citation.reporter = VARIATIONS_ONLY[citation.reporter][0]
                if len(REPORTERS[citation.canonical_reporter]) == 1:
                    # It's a single reporter under a misspelled key.
                    citation.lookup_index = 0
                    unambiguous_citations.append(citation)
                    continue
                else:
                    # Multiple reporters under a single misspelled key
                    # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
                    #                          Washington Reports).
                    if citation.year:
                        # attempt resolution by date
                        possible_citations = []
                        for i in range(0, len(REPORTERS[citation.canonical_reporter])):
                            if is_date_in_reporter(
                                REPORTERS[citation.canonical_reporter][i]["editions"], citation.year
                            ):
                                possible_citations.append((citation.reporter, i))
                        if len(possible_citations) == 1:
                            # We were able to identify only one hit after filtering by year.
                            citation.lookup_index = possible_citations[0][1]
                            unambiguous_citations.append(citation)
                            continue
                    # Attempt resolution by unique variation (e.g. Cr. can only be Cranch[0])
                    possible_citations = []
                    for i in range(0, len(REPORTERS[citation.canonical_reporter])):
                        for variation in REPORTERS[citation.canonical_reporter][i]["variations"].items():
                            if variation[0] == cached_variation:
                                possible_citations.append((variation[1], i))
                    if len(possible_citations) == 1:
                        # We were able to find a single match after filtering by variation.
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
            else:
                # Multiple variations, deal with them.
                possible_citations = []
                for reporter_key in VARIATIONS_ONLY[citation.reporter]:
                    for i in range(0, len(REPORTERS[EDITIONS[reporter_key]])):
                        # This inner loop works regardless of the number of reporters under the key.
                        if is_date_in_reporter(REPORTERS[EDITIONS[reporter_key]][i]["editions"], citation.year):
                            possible_citations.append((reporter_key, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit after filtering by year.
                    citation.canonical_reporter = EDITIONS[possible_citations[0][0]]
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue

    return unambiguous_citations