示例#1
0
 def test_that_all_dates_are_converted_to_dates_not_strings(self):
     """Do we properly make the ISO-8601 date strings into Python dates?

     Every edition's 'start' and 'end' value must be either a
     datetime.datetime instance or None — never a raw string.
     """
     # .items() instead of the Python-2-only .iteritems(); it works on
     # both Python 2 and 3 and matches the other tests in this suite.
     for reporter_name, reporter_list in REPORTERS.items():
         # reporter_name == "A."
         # reporter_list == [
         #     {'name': 'Atlantic Reporter', 'editions': ...},
         #     {'name': "Aldo's Reporter", 'editions': ...},
         # ]
         for reporter_dict in reporter_list:
             # reporter_dict == {'name': 'Atlantic Reporter', ...}
             for e_name, e_dates in reporter_dict['editions'].items():
                 # e_name == "A. 2d"
                 # e_dates == {"start": <datetime or None>,
                 #             "end": <datetime or None>}
                 for key in ('start', 'end'):
                     value = e_dates[key]
                     is_date_or_none = (
                         value is None or
                         isinstance(value, datetime.datetime)
                     )
                     self.assertTrue(
                         is_date_or_none,
                         msg=("%s dates in the reporter '%s' appear to be "
                              "coming through as '%s'" %
                              (key, e_name, type(value)))
                     )
示例#2
0
    def test_all_required_keys_no_extra_keys(self):
        """Check that every reporter entry carries all required fields,
        no unknown fields, and no empty string values.
        """
        required_fields = [
            'cite_type', 'editions', 'mlz_jurisdiction', 'name', 'variations'
        ]
        optional_fields = ['publisher', 'notes', 'href']
        all_fields = required_fields + optional_fields
        for reporter_abbv, reporter_list in REPORTERS.items():
            for reporter_data in reporter_list:

                # Every required field must be present.
                for field in required_fields:
                    if field not in reporter_data:
                        self.fail("Reporter '%s' lacks required field '%s'" %
                                  (reporter_abbv, field))

                # Only known fields may appear.
                for field in reporter_data:
                    self.assertIn(
                        field, all_fields,
                        "Reporter '%s' has an unknown field '%s'" %
                        (reporter_abbv, field))

                # String-valued fields must not be empty.
                for field, value in reporter_data.items():
                    if isinstance(value, str):
                        self.assertTrue(
                            value != "",
                            msg="Field '%s' is empty in reporter '%s'" %
                            (field, reporter_abbv))
示例#3
0
    def test_all_required_keys_no_extra_keys(self):
        """Verify each reporter entry has every required field and
        nothing beyond the known optional fields.
        """
        required_fields = ['cite_type', 'editions', 'mlz_jurisdiction', 'name',
                           'variations']
        optional_fields = ['publisher', 'notes', 'href']
        all_fields = required_fields + optional_fields
        for reporter_abbv, reporter_list in REPORTERS.items():
            for reporter_data in reporter_list:

                # Missing required fields trigger an explicit failure.
                missing = [f for f in required_fields
                           if f not in reporter_data]
                for f in missing:
                    self.fail("Reporter '%s' lacks required field '%s'" % (
                        reporter_abbv, f
                    ))

                # Unknown fields are not allowed.
                for key in reporter_data:
                    self.assertIn(
                        key,
                        all_fields,
                        "Reporter '%s' has an unknown field '%s'" % (
                            reporter_abbv, key
                        )
                    )
示例#4
0
 def test_all_reporters_have_valid_cite_type(self):
     """Do all reporters have valid cite_type values?"""
     for reporter_abbv, reporter_list in REPORTERS.items():
         for reporter_data in reporter_list:
             # Pull the value and the message out for readability.
             cite_type = reporter_data['cite_type']
             failure = "%s did not have a valid cite_type value" % reporter_abbv
             self.assertIn(cite_type, VALID_CITE_TYPES, failure)
示例#5
0
 def test_all_reporters_have_valid_cite_type(self):
     """Every reporter's cite_type must be one of VALID_CITE_TYPES."""
     for abbv, entries in REPORTERS.items():
         for entry in entries:
             self.assertIn(
                 entry['cite_type'],
                 VALID_CITE_TYPES,
                 "%s did not have a valid cite_type value" % abbv,
             )
示例#6
0
 def test_any_keys_missing_editions(self):
     """Have we added any new reporters that lack a matching edition?"""
     for r_name, r_items in REPORTERS.items():
         for item in r_items:
             # The reporter key itself must appear among its editions.
             editions = item['editions']
             self.assertIn(
                 r_name, editions,
                 msg="Could not find edition for key: %s" % r_name
             )
示例#7
0
 def test_any_keys_missing_editions(self):
     """Each reporter key must have a matching edition entry."""
     for name, reporter_entries in REPORTERS.items():
         for entry in reporter_entries:
             # Build the message first so the assertion reads cleanly.
             missing_msg = "Could not find edition for key: %s" % name
             self.assertIn(name, entry['editions'], msg=missing_msg)
示例#8
0
 def test_for_variations_mapping_to_bad_keys(self):
     """Do we have a variation that maps to a key that doesn't exist in the
     first place?

     Each variation's target (looked up through EDITIONS) must be a key
     of REPORTERS.
     """
     for variations in VARIATIONS_ONLY.values():
         for variation in variations:
             # Test membership against the dict itself rather than
             # REPORTERS.keys(): identical semantics, but a hash lookup
             # instead of scanning a key list on Python 2.
             self.assertIn(
                 EDITIONS[variation], REPORTERS,
                 msg="Could not map variation to a valid reporter: %s" %
                     variation
             )
示例#9
0
 def test_for_variations_mapping_to_bad_keys(self):
     """Do we have a variation that maps to a key that doesn't exist in the
     first place?

     Each variation's target (looked up through EDITIONS) must be a key
     of REPORTERS.
     """
     for variations in VARIATIONS_ONLY.values():
         for variation in variations:
             # Membership against the dict itself, not REPORTERS.keys():
             # same semantics, but a hash lookup instead of scanning a
             # key list on Python 2.
             self.assertIn(
                 EDITIONS[variation],
                 REPORTERS,
                 msg="Could not map variation to a valid reporter: %s" %
                 variation)
示例#10
0
 def test_nothing_ends_before_it_starts(self):
     """Do any editions have end dates before their start dates?"""
     for reporter_dicts in REPORTERS.values():
         # Each value is a list of reporter dictionaries.
         for reporter in reporter_dicts:
             # Each edition maps its name to a dict of dates.
             for name, edition in reporter['editions'].items():
                 start, end = edition['start'], edition['end']
                 # Editions with an open start or end are skipped.
                 if not (start and end):
                     continue
                 self.assertLessEqual(
                     start, end,
                     msg="It appears that edition %s ends before it "
                     "starts." % name)
示例#11
0
 def test_nothing_ends_before_it_starts(self):
     """No edition may have an end date earlier than its start date."""
     for entries in REPORTERS.values():
         for entry in entries:
             for key, dates in entry['editions'].items():
                 # Only compare when both endpoints are set.
                 if dates['start'] and dates['end']:
                     failure = ("It appears that edition %s ends before it "
                                "starts." % key)
                     self.assertLessEqual(dates['start'], dates['end'],
                                          msg=failure)
示例#12
0
def make_csv():
    """Write every reporter in REPORTERS to reporters.csv.

    One row is emitted per reporter dict, keyed by its citation
    abbreviation, with list/dict fields flattened to comma-joined
    strings.
    """
    # newline='' lets csv.DictWriter control line endings itself,
    # avoiding spurious blank rows on Windows (per the csv module docs).
    with open('reporters.csv', 'w', newline='') as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()

        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter['editions'])
                d['citation'] = cite
                d['name'] = reporter['name']
                # Optional fields default to the empty string.
                d['publisher'] = reporter.get('publisher', '')
                d['cite_type'] = reporter['cite_type']
                d['mlz_jurisdictions'] = ", ".join(reporter['mlz_jurisdiction'])
                d['variations'] = ", ".join(reporter['variations'].keys())
                d['href'] = reporter.get('href', '')
                d['notes'] = reporter.get('notes', '')

                out.writerow(d)
示例#13
0
File: make_csv.py  Project: limc/project
def make_csv():
    """Write every reporter in REPORTERS to reporters.csv.

    One row is emitted per reporter dict, keyed by its citation
    abbreviation, with list/dict fields flattened to comma-joined
    strings.
    """
    # newline='' lets csv.DictWriter control line endings itself,
    # avoiding spurious blank rows on Windows (per the csv module docs).
    with open('reporters.csv', 'w', newline='') as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()

        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter['editions'])
                d['citation'] = cite
                d['name'] = reporter['name']
                # Optional fields default to the empty string.
                d['publisher'] = reporter.get('publisher', '')
                d['cite_type'] = reporter['cite_type']
                d['mlz_jurisdictions'] = ", ".join(
                    reporter['mlz_jurisdiction'])
                d['variations'] = ", ".join(reporter['variations'].keys())
                d['href'] = reporter.get('href', '')
                d['notes'] = reporter.get('notes', '')

                out.writerow(d)
示例#14
0
def make_csv():
    """Write every reporter in REPORTERS to reporters.csv.

    One row is emitted per reporter dict, keyed by its citation
    abbreviation, with list/dict fields flattened to comma-joined
    strings.
    """
    # newline="" lets csv.DictWriter control line endings itself,
    # avoiding spurious blank rows on Windows (per the csv module docs).
    with open("reporters.csv", "w", newline="") as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()

        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter["editions"])
                d["citation"] = cite
                d["name"] = reporter["name"]
                # Optional fields default to the empty string.
                d["publisher"] = reporter.get("publisher", "")
                d["cite_type"] = reporter["cite_type"]
                d["mlz_jurisdictions"] = ", ".join(
                    reporter["mlz_jurisdiction"])
                d["variations"] = ", ".join(reporter["variations"].keys())
                d["href"] = reporter.get("href", "")
                d["notes"] = reporter.get("notes", "")

                out.writerow(d)
示例#15
0
 def all_series():
     """Yield every reporter string: each edition key and each
     variation key, across all reporters."""
     # .values() instead of the Python-2-only .itervalues().
     for reporter_list in REPORTERS.values():
         for reporter in reporter_list:
             # list() both key collections: on Python 3 dict.keys()
             # returns a view that does not support "+" concatenation.
             for k in (list(reporter["editions"]) +
                       list(reporter["variations"])):
                 yield k
示例#16
0
def iter_reporters():
    """Yield (abbreviation, reporter_list, reporter_data) triples for
    every reporter entry in REPORTERS."""
    for abbv, entries in REPORTERS.items():
        for entry in entries:
            yield abbv, entries, entry
def aggregate_reporters():
    """Aggregate per-file citation counts into dest_dir/aggregate.csv.

    Builds lookup tables from three reference sources (the FLP
    reporters database, two Juris-M abbreviation files, and the CAP
    reporter list), sums the 'Count' column of every CSV in source_dir
    per normalized reporter key, and writes one ranked row per key
    that was seen at least 100 times.

    NOTE(review): Python 2 code (itervalues/iteritems, unicode, 'wb'
    mode for the csv output) -- needs porting to run under Python 3.
    """
    makedirs(dest_dir)
    # normalized reporter key -> aggregated row dict
    aggregate = {}

    # get map of reporter key to canonical name in FLP db
    flp_keys = {}
    for reporter_list in REPORTERS.itervalues():
        for reporter in reporter_list:
            fields = [reporter['cite_type'], reporter['name']]
            for k in reporter["editions"].keys():
                flp_keys[cite_to_key(k)] = fields + [k]
            # variations map an alternate spelling k to canonical name v
            for k, v in reporter["variations"].items():
                flp_keys[cite_to_key(k)] = fields + [v]

    # get map of reporter key to name in Juris-M db
    juris_keys = {}
    for json_file, label in [[
            '../lib/jurism-abbreviations/primary-us.json', 'primary'
    ], ['../lib/jurism-abbreviations/secondary-us-bluebook.json',
            'secondary']]:
        data = json.load(
            open(os.path.join(os.path.dirname(__file__), json_file)))
        for juris in data["xdata"].itervalues():
            for full_name, short_name in juris["container-title"].iteritems():
                key = cite_to_key(short_name)
                # first occurrence wins; later duplicates are ignored
                if key not in juris_keys:
                    juris_keys[key] = [label, short_name, full_name]

    # get map of reporter key to CAP reporter
    cap_keys = {}
    for reporter in json.load(
            open(
                os.path.join(os.path.dirname(__file__),
                             '../lib/reporter-list/reporters.json'))):
        key = cite_to_key(reporter['short'])
        if key not in cap_keys:
            cap_keys[key] = [reporter['reporter'], reporter['short']]

    # aggregate rows in our collected citations
    for csv_path in tqdm(sorted(glob(os.path.join(source_dir, "*.csv")))):
        csvreader = csv.DictReader(open(csv_path))
        for row in csvreader:
            key = cite_to_key(row['Series'])
            if key in aggregate:
                aggregate[key]['Count'] += int(row['Count'])
            else:
                # first sighting of this key: normalize the row and
                # attach the three reference-database lookups
                row['Examples'] = ['', '', '']
                row['Count'] = int(row['Count'])
                row['Series'] = key
                row['FLP'] = flp_keys.get(key, ['', '', ''])
                row['juris'] = juris_keys.get(key, ['', '', ''])
                row['CAP'] = cap_keys.get(key, ['', ''])

                aggregate[key] = row

            # prepend this row's non-empty example cites
            aggregate[key]['Examples'] = [
                row['Example %s' % i]
                for i in [1, 2, 3] if row.get('Example %s' % i)
            ] + aggregate[key]['Examples']

    # write to CSV: only keys with >= 100 total citations, by count desc
    out = [[k, v['Count']] + v['Examples'][:3] + v['CAP'] + v['FLP'] +
           v['juris'] for k, v in aggregate.iteritems() if v['Count'] >= 100]
    out.sort(key=lambda x: x[1], reverse=True)
    with open(os.path.join(dest_dir, 'aggregate.csv'), 'wb') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([
            'Series',
            'Count',
            'Example 1',
            'Example 2',
            'Example 3',
            'CAP Cite',
            'CAP Full',
            'FLP Type',
            'FLP Name',
            'FLP Cite',
            'Juris-M Type',
            'Juris-M Cite',
            'Juris-M Full',
        ])
        for row in out:
            csvwriter.writerow([unicode(s).encode("utf-8") for s in row])
示例#18
0
 def all_series():
     """Yield every reporter string: each edition key and each
     variation key, across all reporters."""
     # .values() instead of the Python-2-only .itervalues(); list()
     # both key collections because on Python 3 dict.keys() returns a
     # view that does not support "+" concatenation.
     for reporter_list in REPORTERS.values():
         for reporter in reporter_list:
             for k in list(reporter["editions"]) + list(reporter["variations"]):
                 yield k
示例#19
0
def _populate_reporter_extractors():
    """Populate EXTRACTORS and EDITIONS_LOOKUP.

    Walks REPORTERS, LAWS, and JOURNALS, expands each source's regex
    templates into concrete citation regexes, groups the resulting
    editions by regex, and finally registers one TokenExtractor per
    regex plus a handful of special-purpose extractors (id., supra,
    paragraph tokens, stop words, section symbols).
    """

    # Set up regex replacement variables from reporters-db
    raw_regex_variables = deepcopy(RAW_REGEX_VARIABLES)
    raw_regex_variables["full_cite"][""] = "$volume $reporter,? $page"
    raw_regex_variables["page"][""] = rf"(?P<page>{PAGE_NUMBER_REGEX})"
    regex_variables = process_variables(raw_regex_variables)

    def _substitute_edition(template, *edition_names):
        """Helper to replace $edition in template with edition_names."""
        # Multiple edition names become a regex alternation, each one
        # escaped so punctuation like "." matches literally.
        edition = "|".join(re.escape(e) for e in edition_names)
        return Template(template).safe_substitute(edition=edition)

    # Extractors step one: add an extractor for each reporter string

    # Build a lookup of regex -> edition.
    # Keys in this dict will be regular expressions to handle a
    # particular reporter string, like (simplified)
    # r"(?P<volume>\d+) (?P<reporter>U\.S\.) (?P<page>\d+)"
    editions_by_regex = defaultdict(
        # Values in this dict will be:
        lambda: {
            # Exact matches. If the regex is "\d+ U.S. \d+",
            # this will be [Edition("U.S.")]
            "editions": [],
            # Variants. If the regex matches "\d+ U. S. \d+",
            # this will be [Edition("U.S.")]
            "variations": [],
            # Strings a text must contain for this regex to match.
            # If the regex is "\d+ S.E. 2d \d+",
            # this will be {"S.E. 2d"}
            "strings": set(),
            # Whether this regex results in a short cite:
            "short": False,
        }
    )

    def _add_regex(
        kind: str,
        reporters: List[str],
        edition: Edition,
        regex: str,
    ):
        """Helper to generate citations for a reporter
        and insert into editions_by_regex."""
        for reporter in reporters:
            EDITIONS_LOOKUP[reporter].append(edition)
        editions_by_regex[regex][kind].append(edition)

        # add strings
        # (only when the literal, escaped reporter string appears in
        # the regex can it serve as a containment pre-filter)
        have_strings = re.escape(reporters[0]) in regex
        if have_strings:
            editions_by_regex[regex]["strings"].update(reporters)

        # add short cite
        # (register the derived short-cite regex as well, if distinct)
        short_cite_regex = short_cite_re(regex)
        if short_cite_regex != regex:
            editions_by_regex[short_cite_regex][kind].append(edition)
            editions_by_regex[short_cite_regex]["short"] = True
            if have_strings:
                editions_by_regex[short_cite_regex]["strings"].update(
                    reporters
                )

    def _add_regexes(
        regex_templates: List[str],
        edition_name: str,
        edition: Edition,
        variations: List[str],
    ):
        """Expand regex_templates and add to editions_by_regex."""
        for regex_template in regex_templates:
            # Resolve $variable references (possibly nested) first.
            regex_template = recursive_substitute(
                regex_template, regex_variables
            )
            regex = _substitute_edition(regex_template, edition_name)
            _add_regex("editions", [edition_name], edition, regex)
            if variations:
                regex = _substitute_edition(regex_template, *variations)
                _add_regex(
                    "variations",
                    variations,
                    edition,
                    regex,
                )

    # add reporters.json:
    for source_key, source_cluster in REPORTERS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="reporters",
            )
            variations = source["variations"]
            for edition_name, edition_data in source["editions"].items():
                edition = Edition(
                    short_name=edition_name,
                    reporter=reporter_obj,
                    start=edition_data["start"],
                    end=edition_data["end"],
                )
                # Default to the standard "volume reporter page" shape
                # when no explicit regexes are given.
                regex_templates = edition_data.get("regexes") or ["$full_cite"]
                # Only variations that point at this edition apply.
                edition_variations = [
                    k for k, v in variations.items() if v == edition_name
                ]
                _add_regexes(
                    regex_templates, edition_name, edition, edition_variations
                )

    # add laws.json
    for source_key, source_cluster in LAWS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="laws",
            )
            edition = Edition(
                short_name=source_key,
                reporter=reporter_obj,
                start=source["start"],
                end=source["end"],
            )
            regex_templates = source.get("regexes") or ["$full_cite"]
            # handle citation to multiple sections, like
            # "Mass. Gen. Laws ch. 1, §§ 2-3":
            regex_templates = [
                r.replace(r"§ ", r"§§? ?") for r in regex_templates
            ]
            _add_regexes(
                regex_templates,
                source_key,
                edition,
                source.get("variations", []),
            )

    # add journals.json
    for source_key, source_cluster in JOURNALS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="journals",
            )
            edition = Edition(
                short_name=source_key,
                reporter=reporter_obj,
                start=source["start"],
                end=source["end"],
            )
            regex_templates = source.get("regexes") or ["$full_cite"]
            _add_regexes(
                regex_templates,
                source_key,
                edition,
                source.get("variations", []),
            )

    # Add each regex to EXTRACTORS:
    for regex, cluster in editions_by_regex.items():
        EXTRACTORS.append(
            TokenExtractor(
                nonalphanum_boundaries_re(regex),
                CitationToken.from_match,
                extra={
                    "exact_editions": cluster["editions"],
                    "variation_editions": cluster["variations"],
                    "short": cluster["short"],
                },
                strings=list(cluster["strings"]),
            )
        )

    # Extractors step two:
    # Add a few one-off extractors to handle special token types
    # other than citations:

    EXTRACTORS.extend(
        [
            # Id.
            TokenExtractor(
                ID_REGEX,
                IdToken.from_match,
                flags=re.I,
                strings=["id.", "ibid."],
            ),
            # supra
            TokenExtractor(
                SUPRA_REGEX,
                SupraToken.from_match,
                flags=re.I,
                strings=["supra"],
            ),
            # paragraph
            TokenExtractor(
                PARAGRAPH_REGEX,
                ParagraphToken.from_match,
            ),
            # case name stopwords
            TokenExtractor(
                STOP_WORD_REGEX,
                StopWordToken.from_match,
                flags=re.I,
                strings=STOP_WORDS,
            ),
            # tokens containing section symbols
            TokenExtractor(
                SECTION_REGEX, SectionToken.from_match, strings=["§"]
            ),
        ]
    )
示例#20
0
def disambiguate_reporters(
    citations: List[Union[Citation, NonopinionCitation]]
) -> List[Union[Citation, NonopinionCitation]]:
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:
     - citation.canonical_reporter
     - citation.lookup_index

    And there are a few things that can be ambiguous:
     - More than one variation.
     - More than one reporter for the key.
     - Could be an edition (or not)
     - All combinations of the above:
        - More than one variation.
        - More than one variation, with more than one reporter for the key.
        - More than one variation, with more than one reporter for the key,
          which is an edition.
        - More than one variation, which is an edition
        - ...

    For variants, we just need to sort out the canonical_reporter.

    If it's not possible to disambiguate the reporter, we simply have to drop
    it.
    """
    unambiguous_citations = []
    for citation in citations:
        # Only disambiguate citations with a reporter
        if not isinstance(citation, (FullCitation, ShortformCitation)):
            unambiguous_citations.append(citation)
            continue

        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        # (EDITIONS maps the cited string to its canonical reporter key)
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            if len(REPORTERS[EDITIONS[citation.reporter]]) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue

            # Multiple books under this key, but which is correct?
            if citation.year:
                # attempt resolution by date
                possible_citations = []
                rep_len = len(REPORTERS[EDITIONS[citation.reporter]])
                for i in range(0, rep_len):
                    if is_date_in_reporter(
                            REPORTERS[EDITIONS[citation.reporter]][i]
                        ["editions"],
                            citation.year,
                    ):
                        possible_citations.append((citation.reporter, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit
                    # after filtering by year.
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue

        # Try doing a variation of an edition.
        elif VARIATIONS_ONLY.get(citation.reporter) is not None:
            if len(VARIATIONS_ONLY[citation.reporter]) == 1:
                # Only one variation -- great, use it.
                citation.canonical_reporter = EDITIONS[VARIATIONS_ONLY[
                    citation.reporter][0]]
                # Remember the original (variant) spelling before we
                # overwrite citation.reporter; used below to resolve by
                # unique variation.
                cached_variation = citation.reporter
                citation.reporter = VARIATIONS_ONLY[citation.reporter][0]
                if len(REPORTERS[citation.canonical_reporter]) == 1:
                    # It's a single reporter under a misspelled key.
                    citation.lookup_index = 0
                    unambiguous_citations.append(citation)
                    continue

                # Multiple reporters under a single misspelled key
                # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
                #                          Washington Reports).
                if citation.year:
                    # attempt resolution by date
                    possible_citations = []
                    rep_can = len(REPORTERS[citation.canonical_reporter])
                    for i in range(0, rep_can):
                        if is_date_in_reporter(
                                REPORTERS[citation.canonical_reporter][i]
                            ["editions"],
                                citation.year,
                        ):
                            possible_citations.append((citation.reporter, i))
                    if len(possible_citations) == 1:
                        # We were able to identify only one hit after
                        # filtering by year.
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
                # Attempt resolution by unique variation
                # (e.g. Cr. can only be Cranch[0])
                possible_citations = []
                reps = REPORTERS[citation.canonical_reporter]
                for i in range(0, len(reps)):
                    for variation in REPORTERS[citation.canonical_reporter][i][
                            "variations"].items():
                        if variation[0] == cached_variation:
                            possible_citations.append((variation[1], i))
                if len(possible_citations) == 1:
                    # We were able to find a single match after filtering
                    # by variation.
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue
            else:
                # Multiple variations, deal with them.
                possible_citations = []
                for reporter_key in VARIATIONS_ONLY[citation.reporter]:
                    for i in range(0, len(REPORTERS[EDITIONS[reporter_key]])):
                        # This inner loop works regardless of the number of
                        # reporters under the key.
                        key = REPORTERS[EDITIONS[reporter_key]]
                        if citation.year:
                            cite_year = citation.year
                            if is_date_in_reporter(key[i]["editions"],
                                                   cite_year):
                                possible_citations.append((reporter_key, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit after filtering by
                    # year.
                    citation.canonical_reporter = EDITIONS[
                        possible_citations[0][0]]
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue

    # Citations that fell through every branch above were ambiguous and
    # are dropped, per the docstring.
    return unambiguous_citations
示例#21
0
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    """Extract the date an opinion was filed from its cleaned HTML.

    Scans <center> elements (skipping docket-number lines), parses any
    dates found within a sanity range derived from the citations'
    reporters, then falls back to the SCOTUS date table, a year shared
    by all citations, and finally the manual `fixes` table. Returns
    max(dates) when anything was found, else [].

    NOTE(review): Python 2 code (print statement, raw_input). The
    regex patterns are not raw strings -- TODO: make them raw to avoid
    invalid-escape warnings on Python 3.
    """
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    reporter_keys = [citation.reporter for citation in citations]
    range_dates = []
    for reporter_key in reporter_keys:
        for reporter in REPORTERS.get(EDITIONS.get(reporter_key)):
            try:
                range_dates.extend(reporter["editions"][reporter_key])
            except KeyError:
                # Fails when a reporter_key points to more than one reporter, one of which doesn't have the edition
                # queried. For example, Wash. 2d isn't in REPORTERS['Wash.']['editions'][0].
                pass
    if range_dates:
        # Pad the reporter's date range by ~20 years on each side,
        # capped at the present.
        start, end = min(range_dates) - timedelta(weeks=(20 * 52)), max(range_dates) + timedelta(weeks=20 * 52)
        if end > now():
            end = now()

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method="text", encoding="unicode")
        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in the date parser. Consequently, we purge
        # the word at, and anything after it.
        text = re.sub(" at .*", "", text)

        # The parser recognizes numbers like 121118 as a date. This corpus does not have dates in that format.
        text = re.sub("\d{5,}", "", text)

        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace("Sept.", "Sep.")

        # The parser recognizes dates like December 3, 4, 1908 as 2004-12-3 19:08.
        re_match = re.search("\d{1,2}, \d{1,2}, \d{4}", text)
        if re_match:
            # These are always date argued, thus continue.
            continue

        # The parser recognizes dates like October 12-13, 1948 as 2013-10-12, not as 1948-10-12
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        re_match = re.search("\d{1,2}-\d{1,2}, \d{4}", text)
        if re_match:
            # These are always date argued, thus continue.
            continue

        # Sometimes there's a string like: "Review Denied July 26, 2006. Skip this.
        if "denied" in text.lower():
            continue

        try:
            if range_dates:
                found = parse_dates.parse_dates(text, sane_start=start, sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # If it has unicode is crashes dateutil's parser, but is unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    scotus_dates_found = []
    if not dates and court == "scotus":
        for citation in citations:
            try:
                # Scotus dates are in the form of a list, since a single citation can refer to several dates.
                found = scotus_dates["%s %s %s" % (citation.volume, citation.reporter, citation.page)]
                if len(found) == 1:
                    scotus_dates_found.extend(found)
            except KeyError:
                pass
        # Only trust the table when it yields exactly one date.
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all of them.
        years = set([citation.year for citation in citations if citation.year])
        if len(years) == 1:
            dates.append(datetime.datetime(list(years)[0], 1, 1))

    if not dates:
        # Last resort: the manual fixes table, optionally prompting a
        # human when DEBUG flags request it.
        try:
            dates = fixes[case_path]["dates"]
        except KeyError:
            if "input_dates" in DEBUG:
                # subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                print "  No date found for: file://%s" % case_path
                input_date = raw_input("  What should be here (YYYY-MM-DD)? ")
                add_fix(case_path, {"dates": [datetime.datetime.strptime(input_date, "%Y-%m-%d")]})
                dates = [datetime.datetime.strptime(input_date, "%Y-%m-%d")]
            if "log_bad_dates" in DEBUG:
                # Write the failed case out to file.
                with open("missing_dates.txt", "a") as out:
                    out.write("%s\n" % case_path)

    if dates:
        if "date" in DEBUG:
            log_print("  Using date: %s of dates found: %s" % (max(dates), dates))
        return max(dates)
    else:
        if "date" in DEBUG:
            log_print("  No dates found")
        return []
示例#22
0
def get_editions(REPORTERS=None):
    """
    Build a lookup dict from each reporter string (raw and normalized) to
    the list of edition records it may resolve to.

    Given:
    >>> REPORTERS = {
    ...     'Foo': [
    ...         {
    ...             'editions': {
    ...                 'Foo': {"start": datetime(1900, 1, 1), "end": datetime(1910, 1, 1)},
    ...                 'Foo 2d.': {"start": datetime(1910, 1, 1), "end": None},
    ...             },
    ...             'variations': {
    ...                 'foo': 'Foo',
    ...                 'Food': 'Foo',
    ...             }
    ...         },
    ...         {
    ...             'editions': {
    ...                 'Foo': {"start": datetime(1800, 1, 1), "end": datetime(1810, 1, 1)},
    ...             },
    ...             'variations': {
    ...                 'foo': 'Foo',
    ...                 'Fool': 'Foo',
    ...             }
    ...         },
    ...     ]
    ... }

    Each reporter string maps to its possible resolutions, sorted in reverse-end-date order:
    >>> editions = get_editions(REPORTERS)
    >>> foo1 = {'reporter': 'Foo', 'start_year': 1900, 'end': datetime(1910, 1, 1)}
    >>> foo2d = {'reporter': 'Foo 2d.', 'start_year': 1910, 'end': datetime(9999, 1, 1)}
    >>> foo2 = {'reporter': 'Foo', 'start_year': 1800, 'end': datetime(1810, 1, 1)}
    >>> assert editions == {
    ...     'Foo': [foo1, foo2],
    ...     'foo': [foo1, foo2],
    ...     'Foo 2d.': [foo2d],
    ...     'foo2d': [foo2d],
    ...     'Food': [foo1],
    ...     'food': [foo1],
    ...     'Fool': [foo2],
    ...     'fool': [foo2],
    ... }
    """
    if REPORTERS is None:
        from reporters_db import REPORTERS

    # Sentinel dates: a still-published edition sorts as if it ends in 9999;
    # an edition whose start matches the "unknown" sentinel gets start_year 0.
    NEVER_ENDED = datetime(9999, 1, 1)
    START_UNKNOWN = datetime(1750, 1, 1)
    lookup = defaultdict(list)

    def register(cite_string, entry):
        # File the entry under both the raw and the normalized string,
        # skipping anything already recorded for that key.
        for key in (cite_string, normalize_cite(cite_string)):
            bucket = lookup[key]
            if entry not in bucket:
                bucket.append(entry)

    for cluster in REPORTERS.values():
        for reporter in cluster:
            entries_by_name = {}
            for name, dates in reporter["editions"].items():
                start = dates['start']
                entry = {
                    'reporter': name,
                    'start_year': 0 if start == START_UNKNOWN else start.year,
                    'end': dates['end'] or NEVER_ENDED,
                }
                entries_by_name[name] = entry
                register(name, entry)
            # Each variation points at one of this same reporter's editions.
            for variation, canonical in reporter["variations"].items():
                register(variation, entries_by_name[canonical])

    # Rank every key's candidates: exact reporter-name matches first, then
    # the editions that ended most recently.
    for key, candidates in lookup.items():
        candidates.sort(
            key=lambda entry: (entry['reporter'] == key, entry['end']),
            reverse=True,
        )

    return lookup
示例#23
0
def get_date_filed(clean_html_tree, citations, case_path=None, court=None):
    path = '//center[descendant::text()[not(starts-with(normalize-space(.), "No.") or starts-with(normalize-space(.), "Case No.") or starts-with(normalize-space(.), "Record No."))]]'

    # Get a reasonable date range based on reporters in the citations.
    reporter_keys = [citation.reporter for citation in citations]
    range_dates = []
    for reporter_key in reporter_keys:
        for reporter in REPORTERS.get(EDITIONS.get(reporter_key)):
            try:
                range_dates.extend(reporter['editions'][reporter_key])
            except KeyError:
                # Fails when a reporter_key points to more than one reporter, one of which doesn't have the edition
                # queried. For example, Wash. 2d isn't in REPORTERS['Wash.']['editions'][0].
                pass
    if range_dates:
        start, end = min(range_dates) - timedelta(
            weeks=(20 * 52)), max(range_dates) + timedelta(weeks=20 * 52)
        if end > now():
            end = now()

    dates = []
    for e in clean_html_tree.xpath(path):
        text = tostring(e, method='text', encoding='unicode')
        # Items like "February 4, 1991, at 9:05 A.M." stump the lexer in the date parser. Consequently, we purge
        # the word at, and anything after it.
        text = re.sub(' at .*', '', text)

        # The parser recognizes numbers like 121118 as a date. This corpus does not have dates in that format.
        text = re.sub('\d{5,}', '', text)

        # The parser can't handle 'Sept.' so we tweak it.
        text = text.replace('Sept.', 'Sep.')

        # The parser recognizes dates like December 3, 4, 1908 as 2004-12-3 19:08.
        re_match = re.search('\d{1,2}, \d{1,2}, \d{4}', text)
        if re_match:
            # These are always date argued, thus continue.
            continue

        # The parser recognizes dates like October 12-13, 1948 as 2013-10-12, not as 1948-10-12
        # See: https://www.courtlistener.com/scotus/9ANY/grand-river-dam-authority-v-grand-hydro/
        re_match = re.search('\d{1,2}-\d{1,2}, \d{4}', text)
        if re_match:
            # These are always date argued, thus continue.
            continue

        # Sometimes there's a string like: "Review Denied July 26, 2006. Skip this.
        if 'denied' in text.lower():
            continue

        try:
            if range_dates:
                found = parse_dates.parse_dates(text,
                                                sane_start=start,
                                                sane_end=end)
            else:
                found = parse_dates.parse_dates(text, sane_end=now())
            if found:
                dates.extend(found)
        except UnicodeEncodeError:
            # If it has unicode is crashes dateutil's parser, but is unlikely to be the date.
            pass

    # Get the date from our SCOTUS date table
    scotus_dates_found = []
    if not dates and court == 'scotus':
        for citation in citations:
            try:
                # Scotus dates are in the form of a list, since a single citation can refer to several dates.
                found = scotus_dates["%s %s %s" %
                                     (citation.volume, citation.reporter,
                                      citation.page)]
                if len(found) == 1:
                    scotus_dates_found.extend(found)
            except KeyError:
                pass
        if len(scotus_dates_found) == 1:
            dates = scotus_dates_found

    if not dates:
        # Try to grab the year from the citations, if it's the same in all of them.
        years = set([citation.year for citation in citations if citation.year])
        if len(years) == 1:
            dates.append(datetime.datetime(list(years)[0], 1, 1))

    if not dates:
        try:
            dates = fixes[case_path]['dates']
        except KeyError:
            if 'input_dates' in DEBUG:
                #subprocess.Popen(['firefox', 'file://%s' % case_path], shell=False).communicate()
                print '  No date found for: file://%s' % case_path
                input_date = raw_input('  What should be here (YYYY-MM-DD)? ')
                add_fix(case_path, {
                    'dates':
                    [datetime.datetime.strptime(input_date, '%Y-%m-%d')]
                })
                dates = [datetime.datetime.strptime(input_date, '%Y-%m-%d')]
            if 'log_bad_dates' in DEBUG:
                # Write the failed case out to file.
                with open('missing_dates.txt', 'a') as out:
                    out.write('%s\n' % case_path)

    if dates:
        if 'date' in DEBUG:
            log_print("  Using date: %s of dates found: %s" %
                      (max(dates), dates))
        return max(dates)
    else:
        if 'date' in DEBUG:
            log_print("  No dates found")
        return []
示例#24
0
def disambiguate_reporters(citations):
    """Convert a list of citations to a list of unambiguous ones.

    Goal is to figure out:
     - citation.canonical_reporter
     - citation.lookup_index

    And there are a few things that can be ambiguous:
     - More than one variation.
     - More than one reporter for the key.
     - Could be an edition (or not)
     - All combinations of the above:
        - More than one variation.
        - More than one variation, with more than one reporter for the key.
        - More than one variation, with more than one reporter for the key,
          which is an edition.
        - More than one variation, which is an edition
        - ...

    For variants, we just need to sort out the canonical_reporter.

    If it's not possible to disambiguate the reporter, we simply have to drop
    it.

    :param citations: a list of citation objects, each with at least a
        ``reporter`` attribute (and optionally a truthy ``year``).
    :return: the subset of ``citations`` that could be resolved, each
        annotated in place with ``canonical_reporter`` and ``lookup_index``
        (an index into ``REPORTERS[canonical_reporter]``). Citations that
        stay ambiguous are silently dropped.
    """
    unambiguous_citations = []
    for citation in citations:
        # Non-variant items (P.R.R., A.2d, Wash., etc.)
        if REPORTERS.get(EDITIONS.get(citation.reporter)) is not None:
            citation.canonical_reporter = EDITIONS[citation.reporter]
            if len(REPORTERS[EDITIONS[citation.reporter]]) == 1:
                # Single reporter, easy-peasy.
                citation.lookup_index = 0
                unambiguous_citations.append(citation)
                continue
            else:
                # Multiple books under this key, but which is correct?
                if citation.year:
                    # attempt resolution by date
                    possible_citations = []
                    for i in range(0, len(REPORTERS[EDITIONS[citation.reporter]])):
                        if is_date_in_reporter(REPORTERS[EDITIONS[citation.reporter]][i]["editions"], citation.year):
                            possible_citations.append((citation.reporter, i))
                    if len(possible_citations) == 1:
                        # We were able to identify only one hit after filtering by year.
                        citation.reporter = possible_citations[0][0]
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
                # If the year filter left zero or multiple hits, we fall
                # through and the citation is dropped.

        # Try doing a variation of an edition.
        elif VARIATIONS_ONLY.get(citation.reporter) is not None:
            if len(VARIATIONS_ONLY[citation.reporter]) == 1:
                # Only one variation -- great, use it.
                citation.canonical_reporter = EDITIONS[VARIATIONS_ONLY[citation.reporter][0]]
                # Remember the misspelled form; needed below for the
                # resolution-by-unique-variation pass.
                cached_variation = citation.reporter
                citation.reporter = VARIATIONS_ONLY[citation.reporter][0]
                if len(REPORTERS[citation.canonical_reporter]) == 1:
                    # It's a single reporter under a misspelled key.
                    citation.lookup_index = 0
                    unambiguous_citations.append(citation)
                    continue
                else:
                    # Multiple reporters under a single misspelled key
                    # (e.g. Wn.2d --> Wash --> Va Reports, Wash or
                    #                          Washington Reports).
                    if citation.year:
                        # attempt resolution by date
                        possible_citations = []
                        for i in range(0, len(REPORTERS[citation.canonical_reporter])):
                            if is_date_in_reporter(
                                REPORTERS[citation.canonical_reporter][i]["editions"], citation.year
                            ):
                                possible_citations.append((citation.reporter, i))
                        if len(possible_citations) == 1:
                            # We were able to identify only one hit after filtering by year.
                            citation.lookup_index = possible_citations[0][1]
                            unambiguous_citations.append(citation)
                            continue
                    # Attempt resolution by unique variation (e.g. Cr. can only be Cranch[0])
                    possible_citations = []
                    for i in range(0, len(REPORTERS[citation.canonical_reporter])):
                        for variation in REPORTERS[citation.canonical_reporter][i]["variations"].items():
                            if variation[0] == cached_variation:
                                possible_citations.append((variation[1], i))
                    if len(possible_citations) == 1:
                        # We were able to find a single match after filtering by variation.
                        citation.lookup_index = possible_citations[0][1]
                        unambiguous_citations.append(citation)
                        continue
            else:
                # Multiple variations, deal with them.
                possible_citations = []
                for reporter_key in VARIATIONS_ONLY[citation.reporter]:
                    for i in range(0, len(REPORTERS[EDITIONS[reporter_key]])):
                        # This inner loop works regardless of the number of reporters under the key.
                        if is_date_in_reporter(REPORTERS[EDITIONS[reporter_key]][i]["editions"], citation.year):
                            possible_citations.append((reporter_key, i))
                if len(possible_citations) == 1:
                    # We were able to identify only one hit after filtering by year.
                    citation.canonical_reporter = EDITIONS[possible_citations[0][0]]
                    citation.reporter = possible_citations[0][0]
                    citation.lookup_index = possible_citations[0][1]
                    unambiguous_citations.append(citation)
                    continue

    return unambiguous_citations