示例#1
0
    def test_all_required_keys_no_extra_keys(self):
        """Check every reporter entry against the expected key schema.

        Each entry must carry all mandatory keys and may not carry any key
        outside the mandatory and optional sets.
        """
        mandatory = [
            'cite_type', 'editions', 'mlz_jurisdiction', 'name', 'variations'
        ]
        permitted = mandatory + ['publisher', 'notes', 'href']
        for abbv, entries in REPORTERS.items():
            for entry in entries:
                # Mandatory keys first, reported one at a time.
                for field in mandatory:
                    if field not in entry:
                        self.fail("Reporter '%s' lacks required field '%s'" %
                                  (abbv, field))

                # Then reject anything outside the known schema.
                for key in entry:
                    self.assertIn(
                        key, permitted,
                        "Reporter '%s' has an unknown field '%s'" %
                        (abbv, key))
示例#2
0
    def test_all_required_keys_no_extra_keys(self):
        """Validate the key set of every reporter entry.

        Fails on the first required key that is absent from an entry, and on
        any key that is neither required nor optional.
        """
        required_fields = ['cite_type', 'editions', 'mlz_jurisdiction', 'name',
                           'variations']
        optional_fields = ['publisher', 'notes', 'href']
        all_fields = required_fields + optional_fields

        # Flatten the abbreviation -> entries mapping into (abbv, entry) pairs.
        entries = (
            (abbv, data)
            for abbv, group in REPORTERS.items()
            for data in group
        )
        for abbv, data in entries:
            # Required keys: report the first one that is absent.
            absent = [name for name in required_fields if name not in data]
            if absent:
                self.fail("Reporter '%s' lacks required field '%s'" % (
                    abbv, absent[0]
                ))

            # Unknown keys: anything outside required + optional.
            for key in data:
                self.assertIn(
                    key,
                    all_fields,
                    "Reporter '%s' has an unknown field '%s'" % (abbv, key),
                )
示例#3
0
    def test_fields_tidy(self):
        """Verify that reporter keys and variation strings are clean.

        A string is considered clean when stripping surrounding whitespace
        and removing every character outside the allowed set (space, ASCII
        letters and digits, and ``. , - ' & ( )``) leaves it unchanged.
        Finally, every string emitted by ``emit_strings(REPORTERS)`` must have
        no leading or trailing whitespace.
        """
        disallowed = re.compile(r"[^ 0-9a-zA-Z.,\-'&()]")

        def cleaner(s):
            return disallowed.sub("", s.strip())

        msg = "Got bad punctuation in: %s"
        for abbv, entries in REPORTERS.items():
            # The top-level abbreviation itself.
            self.assertEqual(abbv, cleaner(abbv), msg=msg % abbv)
            for entry in entries:
                # Edition names.
                for edition_name in entry["editions"]:
                    self.assertEqual(
                        cleaner(edition_name),
                        edition_name,
                        msg=msg % edition_name,
                    )
                # Both sides of each variation mapping.
                for variant, target in entry["variations"].items():
                    self.assertEqual(cleaner(variant), variant, msg=msg % variant)
                    self.assertEqual(cleaner(target), target, msg=msg % target)

        for text in emit_strings(REPORTERS):
            stripped = text.strip()
            self.assertEqual(
                stripped, text, msg="Fields needs whitespace stripped: '%s'" % text
            )
示例#4
0
 def test_all_reporters_have_valid_cite_type(self):
     """Check that each reporter entry's cite_type is in VALID_CITE_TYPES."""
     for abbv, entries in REPORTERS.items():
         for entry in entries:
             cite_type = entry['cite_type']
             failure_note = "%s did not have a valid cite_type value" % abbv
             self.assertIn(cite_type, VALID_CITE_TYPES, failure_note)
示例#5
0
 def test_all_reporters_have_valid_cite_type(self):
     """Every reporter entry must declare a cite_type from VALID_CITE_TYPES."""
     # Lazily flatten abbreviation -> entries into (abbreviation, entry) pairs.
     flattened = (
         (abbv, entry)
         for abbv, entries in REPORTERS.items()
         for entry in entries
     )
     for abbv, entry in flattened:
         self.assertIn(
             entry['cite_type'],
             VALID_CITE_TYPES,
             "%s did not have a valid cite_type value" % abbv,
         )
示例#6
0
 def test_any_keys_missing_editions(self):
     """Ensure each reporter key also appears among its own editions."""
     for abbv, books in REPORTERS.items():
         for book in books:
             # The top-level key must be one of the entry's edition names.
             editions = book['editions']
             self.assertIn(
                 abbv, editions,
                 msg="Could not find edition for key: %s" % abbv
             )
示例#7
0
 def test_any_keys_missing_editions(self):
     """Each REPORTERS key must be listed in the editions of its entries."""
     # Lazily pair every reporter key with each of its entries.
     pairs = (
         (key, entry)
         for key, entries in REPORTERS.items()
         for entry in entries
     )
     for key, entry in pairs:
         self.assertIn(
             key,
             entry['editions'],
             msg="Could not find edition for key: %s" % key,
         )
示例#8
0
def make_csv():
    """Write every reporter entry in REPORTERS to ``reporters.csv``.

    The file is created (or overwritten) in the current working directory
    with a header row taken from FIELDNAMES. Each reporter entry becomes one
    row: the dict returned by ``make_editions_dict`` for its editions,
    extended with citation, name, publisher, cite_type, jurisdictions,
    variation keys, href and notes. Missing optional fields are written as
    empty strings. Progress is printed once per citation key.
    """
    # newline='' is required by the csv module so that it controls line
    # endings itself; without it, platforms that translate '\n' produce
    # '\r\r\n' row terminators. The explicit encoding keeps the output
    # independent of the machine's locale.
    with open('reporters.csv', 'w', newline='', encoding='utf-8') as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()

        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter['editions'])
                d['citation'] = cite
                d['name'] = reporter['name']
                d['publisher'] = reporter.get('publisher', '')
                d['cite_type'] = reporter['cite_type']
                d['mlz_jurisdictions'] = ", ".join(reporter['mlz_jurisdiction'])
                # Only the variant spellings (keys) are exported.
                d['variations'] = ", ".join(reporter['variations'])
                d['href'] = reporter.get('href', '')
                d['notes'] = reporter.get('notes', '')

                out.writerow(d)
示例#9
0
    def test_all_required_keys_no_extra_keys(self):
        """Validate the keys and string values of every reporter entry.

        Three checks run per entry, in this order: all required keys exist,
        no key falls outside the required and optional sets, and no
        string-typed value is the empty string.
        """
        required_fields = [
            "cite_type",
            "editions",
            "mlz_jurisdiction",
            "name",
            "variations",
        ]
        optional_fields = [
            "cite_format",
            "publisher",
            "notes",
            "href",
            "regexes",
            "examples",
        ]
        all_fields = required_fields + optional_fields

        for abbv, entries in REPORTERS.items():
            for entry in entries:
                # 1. Required keys.
                for name in required_fields:
                    if name not in entry:
                        self.fail(
                            "Reporter '%s' lacks required field '%s'" % (abbv, name)
                        )

                # 2. Unknown keys.
                for key in entry:
                    self.assertIn(
                        key,
                        all_fields,
                        "Reporter '%s' has an unknown field '%s'" % (abbv, key),
                    )

                # 3. Empty strings (non-string values are not inspected).
                for key, value in entry.items():
                    if not isinstance(value, str):
                        continue
                    self.assertTrue(
                        value != "",
                        msg="Field '%s' is empty in reporter '%s'" % (key, abbv),
                    )
示例#10
0
def make_csv():
    """Export REPORTERS as ``reporters.csv`` in the working directory.

    Writes a header row from FIELDNAMES, then one row per reporter entry.
    Each row starts from the dict produced by ``make_editions_dict`` for the
    entry's editions and is filled in with the citation key and the entry's
    descriptive fields; optional fields that are absent become empty
    strings. A progress line is printed for each citation key.
    """
    # The csv module expects files opened with newline="" so it can emit its
    # own row terminators; otherwise newline translation can double the
    # carriage return. An explicit encoding avoids locale-dependent output.
    with open("reporters.csv", "w", newline="", encoding="utf-8") as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()

        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter["editions"])
                d["citation"] = cite
                d["name"] = reporter["name"]
                d["publisher"] = reporter.get("publisher", "")
                d["cite_type"] = reporter["cite_type"]
                d["mlz_jurisdictions"] = ", ".join(
                    reporter["mlz_jurisdiction"])
                # Only the variant spellings (keys) are exported.
                d["variations"] = ", ".join(reporter["variations"])
                d["href"] = reporter.get("href", "")
                d["notes"] = reporter.get("notes", "")

                out.writerow(d)
示例#11
0
文件: make_csv.py 项目: limc/project
def make_csv():
    """Dump all reporter entries from REPORTERS into ``reporters.csv``.

    The output file is overwritten on every run. Its columns come from
    FIELDNAMES; each row combines the dict returned by
    ``make_editions_dict`` with the entry's citation key, name, publisher,
    cite_type, jurisdictions, variation keys, href and notes. Absent
    optional fields are written as empty strings, and one progress line is
    printed per citation key.
    """
    # Per the csv module documentation the file must be opened with
    # newline='' so row terminators are not translated a second time.
    # The encoding is pinned so the result does not vary with the locale.
    with open('reporters.csv', 'w', newline='', encoding='utf-8') as f:
        out = csv.DictWriter(f, fieldnames=FIELDNAMES)
        out.writeheader()

        for cite, reporter_list in REPORTERS.items():
            print("Adding: %s" % cite)
            for reporter in reporter_list:
                d = make_editions_dict(reporter['editions'])
                d['citation'] = cite
                d['name'] = reporter['name']
                d['publisher'] = reporter.get('publisher', '')
                d['cite_type'] = reporter['cite_type']
                d['mlz_jurisdictions'] = ", ".join(
                    reporter['mlz_jurisdiction'])
                # Only the variant spellings (keys) are exported.
                d['variations'] = ", ".join(reporter['variations'])
                d['href'] = reporter.get('href', '')
                d['notes'] = reporter.get('notes', '')

                out.writerow(d)
示例#12
0
def _populate_reporter_extractors():
    """Populate EXTRACTORS and EDITIONS_LOOKUP.

    Walks REPORTERS, LAWS and JOURNALS, expands each source's regex
    templates into concrete regexes, and groups the resulting editions by
    the regex that matches them. One TokenExtractor is then appended to
    EXTRACTORS per distinct regex, followed by a fixed list of extractors
    for non-citation tokens (id., supra, paragraph, stop words, section
    symbols).

    The function works purely by side effect: it appends to the EXTRACTORS
    and EDITIONS_LOOKUP containers defined outside this function and
    returns nothing.
    """

    # Set up regex replacement variables from reporters-db
    # Work on a deep copy so RAW_REGEX_VARIABLES itself is not modified.
    raw_regex_variables = deepcopy(RAW_REGEX_VARIABLES)
    # Override the "" entries of "full_cite" and "page" with local patterns.
    # NOTE(review): presumably "" is the default variant of each variable --
    # confirm against process_variables.
    raw_regex_variables["full_cite"][""] = "$volume $reporter,? $page"
    raw_regex_variables["page"][""] = rf"(?P<page>{PAGE_NUMBER_REGEX})"
    regex_variables = process_variables(raw_regex_variables)

    def _substitute_edition(template, *edition_names):
        """Helper to replace $edition in template with edition_names.

        The names are regex-escaped and joined with "|" so that the result
        matches any one of them. safe_substitute is used, so placeholders
        other than $edition are left in the template unchanged.
        """
        edition = "|".join(re.escape(e) for e in edition_names)
        return Template(template).safe_substitute(edition=edition)

    # Extractors step one: add an extractor for each reporter string

    # Build a lookup of regex -> edition.
    # Keys in this dict will be regular expressions to handle a
    # particular reporter string, like (simplified)
    # r"(?P<volume>\d+) (?P<reporter>U\.S\.) (?P<page>\d+)"
    editions_by_regex = defaultdict(
        # Values in this dict will be:
        lambda: {
            # Exact matches. If the regex is "\d+ U.S. \d+",
            # this will be [Edition("U.S.")]
            "editions": [],
            # Variants. If the regex matches "\d+ U. S. \d+",
            # this will be [Edition("U.S.")]
            "variations": [],
            # Strings a text must contain for this regex to match.
            # If the regex is "\d+ S.E. 2d \d+",
            # this will be {"S.E. 2d"}
            "strings": set(),
            # Whether this regex results in a short cite:
            "short": False,
        }
    )

    def _add_regex(
        kind: str,
        reporters: List[str],
        edition: Edition,
        regex: str,
    ):
        """Helper to generate citations for a reporter
        and insert into editions_by_regex.

        kind is the list to append to in the per-regex entry ("editions" or
        "variations" at the call sites below). reporters are the reporter
        strings covered by regex; edition is registered under each of them
        in EDITIONS_LOOKUP. The short-cite form of regex, when it differs,
        is registered as a separate entry flagged "short".
        """
        for reporter in reporters:
            EDITIONS_LOOKUP[reporter].append(edition)
        editions_by_regex[regex][kind].append(edition)

        # add strings
        # Only record the reporter strings as required substrings when the
        # first one appears (escaped) literally inside the regex.
        have_strings = re.escape(reporters[0]) in regex
        if have_strings:
            editions_by_regex[regex]["strings"].update(reporters)

        # add short cite
        short_cite_regex = short_cite_re(regex)
        # An unchanged result means there is no distinct short-cite pattern
        # to register for this regex.
        if short_cite_regex != regex:
            editions_by_regex[short_cite_regex][kind].append(edition)
            editions_by_regex[short_cite_regex]["short"] = True
            if have_strings:
                editions_by_regex[short_cite_regex]["strings"].update(
                    reporters
                )

    def _add_regexes(
        regex_templates: List[str],
        edition_name: str,
        edition: Edition,
        variations: List[str],
    ):
        """Expand regex_templates and add to editions_by_regex.

        For every template, one regex is registered for the exact
        edition_name and, when variations is non-empty, a second regex that
        alternates over all of the variation strings.
        """
        for regex_template in regex_templates:
            # Resolve the shared $variables first; $edition is filled in
            # separately below for the exact name and for the variations.
            regex_template = recursive_substitute(
                regex_template, regex_variables
            )
            regex = _substitute_edition(regex_template, edition_name)
            _add_regex("editions", [edition_name], edition, regex)
            if variations:
                regex = _substitute_edition(regex_template, *variations)
                _add_regex(
                    "variations",
                    variations,
                    edition,
                    regex,
                )

    # add reporters.json:
    for source_key, source_cluster in REPORTERS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="reporters",
            )
            # Maps a variant spelling to the edition name it stands for.
            variations = source["variations"]
            for edition_name, edition_data in source["editions"].items():
                edition = Edition(
                    short_name=edition_name,
                    reporter=reporter_obj,
                    start=edition_data["start"],
                    end=edition_data["end"],
                )
                # Fall back to the generic full-citation template when the
                # edition has no (or empty) "regexes".
                regex_templates = edition_data.get("regexes") or ["$full_cite"]
                # Keep only the variants that point at this edition.
                edition_variations = [
                    k for k, v in variations.items() if v == edition_name
                ]
                _add_regexes(
                    regex_templates, edition_name, edition, edition_variations
                )

    # add laws.json
    for source_key, source_cluster in LAWS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="laws",
            )
            # Unlike reporters, each law source yields a single Edition
            # named after the source key itself.
            edition = Edition(
                short_name=source_key,
                reporter=reporter_obj,
                start=source["start"],
                end=source["end"],
            )
            regex_templates = source.get("regexes") or ["$full_cite"]
            # handle citation to multiple sections, like
            # "Mass. Gen. Laws ch. 1, §§ 2-3":
            regex_templates = [
                r.replace(r"§ ", r"§§? ?") for r in regex_templates
            ]
            _add_regexes(
                regex_templates,
                source_key,
                edition,
                source.get("variations", []),
            )

    # add journals.json
    for source_key, source_cluster in JOURNALS.items():
        for source in source_cluster:
            reporter_obj = Reporter(
                short_name=source_key,
                name=source["name"],
                cite_type=source["cite_type"],
                source="journals",
            )
            # As with laws, one Edition per journal source, named after the
            # source key.
            edition = Edition(
                short_name=source_key,
                reporter=reporter_obj,
                start=source["start"],
                end=source["end"],
            )
            regex_templates = source.get("regexes") or ["$full_cite"]
            _add_regexes(
                regex_templates,
                source_key,
                edition,
                source.get("variations", []),
            )

    # Add each regex to EXTRACTORS:
    for regex, cluster in editions_by_regex.items():
        EXTRACTORS.append(
            TokenExtractor(
                nonalphanum_boundaries_re(regex),
                CitationToken.from_match,
                # Carry the grouped editions and the short-cite flag along
                # with the extractor.
                extra={
                    "exact_editions": cluster["editions"],
                    "variation_editions": cluster["variations"],
                    "short": cluster["short"],
                },
                strings=list(cluster["strings"]),
            )
        )

    # Extractors step two:
    # Add a few one-off extractors to handle special token types
    # other than citations:

    EXTRACTORS.extend(
        [
            # Id.
            TokenExtractor(
                ID_REGEX,
                IdToken.from_match,
                flags=re.I,
                strings=["id.", "ibid."],
            ),
            # supra
            TokenExtractor(
                SUPRA_REGEX,
                SupraToken.from_match,
                flags=re.I,
                strings=["supra"],
            ),
            # paragraph
            TokenExtractor(
                PARAGRAPH_REGEX,
                ParagraphToken.from_match,
            ),
            # case name stopwords
            TokenExtractor(
                STOP_WORD_REGEX,
                StopWordToken.from_match,
                flags=re.I,
                strings=STOP_WORDS,
            ),
            # tokens containing section symbols
            TokenExtractor(
                SECTION_REGEX, SectionToken.from_match, strings=["§"]
            ),
        ]
    )
示例#13
0
def iter_reporters():
    """Yield one ``(abbreviation, reporter_list, reporter_data)`` triple
    for every entry of every list in REPORTERS.

    ``reporter_list`` is the full list the entry belongs to, so the same
    list object is repeated for each of its entries.
    """
    for abbv, cluster in REPORTERS.items():
        yield from ((abbv, cluster, entry) for entry in cluster)