forked from freelawproject/reporters-db
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tests.py
343 lines (308 loc) · 12.5 KB
/
tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
import json
import os
import re
import datetime
from difflib import context_diff
from pathlib import Path
import six
from reporters_db import (
REPORTERS,
VARIATIONS_ONLY,
EDITIONS,
NAMES_TO_EDITIONS,
REGEX_VARIABLES,
)
from unittest import TestCase
from reporters_db.utils import substitute_editions, recursive_substitute
# Every value a reporter's "cite_type" field may take; checked by
# ConstantsTest.test_all_reporters_have_valid_cite_type.
VALID_CITE_TYPES = (
    "federal",
    "neutral",
    "scotus_early",
    "specialty",
    "specialty_west",
    "specialty_lexis",
    "state",
    "state_regional",
)
def emit_strings(obj):
    """Recursively yield every string found in a JSON-like object.

    Dicts contribute both their keys and their values, lists contribute
    their items, ints are yielded as their decimal string form, and
    strings are yielded unchanged. Values of any other type (for example
    ``None``) produce nothing.

    :param obj: A dict, list, int, str, or any other value.
    :return: A generator of ``str`` objects in traversal order.
    """
    if isinstance(obj, dict):
        # Feed both the keys and the values back into the function.
        for key, value in obj.items():
            yield from emit_strings(key)
            yield from emit_strings(value)
    elif isinstance(obj, list):
        for item in obj:
            yield from emit_strings(item)
    elif isinstance(obj, int):
        # Stringify the value itself; ``str(int)`` would yield the repr of
        # the ``int`` type rather than the number.
        yield str(obj)
    elif isinstance(obj, str):
        yield obj
def iter_reporters():
    """Walk REPORTERS and yield one tuple per reporter entry.

    Each yielded tuple is ``(key, entries, entry)`` where ``key`` is the
    REPORTERS key, ``entries`` is the full list stored under that key, and
    ``entry`` is one element of that list.
    """
    for abbreviation, entries in REPORTERS.items():
        yield from ((abbreviation, entries, entry) for entry in entries)
def iter_editions():
    """Yield ``(edition_key, edition)`` pairs for every reporter entry.

    The pairs come from the "editions" mapping of each entry produced by
    ``iter_reporters``.
    """
    for _abbreviation, _entries, reporter in iter_reporters():
        yield from reporter["editions"].items()
class ConstantsTest(TestCase):
def test_any_keys_missing_editions(self):
"""Have we added any new reporters that lack a matching edition?"""
for reporter_abbv, reporter_list, reporter_data in iter_reporters():
self.assertIn(
reporter_abbv,
reporter_data["editions"],
msg="Could not find edition for key: %s" % reporter_abbv,
)
def test_for_variations_mapping_to_bad_keys(self):
"""Do we have a variation that maps to a key that doesn't exist in the
first place?
"""
for variations in VARIATIONS_ONLY.values():
for variation in variations:
self.assertIn(
EDITIONS[variation],
REPORTERS.keys(),
msg="Could not map variation to a valid reporter: %s"
% variation,
)
def test_basic_names_to_editions(self):
"""Do we get something like we expected in the NAME_TO_EDITION var?"""
self.assertEqual(
["A.", "A.2d", "A.3d"], NAMES_TO_EDITIONS["Atlantic Reporter"]
)
def test_editions_ordering(self):
"""Test Ill. App., where we don't have good start dates."""
self.assertEqual(
["Ill. App.", "Ill. App. 2d", "Ill. App. 3d"],
NAMES_TO_EDITIONS["Illinois Appellate Court Reports"],
)
def test_that_all_dates_are_converted_to_dates_not_strings(self):
"""Do we properly make the ISO-8601 date strings into Python dates?"""
# for reporter_abbv, reporter_list, reporter_data in iter_reporters():
for e_name, e_dates in iter_editions():
# e_name == "A. 2d"
# e_dates == {
# "end": "1938-12-31T00:00:00",
# "start": "1885-01-01T00:00:00"
# }
for key in ["start", "end"]:
is_date_or_none = (
isinstance(e_dates[key], datetime.datetime)
or e_dates[key] is None
)
self.assertTrue(
is_date_or_none,
msg=(
"%s dates in the reporter '%s' appear to be "
"coming through as '%s'"
% (key, e_name, type(e_dates[key]))
),
)
if key == "start":
start_is_not_none = e_dates[key] is not None
self.assertTrue(
start_is_not_none,
msg=(
"Start date in reporter '%s' appears to "
"be None, not 1750" % e_name
),
)
def test_all_reporters_have_valid_cite_type(self):
"""Do all reporters have valid cite_type values?"""
for reporter_abbv, reporter_list, reporter_data in iter_reporters():
self.assertIn(
reporter_data["cite_type"],
VALID_CITE_TYPES,
"%s did not have a valid cite_type value" % reporter_abbv,
)
def test_all_required_keys_no_extra_keys(self):
"""Are all required keys present? Are there any keys present that
shouldn't be?
"""
required_fields = [
"cite_type",
"editions",
"mlz_jurisdiction",
"name",
"variations",
]
optional_fields = [
"cite_format",
"publisher",
"notes",
"href",
"regexes",
"examples",
]
all_fields = required_fields + optional_fields
for reporter_abbv, reporter_list, reporter_data in iter_reporters():
# All required fields present?
for required_field in required_fields:
try:
reporter_data[required_field]
except KeyError:
self.fail(
"Reporter '%s' lacks required field '%s'"
% (reporter_abbv, required_field)
)
# No extra fields?
for k in reporter_data.keys():
self.assertIn(
k,
all_fields,
"Reporter '%s' has an unknown field '%s'"
% (reporter_abbv, k),
)
# No empty string values?
for k, v in reporter_data.items():
if isinstance(v, str):
self.assertTrue(
v != "",
msg="Field '%s' is empty in reporter '%s'"
% (k, reporter_abbv),
)
def test_no_variation_is_same_as_key(self):
"""Are any variations identical to the keys they're supposed to be
variations of?
"""
for variation, keys in VARIATIONS_ONLY.items():
for key in keys:
self.assertNotEqual(
variation,
key,
"The variation '%s' is identical to the key it's supposed "
"to be a variation of." % variation,
)
def test_fields_tidy(self):
"""Do fields have any messiness?
For example:
- some punctuation is not allowed in some keys
- spaces at beginning/end not allowed
"""
def cleaner(s):
return re.sub(r"[^ 0-9a-zA-Z.,\-'&()\[\]]", "", s.strip())
msg = "Got bad punctuation in: %s"
for reporter_abbv, reporter_list, reporter_data in iter_reporters():
self.assertEqual(
reporter_abbv, cleaner(reporter_abbv), msg=msg % reporter_abbv
)
for k in reporter_data["editions"].keys():
self.assertEqual(cleaner(k), k, msg=msg % k)
for k, v in reporter_data["variations"].items():
self.assertEqual(cleaner(k), k, msg=msg % k)
self.assertEqual(cleaner(v), v, msg=msg % v)
for s in emit_strings(REPORTERS):
self.assertEqual(
s.strip(), s, msg="Fields needs whitespace stripped: '%s'" % s
)
def test_nothing_ends_before_it_starts(self):
"""Do any editions have end dates before their start dates?"""
for k, edition in iter_editions():
if edition["start"] and edition["end"]:
self.assertLessEqual(
edition["start"],
edition["end"],
msg="It appears that edition %s ends before it "
"starts." % k,
)
def test_json_format(self):
"""Does format of reporters.json match json.dumps(json.loads(), sort_keys=True)? """
for file_name in ("reporters.json", "regexes.json"):
with self.subTest(file_name=file_name):
json_path = (
Path(__file__).parent / "reporters_db" / "data" / file_name
)
json_str = json_path.read_text()
reformatted = json.dumps(
json.loads(json_str),
indent=4,
ensure_ascii=False,
sort_keys=True,
)
reformatted += "\n"
if json_str != reformatted:
if os.environ.get("FIX_JSON"):
json_path.write_text(reformatted)
else:
diff = context_diff(
json_str.splitlines(),
reformatted.splitlines(),
fromfile="reporters.json",
tofile="expected.json",
)
self.fail(
("%s needs reformatting. " % file_name)
+ "Run with env var FIX_JSON=1 to update the file automatically. "
+ "Diff of actual vs. expected:\n"
+ "\n".join(diff)
)
def test_regexes(self):
"""Do custom regexes and examples match up?"""
for reporter_abbv, reporter_list, reporter_data in iter_reporters():
examples = reporter_data.get("examples", [])
matched_examples = set()
custom_regexes = {}
# check that each custom regex matches at least one example
for edition_abbv, edition in reporter_data["editions"].items():
if not edition.get("regexes"):
continue
with self.subTest(
"Check edition regexes", edition=edition_abbv
):
for edition_regex in edition["regexes"]:
full_regex = recursive_substitute(
edition_regex, REGEX_VARIABLES
)
regexes = substitute_editions(
full_regex,
edition_abbv,
reporter_data["variations"],
)
custom_regexes[edition_regex] = regexes
has_match = False
for example in examples:
for regex in regexes:
if re.match(regex + "$", example):
has_match = True
matched_examples.add(example)
break
if not has_match:
try:
import exrex
candidate = "Possible examples: %s" % [
exrex.getone(regexes[0], limit=3)
for _ in range(10)
]
except ImportError:
candidate = "Run 'pip install exrex' to generate a candidate example"
self.fail(
"Reporter '%s' has no match in 'examples' for custom regex '%s'.\nExpanded regexes: %s.\n%s"
% (
reporter_abbv,
edition_regex,
regexes,
candidate,
)
)
# check that each example is matched by at least one regex
if custom_regexes:
with self.subTest(
"Check all examples matched by custom regex",
reporter=reporter_abbv,
):
self.assertEqual(
set(examples),
matched_examples,
"Not all examples matched. If custom regexes are provided, all examples should match. Regexes tried: %s"
% custom_regexes,
)
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    from unittest import main as run_tests

    run_tests()