forked from redshiftzero/text-extraction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_extract.py
executable file
·374 lines (316 loc) · 11.3 KB
/
text_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/usr/bin/env python
import os
import sys
import csv
import math
import warnings
import click
import dj_database_url
import django
import scrapelib
from django.contrib.postgres.search import SearchVector
from django.db import transaction
from django.db.models import Count
from extract.utils import jid_to_abbr, abbr_to_jid
from extract import get_extract_func, DoNotDownload, CONVERSION_FUNCTIONS
# disable SSL validation and ignore warnings
# NOTE(review): verify=False skips certificate checks for every scrape —
# presumably some state sites have broken TLS; confirm before tightening
scraper = scrapelib.Scraper(verify=False)
scraper.user_agent = "Mozilla"
warnings.filterwarnings("ignore", module="urllib3")
# maps a version's media_type to the file extension used for files under raw/
MIMETYPES = {
    "application/pdf": "pdf",
    "text/html": "html",
    "application/msword": "doc",
    "application/rtf": "rtf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
}
def init_django():
    """Configure Django from $DATABASE_URL and initialize the ORM.

    Must be called before importing any opencivicdata model.
    """
    from django.conf import settings

    db_url = os.environ.get("DATABASE_URL", "postgis://localhost/openstatesorg")
    settings.configure(
        DATABASES={"default": dj_database_url.parse(db_url)},
        INSTALLED_APPS=("opencivicdata.core", "opencivicdata.legislative"),
    )
    django.setup()
def download(version):
    """Download one bill version into raw/ and return (filename, bytes).

    Reuses the file on disk when it already exists.  Returns (None, None)
    when the fetch fails (a warning is printed).

    version: dict with jurisdiction_id, media_type, session, identifier,
             note, and url keys (a row from the sample CSV).
    """
    abbr = jid_to_abbr(version["jurisdiction_id"])
    ext = MIMETYPES[version["media_type"]]
    filename = f'raw/{abbr}/{version["session"]}-{version["identifier"]}-{version["note"]}.{ext}'
    # BUG FIX: str.replace returns a new string; the original call discarded
    # the result, so '#' characters were never sanitized out of the path
    filename = filename.replace("#", "__")
    if not os.path.exists(filename):
        # exist_ok covers the race where another run creates the directory
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        try:
            _, resp = scraper.urlretrieve(version["url"], filename)
        except Exception:
            click.secho("could not fetch " + version["url"], fg="yellow")
            return None, None
        return filename, resp.content
    else:
        with open(filename, "rb") as f:
            return filename, f.read()
def extract_to_file(filename, data, version):
    """Extract text from raw bytes and write it under text/, mirroring raw/.

    Returns a (text_filename, n_chars) pair:
      - (DoNotDownload, 0) when this version is deliberately skipped,
      - (None, 0) when extraction raised or produced no text,
      - (path, length) on success.
    """
    try:
        func = get_extract_func(version)
        if func == DoNotDownload:
            return DoNotDownload, 0
        else:
            text = func(data, version)
    except Exception as e:
        click.secho(f"exception processing {version['url']}: {e}", fg="red")
        text = None
    if not text:
        return None, 0
    # mirror the raw/ directory layout under text/
    text_filename = filename.replace("raw/", "text/") + ".txt"
    # exist_ok=True replaces the old try/except OSError: pass idiom
    os.makedirs(os.path.dirname(text_filename), exist_ok=True)
    with open(text_filename, "w") as f:
        f.write(text)
    return text_filename, len(text)
def update_bill(bill):
    """Extract text for one bill and store it as a SearchableBill row.

    Tries each download link of the bill's latest version until one yields
    text.  A row is created even when every link fails (is_error=True) so
    the bill is not retried endlessly.  Returns the new SearchableBill id.
    """
    from opencivicdata.legislative.models import SearchableBill
    try:
        latest_version = bill.versions.order_by("-date", "-note").prefetch_related("links")[0]
        links = latest_version.links.all()
    except IndexError:
        # bill has no versions at all; fall through with no links to try
        links = []
    # check if there's an old entry and we can use it
    # if bill.searchable:
    #     if bill.searchable.version_id == latest_version.id and not bill.searchable.is_error:
    #         return  # nothing to do
    #     bill.searchable.delete()
    # iterate through versions until we extract some good text
    is_error = True
    raw_text = ""
    link = None
    for link in links:
        metadata = {
            "url": link.url,
            "media_type": link.media_type,
            "title": bill.title,
            "jurisdiction_id": bill.legislative_session.jurisdiction_id,
        }
        func = get_extract_func(metadata)
        if func == DoNotDownload:
            continue
        try:
            data = scraper.get(link.url).content
        except Exception:
            # fetch failed; try the next link
            continue
        # TODO: clean up whitespace
        try:
            raw_text = func(data, metadata)
        except Exception as e:
            click.secho(f"exception processing {metadata['url']}: {e}", fg="red")
        if raw_text:
            is_error = False
            break
    # NOTE(review): on total failure `link` is whichever link was tried last
    # (or None), so version_link may point at a link that yielded no text
    sb = SearchableBill.objects.create(
        bill=bill,
        version_link=link,
        all_titles=bill.title,  # TODO: add other titles
        raw_text=raw_text,
        is_error=is_error,
        search_vector="",  # populated later by reindex()
    )
    return sb.id
@click.group()
def cli():
    # top-level click group; subcommands attach themselves via @cli.command
    pass
def _resample(state, n=50):
    """Write a random sample of *n* bill versions for *state* to raw/<state>.csv.

    One CSV row is written per download link, so the file may contain more
    than *n* rows.
    """
    init_django()
    from opencivicdata.legislative.models import BillVersion
    versions = BillVersion.objects.filter(
        bill__legislative_session__jurisdiction_id=abbr_to_jid(state)
    ).order_by("?")[:n]
    count = 0
    fieldnames = [
        "id",
        "session",
        "identifier",
        "title",
        "jurisdiction_id",
        "media_type",
        "url",
        "note",
    ]
    # newline="" per the csv module docs — without it the writer emits an
    # extra blank row between records on Windows
    with open(f"raw/{state}.csv", "w", newline="") as outf:
        out = csv.DictWriter(outf, fieldnames=fieldnames)
        out.writeheader()
        for v in versions:
            for link in v.links.all():
                out.writerow(
                    {
                        "id": v.id,
                        "session": v.bill.legislative_session.identifier,
                        "jurisdiction_id": v.bill.legislative_session.jurisdiction_id,
                        "identifier": v.bill.identifier,
                        "title": v.bill.title,
                        "url": link.url,
                        "media_type": link.media_type,
                        "note": v.note,
                    }
                )
                count += 1
    click.secho(f"wrote new sample csv with {count} records")
@cli.command(help="obtain a sample of bills to extract text from")
@click.argument("state")
@click.option("--resample/--no-resample", default=False)
@click.option("--quiet/--no-quiet", default=False)
def sample(state, resample, quiet):
    """Download and extract every version in raw/<state>.csv; return 1 on any miss."""
    if resample:
        _resample(state)
    count = missing = empty = skipped = 0
    with open(f"raw/{state}.csv") as f:
        reader = csv.DictReader(f)
        for version in reader:
            count += 1
            filename, data = download(version)
            if filename is None:
                missing += 1
                continue
            text_filename, n_bytes = extract_to_file(filename, data, version)
            if text_filename == DoNotDownload:
                skipped += 1
            elif not n_bytes:
                empty += 1
            if not quiet:
                click.secho(f"(unknown) => {text_filename} ({n_bytes} bytes)")
    # decide and print result
    # any missing or empty extraction makes the whole state red for now
    status = "red" if (empty or missing) else "green"
    click.secho(
        f"{state}: processed {count}, {skipped} skipped, {missing} missing, {empty} empty",
        fg=status,
    )
    return 1 if status == "red" else 0
@cli.command(help="run sample on all states, used for CI")
@click.pass_context
def test(ctx):
    """Run `sample` quietly for every configured state; exit with the failure count."""
    states = sorted(CONVERSION_FUNCTIONS)
    click.secho(f"testing {len(states)} states...", fg="white")
    failures = sum(ctx.invoke(sample, state=s, quiet=True) for s in states)
    sys.exit(failures)
@cli.command(help="print a status table showing the current condition of states")
def status():
    """Print a table of bill counts with colorized missing/error percentages."""
    init_django()
    from opencivicdata.legislative.models import Bill

    click.secho("state | bills | missing | errors ", fg="white")
    click.secho("===================================", fg="white")
    for state in sorted(CONVERSION_FUNCTIONS):
        bills = Bill.objects.filter(legislative_session__jurisdiction_id=abbr_to_jid(state))
        missing_pct = bills.filter(searchable__isnull=True).count()
        error_pct = bills.filter(searchable__is_error=True).count()
        total = bills.count()
        mscolor = errcolor = "green"
        if missing_pct > 0:
            # raw count becomes a rounded-up percentage of all bills
            missing_pct = math.ceil(missing_pct / total * 100)
            mscolor = "red" if missing_pct > 1 else "yellow"
        if error_pct > 0:
            error_pct = math.ceil(error_pct / total * 100)
            errcolor = "red" if error_pct > 5 else "yellow"
        click.echo(
            f"{state:5} | {total:7} | "
            + click.style(f"{missing_pct:6}%", fg=mscolor)
            + " | "
            + click.style(f"{error_pct:6}%", fg=errcolor)
        )
@cli.command(help="rebuild the search index objects for a given state")
@click.argument("state")
def reindex_state(state):
    """Recompute the search vector for every SearchableBill in *state*."""
    init_django()
    from opencivicdata.legislative.models import SearchableBill

    qs = SearchableBill.objects.filter(
        bill__legislative_session__jurisdiction_id=abbr_to_jid(state)
    )
    ids = list(qs.values_list("id", flat=True))
    print(f"reindexing {len(ids)} bills for state")
    reindex(ids)
@cli.command(help="update the saved bill text in the database")
@click.argument("state")
@click.option("-n", default=None)
@click.option("--clear-errors/--no-clear-errors", default=False)
@click.option("--checkpoint", default=500)
def update(state, n, clear_errors, checkpoint):
    """Extract and save text for bills that lack a search entry.

    state: state abbreviation, or "all" for every jurisdiction.
    n: optional cap on how many bills to process (arrives as a string).
    clear_errors: delete previous error rows first (specific states only).
    checkpoint: reindex + commit after this many updated bills.
    """
    init_django()
    from opencivicdata.legislative.models import Bill, SearchableBill
    # print status within checkpoints
    # NOTE(review): checkpoint / 5 is a float, so the `%` below is float
    # modulo — exact for the default 500, confirm for odd checkpoint values
    status_num = checkpoint / 5
    if state == "all":
        all_bills = Bill.objects.all()
    else:
        all_bills = Bill.objects.filter(legislative_session__jurisdiction_id=abbr_to_jid(state))
    if clear_errors:
        if state == "all":
            print("--clear-errors only works with specific states, not all")
            return
        errs = SearchableBill.objects.filter(bill__in=all_bills, is_error=True)
        print(f"clearing {len(errs)} errors")
        errs.delete()
    missing_search = all_bills.filter(searchable__isnull=True)
    if state == "all":
        # safety valve: refuse bulk runs where any one state has too many
        # missing bills, to avoid an unbounded scrape
        MAX_UPDATE = 500
        aggregates = missing_search.values("legislative_session__jurisdiction__name").annotate(
            count=Count("id")
        )
        bail = False
        for agg in aggregates:
            state_name = agg["legislative_session__jurisdiction__name"]
            if agg["count"] > MAX_UPDATE:
                click.secho(f"Too many bills to update for {state_name}: {agg['count']}", fg="red")
                bail = True
        if bail:
            sys.exit(1)
        print(f"{len(missing_search)} missing, updating")
    else:
        print(f"{state}: {len(all_bills)} bills, {len(missing_search)} without search results")
    if n:
        missing_search = missing_search[: int(n)]
    else:
        n = len(missing_search)
    ids_to_update = []
    updated_count = 0
    # going to manage our own transactions here so we can save in chunks
    transaction.set_autocommit(False)
    for b in missing_search:
        ids_to_update.append(update_bill(b))
        updated_count += 1
        if updated_count % status_num == 0:
            print(f"{state}: updated {updated_count} out of {n}")
        if updated_count % checkpoint == 0:
            # flush this chunk: build search vectors, then commit
            reindex(ids_to_update)
            transaction.commit()
            ids_to_update = []
    # be sure to reindex final set
    reindex(ids_to_update)
    transaction.commit()
    transaction.set_autocommit(True)
def reindex(ids_to_update):
    """Rebuild the weighted search_vector column for the given SearchableBill ids."""
    from opencivicdata.legislative.models import SearchableBill

    print(f"updating {len(ids_to_update)} search vectors")
    # titles weigh more (A) than the body text (B) in search ranking
    vector = SearchVector("all_titles", weight="A", config="english") + SearchVector(
        "raw_text", weight="B", config="english"
    )
    updated = SearchableBill.objects.filter(id__in=ids_to_update).update(search_vector=vector)
    print(f"updated {updated}")
if __name__ == "__main__":
    # script entry point: dispatch to one of the click subcommands
    cli()