/
check_capitalization.py
executable file
·431 lines (348 loc) · 21.6 KB
/
check_capitalization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""check_capitalization.py, by Patrick Mooney; 9 December 2016
This is a script to find capitalized words in the middle of sentences that
are not proper nouns (or approved other capitalized words). It also makes some
attempt to detect other capitalization problems. It goes through a text,
sentence by sentence, asking the user whether each suspicious word should be
capitalized. If not, it converts that word to lowercase. When it has finished, it
writes the modified text back to the same file, i.e. it modifies the input file
in-place. It is primarily intended to check the output of my poetry_to_prose.py
script and was originally developed in order to facilitate the processing of a
complete edition of Shakespeare for my automated text blog Ulysses Redux.
Usage:
./check_capitalization.py [options] -i FILE
Options:
-i FILE, --input FILE
Specify the file to check. You may only process one file at a time with
this script. If you do not specify a file, the script will ask you which
file you want it to process.
-l WORDLIST, --list WORDLIST
Specify an additional list of words that are allowed to be capitalized
without asking. If you add words to the list during a run of
check_capitalization.py, the program will offer to overwrite the original
file. This file is a simple text file containing one lowercase word per
line. Don't edit this file during a run of the program; if you do, your
changes will be overwritten when check_capitalization.py ends.
-h, --help
Print this help message, then quit.
This script requires that NLTK be installed, because it relies on NLTK for a
lot of the work it does. See http://www.nltk.org/.
The most recent version of this script is available at:
https://github.com/patrick-brian-mooney/python-personal-library/blob/master/check_capitalization.py
This program is copyright 2016-21 by Patrick Mooney; it is licensed under the
GPL v3 or, at your option, any later version. See the file LICENSE.md for a
copy of this license.
"""
import getopt
import pprint
import os
import string
import sys
import typing
from collections import OrderedDict
from pathlib import Path
import nltk # nltk.org
import file_utils # https://github.com/patrick-brian-mooney/python-personal-library/
import multi_choice_menu # Same source.
import text_handling as th # Same.
# ---- Module configuration ---------------------------------------------------
# Usually, it's helpful to set this to True if NLTK is doing a good job of
# finding the beginnings of sentences.
always_capitalize_sentence_beginnings = True

# Global list of always-capitalize words. Or leave empty not to use a global list.
always_capitalize_list_filename = Path('/python-library/always_capitalize_list')
# File listing words allowed to begin with an apostrophe.
apostrophe_words_filename = Path('/python-library/apostrophe_words')
default_filename = None         # Fill this in with a filename to validate that file

# ---- Mutable module state ---------------------------------------------------
the_lines = []
always_capitalize_list, original_always_capitalize_list = [], []
apostrophe_words, original_apostrophe_words = [], []

# Additional words that are allowed to be capitalized mid-sentence.
# These need to be represented in lowercase so the comparison works!
allowed_capitalized_words = ("i", "i'll",
                             "i’ll", "i'd",
                             "i’d", "i'm", "i’m", "i've", "i’ve")

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Every ASCII punctuation mark except the straight apostrophe, plus a dash and
# curly quotes. The string is built in a fixed (sorted) order rather than by
# joining a set: other code in this file tests tokens with ``w in punc``, which
# is a substring test, so a character order that varies from run to run would
# make the result for multi-character tokens vary too.
punc = ''.join(sorted(set(string.punctuation) - {"'"})) + '—‘“”'

text_source_file_changed = False        # Have we made any changes to the source file this run?
def puncstrip(w: str) -> str:
    """Strip punctuation from both ends of W and return the result.

    Trailing straight single quotes are removed first; after that, every
    character found in the module-level PUNC string is stripped from both
    ends. A straight single quote at the *beginning* of the word is left
    alone by this function, because there it might be an apostrophe.
    """
    without_trailing_quotes = w.rstrip("'")
    return without_trailing_quotes.strip(punc)
def comparative_form(w: str) -> str:
    """Return a standardized form of W for the purpose of comparing words for
    equality.

    The steps, in order: strip whitespace from both ends; strip leading and
    trailing punctuation with puncstrip(); strip whitespace again; strip
    punctuation again; casefold the result. Finally, if the result begins with
    an apostrophe, the apostrophe is normalized:

      * if the word appears in the global list APOSTROPHE_WORDS, its first
        character is replaced by the unambiguous form "’";
      * otherwise the leading apostrophe is dropped, on the assumption that it
        is really an opening quote mark -- except in the degenerate case where
        the word *is* just an apostrophe, in which case "’" is returned.

    :param w: the word to take the comparative form of
    :return: the comparative form of the word
    """
    normalized = puncstrip(puncstrip(w.strip()).strip()).casefold()

    if not th.begins_with_apostrophe(normalized):
        return normalized

    if normalized in apostrophe_words:
        # Allowed to begin with an apostrophe: keep it, in unambiguous form.
        return "’" + normalized[1:]

    if len(normalized) > 1:
        return normalized[1:]

    return "’"
def reassemble_sentence(sentence_list: typing.List[typing.Tuple[str, str]]) -> str:
    """Turn a tagged sentence back into a single string.

    SENTENCE_LIST is a list of (word, POS) tuples. Each word is preceded by a
    space unless the word is found in the module-level PUNC string, so the
    result resembles the original sentence, though the spacing may differ (in
    particular, the result usually starts with a space).
    """
    pieces = []
    for token, _pos in sentence_list:
        if token not in punc:           # Add space, except before punctuation
            pieces.append(' ')
        pieces.append(token)
    return ''.join(pieces)
def check_word_capitalization(tagged_sentence: typing.List[typing.Tuple[str, str]],
                              word_number: int,
                              allow_always_correct: bool,
                              # The rest of these parameters are just in case we have to save while quitting in the
                              # middle of the run. They can be None; if either is, quitting is not an option that's
                              # offered to the user.
                              the_lines: typing.Union[typing.List[str], None] = None,
                              working_filename: typing.Union[Path, None] = None,
                              ) -> bool:
    """Ask the user whether the capitalization of word number WORD_NUMBER (zero-based)
    in TAGGED_SENTENCE is wrong. The "tagged" in TAGGED_SENTENCE means "POS-tagged
    by NLTK": it is a list of (word, POS) tuples.

    If ALLOW_ALWAYS_CORRECT is True, the user is given the option to always
    capitalize this word; otherwise, the user is not given this option.

    If both THE_LINES and WORKING_FILENAME are supplied, the user is also offered
    the option to quit; choosing it calls save_files() and exits the program.

    Returns True if the caller should flip the word's capitalization (capitalize
    it if it is not capitalized, decapitalize it if it is), False if the word
    should be left as it is. Words already on the always-capitalize list are
    decided without asking: they need changing only when they are not currently
    capitalized.

    Side effects: may append to the global lists ALWAYS_CAPITALIZE_LIST and
    APOSTROPHE_WORDS, and may replace TAGGED_SENTENCE[WORD_NUMBER] when the user
    asks for an initial straight quote to be turned into an opening quote.
    """
    if working_filename:
        assert isinstance(working_filename, Path)

    the_word = tagged_sentence[word_number][0]
    currently_capitalized = th.is_capitalized(the_word)

    if comparative_form(the_word) in always_capitalize_list:
        # Known always-capitalized word: it is "wrong" only if it is NOT capitalized.
        # (Returning True unconditionally made callers lowercase correct words.)
        return not currently_capitalized

    # First, reassemble the sentence, except capitalize the entire word whose capitalization is in question
    context_sentence = ''
    for index, (w, _) in enumerate(tagged_sentence):
        if index == word_number:
            w = w.upper()
        context_sentence = "%s%s" % (context_sentence, w) if w in punc else "%s %s" % (context_sentence, w)

    print()
    verb = "is" if currently_capitalized else "is not"
    question = 'POSSIBLE ERROR DETECTED: the word "%s" %s capitalized. Is this wrong?' % (comparative_form(the_word), verb)
    th.print_indented(question, 2)
    print()
    th.print_indented('CONTEXT: %s\n' % context_sentence, 2)

    # The comparative form with any leading/trailing apostrophes removed.
    bare_form = comparative_form(the_word).strip("’'")
    not_an_apostrophe_word = bare_form not in apostrophe_words

    the_menu = OrderedDict([])
    the_menu['Y'] = ("Decapitalize" if currently_capitalized else "Capitalize") + " this word"
    the_menu['N'] = 'Leave this word as-is'
    if allow_always_correct:
        the_menu['A'] = "Always capitalize this word"
    if th.begins_with_apostrophe(the_word) and not_an_apostrophe_word:
        the_menu['D'] = "Allow this word to begin with an apostrophe"
    if the_word.strip().startswith("'") and not_an_apostrophe_word:
        the_menu['C'] = "Correct initial apostrophe ( ' ) to opening quote ( ‘ )"
    if the_lines and working_filename:
        the_menu['Q'] = 'Quit, with option to save training data (but not modified text)'

    choice = comparative_form(multi_choice_menu.menu_choice(the_menu, "What would you like to do?"))
    if choice == 'a':
        always_capitalize_list.append(comparative_form(the_word))
        # The word must end up capitalized: change it only if it isn't already.
        choice = "n" if currently_capitalized else "y"
    elif choice == 'q':                 # FIXME: we should really be able to save the modified source text.
        # The text file hasn't been fully reassembled yet, so we can't save it!
        # This branch only available if THE_LINES and WORKING_FILENAME were specified as parameters.
        save_files()
        print('\nQuitting ...')
        sys.exit(0)
    elif choice == "d":
        # NOTE(review): the word is stored WITH its leading character here, while the
        # menu test above looks it up WITHOUT apostrophes -- confirm which form
        # APOSTROPHE_WORDS is meant to hold.
        apostrophe_words.append(the_word[0] + bare_form)        # Add the word to the list ...
        return check_word_capitalization(tagged_sentence, word_number, allow_always_correct,
                                         the_lines=the_lines,
                                         working_filename=working_filename)      # And check again.
    elif choice == "c":
        tagged_sentence[word_number] = ("‘" + the_word.strip().lstrip("'"), tagged_sentence[word_number][1])
        return check_word_capitalization(tagged_sentence, word_number, allow_always_correct,
                                         the_lines=the_lines,
                                         working_filename=working_filename)      # And check again.

    return choice == 'y'
def correct_sentence_capitalization(s: str,
                                    working_filename: typing.Union[Path, None] = None,
                                    the_lines: typing.Union[typing.List[str], None] = None) -> str:
    """Return a corrected version of S, the sentence that was passed in.

    S is split on whitespace and POS-tagged by NLTK; each word is then checked:

      * any uncapitalized word on the always-capitalize list is capitalized;
      * an uncapitalized first word is capitalized, either automatically (when
        ALWAYS_CAPITALIZE_SENTENCE_BEGINNINGS is set) or after asking the user;
      * a capitalized word later in the sentence that is not in
        ALLOWED_CAPITALIZED_WORDS is referred to check_word_capitalization(),
        and lowercased if that function says it should be changed.

    WORKING_FILENAME and THE_LINES are passed through to
    check_word_capitalization() unchanged. Sets the global
    TEXT_SOURCE_FILE_CHANGED to True whenever a word is altered.

    This is where the real work actually happens.
    """
    global text_source_file_changed

    tagged_sent = nltk.tag.pos_tag(s.split())   # This is now a list of tuples: [(word, POS), (word, POS) ...]
    for index, (word, pos) in enumerate(tagged_sent):   # POS = "part of speech."
        # First: check for problems that are independent of whether they occur in the first word of a sentence.
        if comparative_form(word) in always_capitalize_list and not th.is_capitalized(word):
            # Uncapitalized word we know should always be capitalized.
            tagged_sent[index] = (th.capitalize(tagged_sent[index][0]), pos)
            text_source_file_changed = True

        # Next, check for problems related to the first word of a sentence.
        if index == 0:
            if not th.is_capitalized(word):     # Check: should first word of sentence be capitalized?
                # allow_always_correct is a required argument; the call used to omit it, which
                # raised TypeError whenever always_capitalize_sentence_beginnings was False.
                if always_capitalize_sentence_beginnings or check_word_capitalization(
                        tagged_sentence=tagged_sent, word_number=index,
                        allow_always_correct=False, the_lines=the_lines,
                        working_filename=working_filename):
                    # Tuples are immutable, so replace the whole (word, POS) entry, keeping the POS tag.
                    tagged_sent[index] = (th.capitalize(tagged_sent[index][0]), pos)
                    text_source_file_changed = True
        # Now, check for problems that can happen only outside the first word of a sentence.
        else:
            # First: is there an unexplained capitalized word beyond the first word of the sentence?
            # The POS tag is deliberately not consulted here: NLTK is detecting proper nouns
            # in part based on capitalization.
            if th.is_capitalized(word) and (comparative_form(word) not in allowed_capitalized_words):
                if check_word_capitalization(tagged_sentence=tagged_sent, word_number=index,
                                             allow_always_correct=True, the_lines=the_lines,
                                             working_filename=working_filename):
                    tagged_sent[index] = (tagged_sent[index][0].lower(), pos)
                    text_source_file_changed = True
            elif (not th.is_capitalized(word)) and (comparative_form(word) in always_capitalize_list):
                tagged_sent[index] = (th.capitalize(tagged_sent[index][0]), pos)
                text_source_file_changed = True

    return reassemble_sentence(tagged_sent).strip()
# Two-way menu used by save_files() whenever it asks whether to overwrite a file.
save_data_menu = OrderedDict()
save_data_menu['Y'] = "Overwrite the old data"
save_data_menu['N'] = 'Cancel and lose the changes'
def save_files(the_lines: typing.Union[typing.List[str], None] = None,
               working_filename: typing.Union[Path, None] = None,
               suppress_kvetching: bool = False) -> None:
    """Give the user the option (possibly) to save the modified verified text (passed
    in as THE_LINES, to be written to WORKING_FILENAME), plus, if modified, the
    list of always-capitalize words and the list of words allowed to begin with an
    apostrophe.

    The text itself is only offered for saving if the global flag
    TEXT_SOURCE_FILE_CHANGED is set and both THE_LINES and WORKING_FILENAME were
    supplied. If no changes were made to the text, then the procedure will
    helpfully inform you of that, unless SUPPRESS_KVETCHING is True.

    Each word list is sorted in place and compared with the snapshot taken when it
    was loaded; if it differs, the user is asked whether to save it. After a
    successful save, the snapshot is refreshed so that the same changes are not
    reported again.
    """
    global apostrophe_words_filename, always_capitalize_list_filename   # semi-constant module configuration params
    global original_always_capitalize_list, original_apostrophe_words

    for path in (working_filename, apostrophe_words_filename, always_capitalize_list_filename):
        if path:
            assert isinstance(path, Path)

    if text_source_file_changed:
        if the_lines and working_filename:
            choice = comparative_form(multi_choice_menu.menu_choice(save_data_menu, 'Overwrite file "%s" with modified text?' % working_filename.name))
            if choice == 'y':
                with working_filename.open('w') as f:
                    f.writelines(the_lines)
    elif not suppress_kvetching:
        print('No changes made in this file, moving on ...\n\n')

    always_capitalize_list.sort()       # FIXME! Is this happening when called from a module?
    if always_capitalize_list != original_always_capitalize_list:
        print('\n\n')
        # The filename may be empty (no global list configured); don't touch .name in that case.
        list_name = always_capitalize_list_filename.name if always_capitalize_list_filename else '(no file chosen yet)'
        choice = comparative_form(multi_choice_menu.menu_choice(save_data_menu, 'List of always-capitalize words "%s" modified. Save new list?' %
                                                                list_name))
        if choice == 'y':
            always_capitalize_list_filename = always_capitalize_list_filename or file_utils.do_open_dialog()
            with always_capitalize_list_filename.open('w') as f:
                f.writelines(sorted({comparative_form(line) + '\n' for line in always_capitalize_list}))
            # Take a copy, not an alias: the live list is extended in place elsewhere, and an
            # alias would make every later comparison report "no changes."
            original_always_capitalize_list = always_capitalize_list.copy()

    apostrophe_words.sort()
    if apostrophe_words != original_apostrophe_words:
        print('\n\n')
        list_name = apostrophe_words_filename.name if apostrophe_words_filename else '(no file chosen yet)'
        choice = comparative_form(multi_choice_menu.menu_choice(save_data_menu, 'List of begin-with-apostrophe words "%s" modified. Save new list?' %
                                                                list_name))
        if choice == 'y':
            apostrophe_words_filename = apostrophe_words_filename or file_utils.do_open_dialog()
            with apostrophe_words_filename.open('w') as f:
                f.writelines(sorted({'’%s\n' % comparative_form(line).lstrip("’'") for line in apostrophe_words}))
            original_apostrophe_words = apostrophe_words.copy()
def print_usage(exit_code: int = 0) -> typing.NoReturn:
    """Show the module docstring as a usage message, then terminate the program.

    EXIT_CODE is handed to the operating system as the exit status; a non-zero
    value signals an error condition.
    """
    usage_text = "\n\n" + __doc__
    print(usage_text)
    raise SystemExit(exit_code)
def process_command_line() -> typing.Tuple[typing.Union[Path, None], typing.Union[Path, None]]:
    """Read the command-line options from sys.argv.

    Returns a tuple: (path to file to check, path to always-capitalize list).
    Either or both may be None if the command line does not contain these options;
    defaults can be hardcoded above, beneath the docstring, but those constants are
    not read or otherwise dealt with by this function, and no global variables are
    modified here.

    Recognized options: -i/--input FILE, -l/--list WORDLIST, -h/--help. An
    unparseable command line prints the usage message and exits with status 2.
    """
    the_filename, the_always_capitalize_list_filename = None, None
    opts = ()
    try:
        # 'list=' must be present here, or the documented --list long option is rejected.
        opts, _ = getopt.getopt(sys.argv[1:], 'l:i:h', ['help', 'input=', 'list='])
    except getopt.GetoptError:
        print_usage(2)

    for opt, param in opts:
        if opt in ('-h', '--help'):
            print_usage()
        elif opt in ('-i', '--input'):
            the_filename = Path(param)
        elif opt in ('-l', '--list'):
            print('Using always-capitalize file "%s"' % (param))
            the_always_capitalize_list_filename = Path(param)
        else:
            print('ERROR: unknown switch %s used. Exiting ...' % opt)
            sys.exit(3)
    return the_filename, the_always_capitalize_list_filename
def process_lines(lines: typing.List[str],
                  working_filename: Path) -> typing.List[str]:
    """Process LINES, a list of lines, correcting those that need correcting, and
    returning the list of corrected lines.

    Each line is treated as a paragraph: it is stripped, split into sentences by
    the module-level tokenizer, and each sentence is run through
    correct_sentence_capitalization(). The corrected sentences are joined with
    single spaces and a newline is appended.

    LINES and WORKING_FILENAME are both passed down so that the user can be
    offered the option to quit part-way through; without LINES, that option is
    never shown.
    """
    assert isinstance(working_filename, Path)

    ret = []
    for which_line in [l.strip() for l in lines]:       # Go through the text, paragraph by paragraph.
        # Build a list of corrected sentences to re-assemble when done.
        sentences = [
            correct_sentence_capitalization(sentence, working_filename=working_filename, the_lines=lines)
            for sentence in tokenizer.tokenize(which_line)      # Go through the paragraph, sentence by sentence.
        ]
        ret.append(' '.join(sentences) + '\n')
    return ret
def process_file(the_filename: Path) -> typing.List[str]:
    """Read THE_FILENAME, run its lines through process_lines(), and return the
    resulting list of lines -- the lines that SHOULD BE written back to disk.

    Nothing is written to disk here; save_files() does that.
    """
    assert isinstance(the_filename, Path)

    print("Opening: %s ..." % the_filename.name, end=" ")
    with the_filename.open('r') as source:
        raw_lines = list(source)
    line_count = len(raw_lines)
    print("successfully read %d lines. Processing...\n" % line_count)

    return process_lines(raw_lines, working_filename=the_filename)
force_debugging = False         # Do we want to force a controlled run instead of reading the command line?

if __name__ == "__main__":
    if force_debugging:
        opts = Path("""/home/patrick/Documents/programming/python_projects/LibidoMechanica/poetry_corpus/William Shakespeare: Sonnet 033"""), None
    else:
        opts = process_command_line()

    working_filename = default_filename or opts[0]
    # A list named with -l on the command line takes precedence over the hard-coded
    # default; with the operands the other way round, the default (which is normally
    # set) always won and -l was silently ignored.
    always_capitalize_list_filename = opts[1] or always_capitalize_list_filename

    try:                # If an auto-capitalize list was specified, load it
        if always_capitalize_list_filename:
            with always_capitalize_list_filename.open('r') as skipfile:
                always_capitalize_list = sorted({comparative_form(line) for line in skipfile.readlines()})
    except Exception as errrr:          # Best effort: warn and carry on without the list.
        print("WARNING: unable to open always-capitalize file %s! The system said: %s" % (always_capitalize_list_filename, errrr))
        print(" ... proceeding with empty list")
    original_always_capitalize_list = always_capitalize_list.copy()     # Make a shallow copy of whatever we start with.

    try:                # If words-with-apostrophes file was specified, load it
        if apostrophe_words_filename:
            with apostrophe_words_filename.open('r') as skipfile:
                apostrophe_words = sorted({comparative_form(line).lstrip('’') for line in skipfile.readlines()})
    except Exception as errrr:          # Best effort: warn and carry on without the list.
        print("WARNING: unable to open apostrophe file %s! The system said: %s" % (apostrophe_words_filename, errrr))
        print(" ... proceeding with empty list")
    original_apostrophe_words = apostrophe_words.copy()

    # No file named anywhere? Ask the user to pick one.
    working_filename = working_filename or file_utils.do_open_dialog()
    if not working_filename:
        print("No file to process!")
        sys.exit(0)

    the_lines = process_file(working_filename)
    print('\n\n\nEntire file processed.')
    save_files(the_lines=the_lines, working_filename=working_filename)  # FIXME: don't reference globals!
    print("All done!\n\n")