-
Notifications
You must be signed in to change notification settings - Fork 0
/
vocalize.py
204 lines (185 loc) · 8.19 KB
/
vocalize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import blib, pywikibot
from blib import msg, getparam, addparam
import arabiclib
import ar_translit
# Try to vocalize ARABIC using its transliteration LATIN. Return the
# vocalized Arabic text when vocalization succeeds AND the result differs
# from ARABIC; return False in every other case (vocalization raised an
# error, produced nothing, or produced text identical to ARABIC).
# PAGETITLE, INDEX, TEMPLATE and PARAM only label the status messages.
def do_vocalize_param(pagetitle, index, template, param, arabic, latin):
    def pagemsg(text):
        # Every status line is prefixed with the page and "template.param".
        msg("Page %s %s: %s.%s: %s" % (
            index, pagetitle, template.name, param, text))

    try:
        vocalized, _ignored = ar_translit.tr_matching(
            arabic, latin, True, pagemsg)
    except Exception as exc:
        # Best effort: report the failure and treat it as "unable to
        # vocalize" instead of aborting the whole run.
        pagemsg("Trying to vocalize %s (%s): %s" % (arabic, latin, exc))
        vocalized = None

    if not vocalized:
        pagemsg("Unable to vocalize %s (Latin %s)" % (arabic, latin))
        return False

    if vocalized == arabic:
        pagemsg("No change in %s (Latin %s)" % (arabic, latin))
        return False

    pagemsg("Would replace %s with vocalized %s (Latin %s)" % (
        arabic, vocalized, latin))
    return vocalized
# Vocalize parameter PARAM of TEMPLATE using the transliteration stored in
# parameter PARAMTR, updating TEMPLATE in place on success.
#
# Return value (callers rely on all three cases):
#   False    -- PARAM is missing or empty
#   True     -- PARAM exists but was left unchanged (no transliteration,
#               vocalization failed, or result identical to existing text)
#   a string -- the new vocalized Arabic that was written into PARAM
def vocalize_param(pagetitle, index, template, param, paramtr):
    arabic = getparam(template, param)
    latin = getparam(template, paramtr)

    if not arabic:
        return False
    if not latin:
        return True

    vocalized = do_vocalize_param(
        pagetitle, index, template, param, arabic, latin)
    if not vocalized:
        return True

    # Snapshot the template text before mutating it so the log line can show
    # both the old and the new form.
    before = str(template)
    addparam(template, param, vocalized)
    msg("Page %s %s: Replaced %s with %s" % (
        index, pagetitle, before, str(template)))
    return vocalized
# Vocalize a numbered family of parameters in TEMPLATE. For PARAM "pl" this
# processes "pl", "pl2", "pl3", ... using "pltr", "pl2tr", "pl3tr", ... as
# the transliterations, and stops at the first parameter that
# vocalize_param() reports as missing (falsy result). Return the names of
# the parameters that were actually changed, for the changelog message.
def vocalize_param_chain(pagetitle, index, template, param):
    changed = []
    position = 1
    while True:
        # The first member of the chain carries no numeric suffix.
        name = param if position == 1 else param + str(position)
        outcome = vocalize_param(pagetitle, index, template, name, name + "tr")
        # A string result means the parameter was rewritten.
        if isinstance(outcome, str):
            changed.append(name)
        if not outcome:
            break
        position += 1
    return changed
# Vocalize the head param(s) for the given headword template on the given page.
# Modifies the templates in place. Return list of changed parameters, for
# use in the changelog message.
#
# PAGETITLE supplies the fallback Arabic head text (and labels messages);
# INDEX is only passed through for status messages. The returned list holds
# parameter names ("1", "head2", ...) plus the literal entry
# "split translit into multiple heads" when a comma-separated tr= was split.
#
# NOTE(review): the nesting below was reconstructed from a copy of the file
# whose indentation was lost; in particular, confirm that the "extra heads"
# loop is meant to run even when tr= is absent.
def vocalize_head(pagetitle, index, template):
    paramschanged = []
    #pagetitle = str(page.title(withNamespace=False))
    # Handle existing 1= and head from page title
    if template.has("tr"):
        # Check for multiple transliterations of head or 1. If so, split on
        # the multiple transliterations, with separate vocalized heads.
        latin = getparam(template, "tr")
        if "," in latin:
            trs = re.split(",\\s*", latin)
            # Find the first alternate head (head2, head3, ...) not already present
            i = 2
            while template.has("head" + str(i)):
                i += 1
            # tr= keeps only the first transliteration; the remaining ones
            # are moved to trN= alongside newly created headN= below.
            addparam(template, "tr", trs[0])
            if template.has("1"):
                head = getparam(template, "1")
                # for new heads, only use existing head in 1= if ends with -un (tanwīn),
                # because many of the existing 1= values are vocalized according to the
                # first transliterated entry in the list and won't work with the others
                # (U+064C is ARABIC DAMMATAN.)
                if not head.endswith("\u064C"):
                    head = pagetitle
            else:
                head = pagetitle
            # Each extra transliteration gets its own headN=/trN= pair, all
            # seeded with the same (as yet unvocalized) head text.
            for tr in trs[1:]:
                addparam(template, "head" + str(i), head)
                addparam(template, "tr" + str(i), tr)
                i += 1
            paramschanged.append("split translit into multiple heads")
        # Try to vocalize 1=
        # vocalize_param() returns a str when 1= was rewritten, True when 1=
        # exists but was left as is, and False when 1= is missing or empty.
        result = vocalize_param(pagetitle, index, template, "1", "tr")
        if isinstance(result, str):
            paramschanged.append("1")
        # If 1= not found, try vocalizing the page title and make it the 1= value
        if not result:
            arabic = str(pagetitle)
            latin = getparam(template, "tr")
            if arabic and latin:
                # "page title" is passed as the parameter name purely so the
                # status messages say what is being vocalized.
                vocalized = do_vocalize_param(pagetitle, index, template, "page title",
                    arabic, latin)
                if vocalized:
                    # Snapshot the template text for the before/after log line.
                    oldtempl = "%s" % str(template)
                    # Insert the new 1= ahead of 2= when present, otherwise
                    # ahead of tr=.
                    if template.has("2"):
                        addparam(template, "1", vocalized, before="2")
                    else:
                        addparam(template, "1", vocalized, before="tr")
                    paramschanged.append("1")
                    msg("Page %s %s: Replaced %s with %s" % (index, pagetitle,
                        oldtempl, str(template)))
    # Check and try to vocalize extra heads
    # head2/tr2, head3/tr3, ... are processed until vocalize_param() returns
    # a falsy value, i.e. the first headN= that is missing or empty.
    i = 2
    result = True
    while result:
        thisparam = "head" + str(i)
        result = vocalize_param(pagetitle, index, template, thisparam, "tr" + str(i))
        if isinstance(result, str):
            paramschanged.append(thisparam)
        i += 1
    return paramschanged
# Vocalize every non-verbal Arabic headword template found in TEXT (the
# parsed text of page PAGETITLE), editing the templates in place. Return a
# pair (TEXT, CHANGELOG), where CHANGELOG summarizes which parameters of
# which templates were changed. The changelog is also written to the log,
# even when nothing changed.
def vocalize_one_page_headwords(pagetitle, index, text):
    # Inflection parameters whose numbered chains (pl, pl2, ...) are
    # vocalized in addition to the head itself.
    chained_params = (
        "pl", "plobl", "cpl", "cplobl", "fpl", "fplobl", "f",
        "fobl", "m", "mobl", "obl", "el", "sing", "coll", "d", "dobl",
        "pauc", "cons",
    )
    actions_taken = []

    for template in text.filter_templates():
        if template.name not in arabiclib.arabic_non_verbal_headword_templates:
            continue

        changed = list(vocalize_head(pagetitle, index, template))
        for chain in chained_params:
            changed.extend(
                vocalize_param_chain(pagetitle, index, template, chain))
        if not changed:
            continue

        # Identify the template by name plus transliteration when available.
        label = (
            "%s %s" % (template.name, getparam(template, "tr"))
            if template.has("tr")
            else template.name
        )
        actions_taken.append("%s (%s)" % (', '.join(changed), label))

    changelog = "vocalize parameters: %s" % '; '.join(actions_taken)
    msg("Page %s %s: Change log = %s" % (index, pagetitle, changelog))
    return text, changelog
# Run headword vocalization over the pages of the Arabic lemma and
# non-lemma categories, from STARTFROM up to (but not including) UPTO; each
# may be a page name or a 0-based integer. Changes are saved only if SAVE is
# true; exact changes are shown if VERBOSE is true.
def vocalize_headwords(save, verbose, startFrom, upTo):
    def process_page(page, index, text):
        # Adapter from blib.do_edit's callback signature to the per-page
        # worker, which wants the page title as a string.
        title = str(page.title())
        return vocalize_one_page_headwords(title, index, text)

    # Alternative page sources (disabled):
    #for page in blib.references("Template:tracking/ar-head/head", startFrom, upTo):
    #for page in blib.references("Template:ar-nisba", startFrom, upTo):
    categories = ("Arabic lemmas", "Arabic non-lemma forms")
    for category in categories:
        for index, page in blib.cat_articles(category, startFrom, upTo):
            blib.do_edit(page, index, process_page, save=save, verbose=verbose)
# Run vocalization over link-like templates on pages from STARTFROM up to
# (but not including) UPTO; each may be a page name or a 0-based integer.
# Changes are saved only if SAVE is true; exact changes are shown if VERBOSE
# is true. CATTYPE selects the categories to examine: 'vocab', 'borrowed' or
# 'translation'. Returns whatever blib.process_links() returns.
def vocalize_links(save, verbose, cattype, startFrom, upTo):
    def process_param(pagetitle, index, pagetext, template, tlang, param, paramtr):
        # Per-parameter callback: a string from vocalize_param() means the
        # parameter was rewritten, so report it as a one-item action list;
        # any other result (True/False) is passed back unchanged.
        outcome = vocalize_param(pagetitle, index, template, param, paramtr)
        if not isinstance(outcome, str):
            return outcome
        return ["%s (%s)" % (outcome, template.name)]

    def join_actions(actions):
        # Build the changelog message from the collected actions.
        joined = '; '.join(actions)
        return "vocalize links: %s" % joined

    return blib.process_links(
        save, verbose, "ar", "Arabic", cattype,
        startFrom, upTo, process_param, join_actions)
# Script entry point: parse the command line, then vocalize either link-like
# templates (when --links is given) or headword templates (the default).
pa = blib.create_argparser("Correct vocalization and translit")
pa.add_argument("-l", "--links", action='store_true', help="Vocalize links")
pa.add_argument(
    "--cattype",
    default="borrowed",
    help="Categories to examine ('vocab', 'borrowed', 'translation')",
)
params = pa.parse_args()
startFrom, upTo = blib.parse_start_end(params.start, params.end)

if not params.links:
    vocalize_headwords(params.save, params.verbose, startFrom, upTo)
else:
    vocalize_links(params.save, params.verbose, params.cattype, startFrom, upTo)