/
rulib.py
742 lines (666 loc) · 30.7 KB
/
rulib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import unicodedata
import blib
from collections import OrderedDict
# Combining accent characters used throughout this module.
AC = "\u0301" # acute = ́
GR = "\u0300" # grave = ̀
CFLEX = "\u0302" # circumflex = ̂
DOTABOVE = "\u0307" # dot above = ̇
DOTBELOW = "\u0323" # dot below = ̣
DI = "\u0308" # diaeresis = ̈
DUBGR = "\u030F" # double grave = ̏
CARON = "\u030C" # caron = ̌
# Private-use placeholders standing in for a vowel/consonant in translit.
PSEUDOVOWEL = "\uFFF1" # pseudovowel placeholder
PSEUDOCONS = "\uFFF2" # pseudoconsonant placeholder
# non-primary accents (i.e. excluding acute) that indicate pronunciation
# (not counting diaeresis, which indicates a completely different vowel,
# and caron, which is used in translit as ě to indicate the yat vowel)
non_primary_pron_accents = GR + CFLEX + DOTABOVE + DOTBELOW + DUBGR
# accents that indicate pronunciation (not counting diaresis, which indicates
# a completely different vowel)
pron_accents = AC + non_primary_pron_accents
# all accents
accents = pron_accents + DI + CARON
# accents indicating stress (primary or otherwise)
stress_accents = AC + GR + CFLEX + DI + DUBGR
# regex for any optional accent(s)
opt_accent = "[" + accents + "]*"
# Cyrillic vowels that exist precomposed with a grave accent.
composed_grave_vowel = "ѐЀѝЍ"
vowel_no_jo = "аеиоуяэыюіѣѵүАЕИОУЯЭЫЮІѢѴҮ" + composed_grave_vowel #omit ёЁ
vowel = vowel_no_jo + "ёЁ"
cons_except_sib_c = "бдфгйклмнпрствхзьъБДФГЙКЛМНПРСТВХЗЬЪ"
sib = "шщчжШЩЧЖ"
sib_c = sib + "цЦ"
cons = cons_except_sib_c + sib_c
velar = "кгхКГХ"
uppercase = "АЕИОУЯЭЫЁЮІѢѴБДФГЙКЛМНПРСТВХЗЬЪШЩЧЖЦ"
# Translit (Latin) counterparts of the above character classes.
tr_vowel = "aeěɛiouyAEĚƐIOUY"
# any consonant in transliteration, omitting soft/hard sign
tr_cons_no_sign = "bcčdfghjklmnpqrsštvwxzžBCČDFGHJKLMNPQRSŠTVWXZŽ" + PSEUDOCONS
# any consonant in transliteration, including soft/hard sign
tr_cons = tr_cons_no_sign + "ʹʺ"
# regex for any consonant in transliteration, including soft/hard sign,
# optionally followed by any accent
tr_cons_acc_re = "[" + tr_cons + "]" + opt_accent
def decompose_acute_grave(text):
  """Return TEXT with acute/grave marks left decomposed but every other
  accented sequence composed, e.g. Latin č and ě, Cyrillic ё and й."""
  # Fully decompose, then split into runs delimited by (and keeping) the
  # acute/grave marks, recompose each run, and paste back together.
  nfd = unicodedata.normalize("NFD", str(text))
  pieces = re.split("([%s%s])" % (AC, GR), nfd)
  return "".join(unicodedata.normalize("NFC", piece) for piece in pieces)
def decompose(text):
  # Canonical decomposition used by this module: acutes/graves split off,
  # everything else composed.
  return decompose_acute_grave(text)
def recompose(text):
  # Return TEXT in fully composed (Unicode NFC) form.
  return unicodedata.normalize("NFC", text)
def assert_decomposed(text):
  # Sanity check: TEXT (translit) must not contain precomposed Latin
  # vowel + acute/grave/diaeresis characters.
  assert not re.search("[áéíóúýàèìòùỳäëïöüÿÁÉÍÓÚÝÀÈÌÒÙỲÄËÏÖÜŸ]", text)
def xlit_text(text, pagemsg, verbose=False):
  """Transliterate Russian TEXT by expanding the {{xlit}} template."""
  # The page name doesn't matter when we call {{xlit}}.
  return blib.expand_text("{{xlit|ru|%s}}" % text, "foo bar", pagemsg, verbose)
# Does a phrase of connected text need accents? We need to split by word
# and check each one.
def needs_accents(text, split_dash=False):
  """A word needs accents when it is unaccented and has more than one
  syllable.  With split_dash, hyphen-separated parts are judged
  individually, so динь-динь (all parts monosyllabic) is fine; we don't
  split on hyphens unconditionally or Али-Баба́ would "need accents"."""
  def one_word_needs_accents(w):
    if not is_unaccented(w):
      return False
    parts = w.split("-") if split_dash else [w]
    return any(not is_monosyllabic(p) for p in parts)
  return any(one_word_needs_accents(w) for w in re.split(r"\s", text))
def is_stressed(word):
  # Truthy (a match object) if WORD carries stress: an acute or diaeresis
  # accent, or ё, which is inherently stressed.  The diaeresis occurs in
  # e.g. сѣ̈дла, plural of сѣдло́.
  return re.search("[́̈ёЁ]", word)
def is_tr_stressed(word):
  """Like is_stressed() but for translit; empty/None WORD yields False.
  Works on the decomposed form so composed accents are also found."""
  return bool(word) and re.search("[́̈]", unicodedata.normalize("NFD", word))
def is_unstressed(word):
  # True if WORD has no acute/diaeresis stress mark and no ё.
  return not is_stressed(word)
def is_tr_unstressed(word):
  # Translit counterpart of is_unstressed().
  return not is_tr_stressed(word)
def is_unaccented(word):
  # True if WORD carries no stress accent at all (acute, grave, circumflex,
  # diaeresis, double grave), no ё/Ё and no composed grave vowel.
  return not re.search("[" + stress_accents + "ёЁѐЀѝЍ]", word)
def is_tr_unaccented(word):
  # Translit counterpart of is_unaccented(); checks the decomposed form.
  return not re.search("[" + stress_accents + "]", unicodedata.normalize("NFD", word))
def is_ending_stressed(word):
  # True if the last syllable bears the stress: ё, or an acute/diaeresis
  # on the last vowel.
  return (re.search("[ёЁ][^" + vowel + "]*$", word) or
    re.search("[" + vowel + "][́̈][^" + vowel + "]*$", word))
# True if any word in text has two or more stresses; don't count words like
# платёжеспосо́бность or трёхле́тний, where the first ё isn't accented
def is_multi_stressed(text):
  # Rewrite ё as е + diaeresis so the accent regex below can see it.
  text = re.sub("[ёЁ]", "е" + DI, text)
  # A true accent (acute, not diaeresis) followed later in the same
  # space/hyphen-delimited word by any accent means multiple stresses.
  return any(
    re.search("[" + vowel + "][́].*[" + vowel + "][́̈]", w)
    for w in re.split(r"[\s-]", text))
def number_of_accents(text):
  # Count all accent characters, including ё/Ё and composed grave vowels.
  return len(re.sub("[^" + accents + "ёЁѐЀѝЍ]", "", text))
def is_beginning_stressed(word):
  # True if the first syllable bears the stress: ё, or an acute on the
  # first vowel.
  return (re.search("^[^" + vowel + "]*[ёЁ]", word) or
    re.search("^[^" + vowel + "]*[" + vowel + "]́", word))
def is_nonsyllabic(word):
  # True if WORD contains no vowel at all.
  return not re.search("[" + vowel + "]", word)
# Includes non-syllabic stems such as льд-
def is_monosyllabic(word):
  """True if WORD has at most one syllable.  A trailing ъ is ignored;
  elsewhere ъ counts as syllabic in case we're called for Bulgarian."""
  syllabic = vowel + "ъЪ" # in case we're called for Bulgarian
  trimmed = re.sub("ъ$", "", word)
  return not re.search("[" + syllabic + "].*[" + syllabic + "]", trimmed)
# Includes non-syllabic stems such as lʹd-
def is_tr_monosyllabic(word):
  """Translit counterpart of is_monosyllabic(); empty/None → False."""
  if not word:
    return False
  nfd = unicodedata.normalize("NFD", word)
  return not re.search("[" + tr_vowel + "].*[" + tr_vowel + "]", nfd)
def ends_with_vowel(word):
  # True if WORD ends in a vowel, optionally followed by an acute, grave
  # or diaeresis accent.
  return re.search("[" + vowel + "][" + AC + GR + DI + "]?$", word)
# Map grave-accented characters (the combining grave, and Cyrillic letters
# precomposed with a grave) to their accentless equivalents.
grave_deaccenter = {
  GR:"", # grave accent
  "ѐ":"е", # composed Cyrillic chars w/grave accent
  "Ѐ":"Е",
  "ѝ":"и",
  "Ѝ":"И",
}
# Like grave_deaccenter, but also strips acutes and diaeresis.
deaccenter = grave_deaccenter.copy()
deaccenter[AC] = "" # acute accent
deaccenter[DI] = "" # diaeresis
def remove_grave_accents(word):
  # remove grave accents (combining, or part of a composed Cyrillic char)
  return re.sub("([" + GR + "ѐЀѝЍ])", lambda m: grave_deaccenter[m.group(1)], word)
def remove_accents(word):
  # remove pronunciation accents (not diaeresis, which marks a different
  # vowel quality)
  return re.sub("([" + pron_accents + "ѐЀѝЍ])",
    lambda m: deaccenter[m.group(1)], word)
def remove_tr_accents(word):
  """Remove pronunciation accents (not diaeresis) from translit WORD;
  empty/None passes through unchanged."""
  if not word:
    return word
  nfd = unicodedata.normalize("NFD", word)
  return unicodedata.normalize("NFC", re.sub("[" + pron_accents + "]", "", nfd))
def remove_monosyllabic_accents(word):
  """Strip accents from WORD if it is monosyllabic and not a suffix.
  Diaeresis (composed or uncomposed) is untouched: it marks a change in
  vowel quality, which still applies to monosyllables."""
  if word.startswith("-") or not is_monosyllabic(word):
    return word
  return remove_accents(word)
def remove_tr_monosyllabic_accents(word):
  """Translit counterpart of remove_monosyllabic_accents(); empty/None
  passes through.  Diaeresis is untouched (vowel-quality change applies
  even to monosyllables)."""
  if not word:
    return word
  if word.startswith("-") or not is_tr_monosyllabic(word):
    return word
  return remove_tr_accents(word)
def remove_non_primary_accents(word):
  # remove all pronunciation accents except acute (the primary stress)
  return re.sub("([" + non_primary_pron_accents + "ѐЀѝЍ])",
    lambda m: deaccenter[m.group(1)], word)
def remove_tr_non_primary_accents(word):
  """Remove all pronunciation accents except acute from translit WORD;
  empty/None passes through unchanged."""
  if not word:
    return word
  nfd = unicodedata.normalize("NFD", word)
  return unicodedata.normalize("NFC",
    re.sub("[" + non_primary_pron_accents + "]", "", nfd))
# Subfunction of split_syllables().
def combine_captures(captures):
  """CAPTURES alternates CONSONANT - VOWEL - ... - CONSONANT, where each
  CONSONANT is zero or more consonants and each VOWEL is exactly one
  vowel plus any trailing accents.  Glue them into syllables: one
  consonant capture plus the following vowel capture each, with the
  final consonant run appended to the last syllable."""
  if len(captures) == 1:
    return captures
  syllables = [captures[i] + captures[i + 1]
               for i in range(0, len(captures) - 1, 2)]
  syllables[-1] += captures[-1]
  return syllables
# NOTE: Translit must already be decomposed! See comment at top.
def split_syllables(ru, tr):
  """Split Russian text RU and translit TR (may be None) into syllables.
  Each syllable ends with vowel + accent(s), except the last, which also
  takes any trailing consonants.  Raises ValueError if RU and TR differ
  in syllable count.  Returns (rusyllables, trsyllables-or-None)."""
  rusyllables = combine_captures(
    re.split("([" + vowel + "]" + opt_accent + ")", ru))
  trsyllables = None
  if tr:
    assert_decomposed(tr)
    trsyllables = combine_captures(
      re.split("([" + tr_vowel + "]" + opt_accent + ")", tr))
    if len(rusyllables) != len(trsyllables):
      raise ValueError("Russian " + ru + " doesn't have same number of syllables as translit " + tr)
  return rusyllables, trsyllables
def split_hyphens(ru, tr):
  """Split Russian word RU and translit TR (may be None) into
  hyphen-separated components; "-".join(...) recovers the original.
  A word-final hyphen is folded into the preceding component (the only
  case where a component itself contains a hyphen).  Raises ValueError
  if RU and TR have different numbers of components."""
  def split_one(word):
    parts = word.split("-")
    if len(parts) > 1 and parts[-1] == "":
      parts[-2] += "-"
      parts.pop()
    return parts
  rucomponents = split_one(ru)
  trcomponents = None
  if tr:
    trcomponents = split_one(tr)
    if len(rucomponents) != len(trcomponents):
      raise ValueError("Russian " + ru + " doesn't have same number of hyphenated components as translit " + tr)
  return rucomponents, trcomponents
# Apply j correction, converting je to e after consonants, jo to o after
# a sibilant, ju to u after hard sibilant.
# NOTE: Translit must already be decomposed! See comment at top.
def j_correction(tr):
  for pattern in (
      "([" + tr_cons_no_sign + "]" + opt_accent + ")[Jj]([EeĚě])",
      "([žščŽŠČ])[Jj]([Oo])",
      "([žšŽŠ])[Jj]([Uu])"):
    tr = re.sub(pattern, r"\1\2", tr)
  return tr
# Like deaccenter, but additionally maps ё/Ё to plain е/Е.
destresser = deaccenter.copy()
destresser["ё"] = "е"
destresser["Ё"] = "Е"
def make_unstressed_ru(ru):
  # Remove all stress marks (acute, grave, diaeresis, composed or not)
  # from Russian-only text RU.  The character class below contains the
  # combining grave, acute and diaeresis immediately after the bracket.
  return re.sub("([̀́̈ёЁѐЀѝЍ])", lambda m: destresser[m.group(1)], ru)
# Remove all stress marks (acute, grave, diaeresis).
# NOTE: Translit must already be decomposed! See comment at top.
def make_unstressed(ru, tr=None):
  if not tr:
    return make_unstressed_ru(ru), None
  # With translit present we work syllable by syllable so that Latin o
  # is turned into e only opposite a ё.
  rusyl, trsyl = split_syllables(ru, tr)
  for i, syl in enumerate(rusyl):
    if re.search("[ёЁ]", syl):
      trsyl[i] = trsyl[i].replace("o", "e").replace("O", "E")
    rusyl[i] = make_unstressed_ru(syl)
    # Safe on translit too: only accent characters are affected.
    trsyl[i] = make_unstressed_ru(trsyl[i])
  # j correction: otherwise we'd be left with je after a consonant, etc.
  return "".join(rusyl), j_correction("".join(trsyl))
def remove_jo_ru(word):
  # Replace ё/Ё with е/Е and strip bare diaeresis in Russian-only WORD.
  # Bug fix: the original passed the `destresser` dict itself as the
  # re.sub() replacement; the replacement must be a string or a callable,
  # so that raised TypeError.  Look the matched character up instead.
  return re.sub("([̈ёЁ])", lambda m: destresser[m.group(1)], word)
# Remove diaeresis stress marks only.
# NOTE: Translit must already be decomposed! See comment at top.
def remove_jo(ru, tr=None):
  if not tr:
    return remove_jo_ru(ru), None
  # With translit present, go syllable by syllable so Latin o→e happens
  # only opposite a ё.
  rusyl, trsyl = split_syllables(ru, tr)
  for i, syl in enumerate(rusyl):
    if re.search("[ёЁ]", syl):
      trsyl[i] = trsyl[i].replace("o", "e").replace("O", "E")
    rusyl[i] = remove_jo_ru(syl)
    # Affects accents only, so it's safe to use on translit.
    trsyl[i] = make_unstressed_once_ru(trsyl[i])
  # Apply j correction or we'd end up with je after a consonant, etc.
  return "".join(rusyl), j_correction("".join(trsyl))
def make_unstressed_once_ru(word):
  # Destress the last stressed syllable (acute or diaeresis, including
  # composed ё) of Russian-only WORD; leave graves alone.
  return re.sub("([́̈ёЁ])([^́̈ёЁ]*)$", lambda m: destresser[m.group(1)] + m.group(2), word, 1)
def map_last_hyphenated_component(fn, ru, tr):
  """Apply FN — which takes and returns a (ru, tr) pair — to the last
  hyphen-separated component of RU/TR; with no hyphen, apply it to the
  whole string."""
  if "-" not in ru:
    return fn(ru, tr)
  # Split into components and transform only the final one.
  rucomponents, trcomponents = split_hyphens(ru, tr)
  lastru, lasttr = fn(rucomponents[-1], trcomponents and trcomponents[-1] or None)
  rucomponents[-1] = lastru
  if trcomponents:
    trcomponents[-1] = lasttr
    tr = "-".join(trcomponents)
  return "-".join(rucomponents), tr
# Make last stressed syllable (acute or diaeresis) unstressed; leave
# graves alone; if NOCONCAT, return the individual syllable lists.
# NOTE: Translit must already be decomposed! See comment at top.
def make_unstressed_once_after_hyphen_split(ru, tr=None, noconcat=False):
  if not tr:
    return make_unstressed_once_ru(ru), None
  # With translit present, work syllable by syllable, as in
  # make_unstressed().
  rusyl, trsyl = split_syllables(ru, tr)
  # Walk backwards to find the last stressed syllable.
  for i in range(len(rusyl) - 1, -1, -1):
    if is_stressed(rusyl[i]):
      if re.search("[ёЁ]", rusyl[i]):
        trsyl[i] = trsyl[i].replace("o", "e").replace("O", "E")
      rusyl[i] = make_unstressed_once_ru(rusyl[i])
      # Affects accents only, so safe on translit.
      trsyl[i] = make_unstressed_once_ru(trsyl[i])
      break
  if noconcat:
    return rusyl, trsyl
  # j correction: otherwise je would remain after a consonant, etc.
  return "".join(rusyl), j_correction("".join(trsyl))
# Make last stressed syllable (acute or diaeresis) to the right of any hyphen
# unstressed (unless the hyphen is word-final); leave graves alone. We don't
# destress a syllable to the left of a hyphen unless the hyphen is word-final
# (i.e. a prefix). Otherwise e.g. the accents in the first part of words like
# ко́е-како́й and а́льфа-лу́ч won't remain.
# Returns a (ru, tr) pair; tr is None when no translit was passed.
# NOTE: Translit must already be decomposed! See comment at top.
def make_unstressed_once(ru, tr=None):
  return map_last_hyphenated_component(make_unstressed_once_after_hyphen_split, ru, tr)
def make_unstressed_once_at_beginning_ru(word):
  # Destress the first stressed syllable (acute or diaeresis, including
  # composed ё) of Russian-only WORD; leave graves alone.
  return re.sub("^([^́̈ёЁ]*)([́̈ёЁ])", lambda m: m.group(1) + destresser[m.group(2)], word, 1)
# Make first stressed syllable (acute or diaeresis) unstressed; leave
# graves alone; if NOCONCAT, return the individual syllable lists.
# NOTE: Translit must already be decomposed! See comment at top.
def make_unstressed_once_at_beginning(ru, tr=None, noconcat=False):
  if not tr:
    return make_unstressed_once_at_beginning_ru(ru), None
  # With translit present, work syllable by syllable, as in
  # make_unstressed().
  rusyl, trsyl = split_syllables(ru, tr)
  # Walk forwards to find the first stressed syllable.
  for i in range(len(rusyl)):
    if is_stressed(rusyl[i]):
      if re.search("[ёЁ]", rusyl[i]):
        trsyl[i] = trsyl[i].replace("o", "e").replace("O", "E")
      rusyl[i] = make_unstressed_once_at_beginning_ru(rusyl[i])
      # Affects accents only, so safe on translit.
      trsyl[i] = make_unstressed_once_at_beginning_ru(trsyl[i])
      break
  if noconcat:
    return rusyl, trsyl
  # j correction: otherwise je would remain after a consonant, etc.
  return "".join(rusyl), j_correction("".join(trsyl))
def correct_grave_acute_clash(word, tr=None):
  """Subfunction of make_ending_stressed()/make_beginning_stressed():
  after adding an acute to a syllable that may already carry a grave,
  drop the grave (either order, combining or composed).
  NOTE: Translit must already be decomposed."""
  word = re.sub("([̀ѐЀѝЍ])́", lambda m: grave_deaccenter[m.group(1)] + AC, word)
  word = word.replace(AC + GR, AC)
  if not tr:
    return word, None
  assert_decomposed(tr)
  for clash in (GR + AC, AC + GR):
    tr = tr.replace(clash, AC)
  return word, tr
def make_ending_stressed_ru(word):
  """Move the last stress in Russian-only WORD onto its final syllable."""
  # If already ending stressed, just return word so we don't mess up ё.
  if is_ending_stressed(word):
    return word
  # Drop the last stress, acute the final syllable, then clean up any
  # grave+acute clash that this produced.
  word = make_unstressed_once_ru(word)
  word = re.sub("([" + vowel_no_jo + "])([^" + vowel + "]*)$", r"\1́\2", word)
  return correct_grave_acute_clash(word)[0]
def make_ending_stressed_after_hyphen_split(ru, tr):
  """Remove the last primary stress from RU/TR and put it on the final
  syllable.  Graves are left alone except in the last syllable; if the
  final syllable already has primary stress, do nothing.
  NOTE: Translit must already be decomposed."""
  if not tr:
    return make_ending_stressed_ru(ru), None
  # If already ending stressed, just return ru/tr so we don't mess up ё.
  if is_ending_stressed(ru):
    return ru, tr
  # Destress the last stressed syllable; "noconcat" gives us back the
  # individual syllables.
  rusyl, trsyl = make_unstressed_once_after_hyphen_split(ru, tr, "noconcat")
  # Acute the last syllable of both Russian and translit, then drop any
  # grave now clashing with the acute.
  rusyl[-1] = re.sub("([" + vowel_no_jo + "])", r"\1" + AC, rusyl[-1])
  trsyl[-1] = re.sub("([" + tr_vowel + "])", r"\1" + AC, trsyl[-1])
  rusyl[-1], trsyl[-1] = correct_grave_acute_clash(rusyl[-1], trsyl[-1])
  # j correction wasn't applied in make_unstressed_once because noconcat
  # short-circuited it into returning syllable lists.
  return "".join(rusyl), j_correction("".join(trsyl))
# Remove the last primary stress from the portion of the word to the right of
# any hyphen (unless the hyphen is word-final) and put it on the final
# syllable. Leave grave accents alone except in the last syllable. If final
# syllable already has primary stress, do nothing. (See make_unstressed_once()
# for why we don't affect stresses to the left of a hyphen.)
# Returns a (ru, tr) pair; tr is None when no translit was passed.
# NOTE: Translit must already be decomposed! See comment at top.
def make_ending_stressed(ru, tr=None):
  return map_last_hyphenated_component(make_ending_stressed_after_hyphen_split, ru, tr)
def make_beginning_stressed_ru(word):
  """Move the first stress in Russian-only WORD onto its initial syllable."""
  # If already beginning stressed, just return word so we don't mess up ё.
  if is_beginning_stressed(word):
    return word
  # Drop the first stress, acute the initial syllable, then clean up any
  # grave+acute clash that this produced.
  word = make_unstressed_once_at_beginning_ru(word)
  word = re.sub("^([^" + vowel + "]*)([" + vowel_no_jo + "])", r"\1\2́", word)
  return correct_grave_acute_clash(word)[0]
# Remove the first primary stress from the word and put it on the initial
# syllable. Leave grave accents alone except in the first syllable.
# If initial syllable already has primary stress, do nothing.
# NOTE: Translit must already be decomposed! See comment at top.
def make_beginning_stressed(ru, tr=None):
  if not tr:
    return make_beginning_stressed_ru(ru), None
  # If already beginning stressed, just return ru/tr so we don't mess up ё.
  if is_beginning_stressed(ru):
    return ru, tr
  # Destress the first stressed syllable; "noconcat" gives us back the
  # individual syllables.
  rusyl, trsyl = make_unstressed_once_at_beginning(ru, tr, "noconcat")
  # Acute the first syllable of both Russian and translit, then drop any
  # grave now clashing with the acute.
  rusyl[0] = re.sub("([" + vowel_no_jo + "])", r"\1" + AC, rusyl[0])
  trsyl[0] = re.sub("([" + tr_vowel + "])", r"\1" + AC, trsyl[0])
  rusyl[0], trsyl[0] = correct_grave_acute_clash(rusyl[0], trsyl[0])
  # j correction wasn't applied in make_unstressed_once_at_beginning
  # because noconcat short-circuited it into returning syllable lists.
  return "".join(rusyl), j_correction("".join(trsyl))
def try_to_stress(word):
  """Return WORD with an acute added if it is an unaccented monosyllable;
  otherwise return it unchanged.

  Bug fix: make_ending_stressed() returns a (ru, tr) tuple, but the
  fall-through branch here returns a bare string; take element [0] so
  callers always receive a string."""
  if is_unaccented(word) and is_monosyllabic(word):
    return make_ending_stressed(word)[0]
  else:
    return word
def tr_try_to_stress(word):
  """Translit counterpart of try_to_stress().

  Bug fix: the regex was built with a stray ``+ +`` (unary plus applied
  to a string), which raises TypeError whenever this branch runs.  The
  negated class was evidently meant to be the translit vowels, mirroring
  make_ending_stressed_ru()."""
  if is_tr_unaccented(word) and is_tr_monosyllabic(word):
    # FIXME, ideally make_ending_stressed() would take both ru and tr
    # together, as in the Lua version.
    return unicodedata.normalize("NFC",
      re.sub("([" + tr_vowel + "])([^" + tr_vowel + "]*)$", r"\1́\2", word))
  else:
    return word
def reduce_stem(stem):
  # Remove the final epenthetic vowel (о/е/ё, optionally acuted) before a
  # stem-final consonant run, inserting й/ь where the phonology requires;
  # return None if the stem doesn't match the reducible pattern.
  # NOTE(review): the `x in "йЙ"`-style checks below are substring tests;
  # for a multi-character POST or PRE they are effectively always false —
  # presumably a single final consonant/letter was intended.  Verify
  # against the Lua original (Module:ru-common).
  m = re.search("^(.*)([оОеЕёЁ])́?([" + cons + "]+)$", stem)
  if not m:
    return None
  pre, letter, post = m.groups()
  if letter in "оО":
    if post in "йЙ":
      return None # FIXME, is this correct?
    letter = ""
  else:
    is_upper = post in uppercase
    if re.search("[" + vowel + "]́?$", pre):
      # vowel precedes: replace the dropped vowel with й
      letter = is_upper and "Й" or "й"
    elif post in "йЙ":
      # й after the dropped vowel becomes ь
      letter = is_upper and "Ь" or "ь"
      post = ""
    elif ((post in velar and pre in cons_except_sib_c) or
        (post not in "йЙ" + velar and re.search("[лЛ]$", pre))):
      letter = is_upper and "Ь" or "ь"
    else:
      letter = ""
  stem = pre + letter + post
  return stem
def dereduce_stem(stem, epenthetic_stress):
  """Insert an epenthetic vowel (о/е/ё, stressed if EPENTHETIC_STRESS)
  between the last two consonants of STEM; return None if the stem does
  not end in two consonants.

  Bug fix: make_unstressed_once() and make_ending_stressed() return
  (ru, tr) tuples; the original assigned the tuple itself to ``stem``,
  so the following re.search()/concatenation raised TypeError.  Take
  element [0] in both places."""
  if epenthetic_stress:
    stem = make_unstressed_once(stem)[0]
  m = re.search("^(.*)([" + cons + "])([" + cons + "])$", stem)
  if not m:
    return None
  pre, letter, post = m.groups()
  is_upper = post in uppercase
  # Choose the epenthetic vowel from the surrounding consonants.
  if letter in "ьйЬЙ":
    letter = ""
    if post in "цЦ" or not epenthetic_stress:
      epvowel = is_upper and "Е" or "е"
    else:
      epvowel = is_upper and "Ё" or "ё"
  elif letter in cons_except_sib_c and post in velar or letter in velar:
    epvowel = is_upper and "О" or "о"
  elif post in "цЦ":
    epvowel = is_upper and "Е" or "е"
  elif epenthetic_stress:
    if letter in sib:
      epvowel = is_upper and "О́" or "о́"
    else:
      epvowel = is_upper and "Ё" or "ё"
  else:
    epvowel = is_upper and "Е" or "е"
  stem = pre + letter + epvowel + post
  if epenthetic_stress:
    stem = make_ending_stressed(stem)[0]
  return stem
def add_soft_sign(stem):
  """Append й after a final vowel, otherwise ь."""
  ends_in_vowel = re.search("[" + vowel + "]$", stem)
  return stem + ("й" if ends_in_vowel else "ь")
def add_hard_neuter(stem):
  """Append the hard neuter ending: е after a sibilant or ц, else о."""
  after_sib_c = re.search("[" + sib_c + "]$", stem)
  return stem + ("е" if after_sib_c else "о")
def split_russian_tr(arg):
  """Split a "RUSSIAN//TRANSLIT" spec into its two parts (a list, as
  produced by re.split); a bare RUSSIAN yields the tuple (arg, None)."""
  if "//" not in arg:
    return arg, None
  return re.split("//", arg)
def paste_russian_tr(ru, tr):
  """Inverse of split_russian_tr(): rejoin RU and TR with //, or return
  RU alone when TR is missing (falsy)."""
  return "%s//%s" % (ru, tr) if tr else ru
# Given an ru-noun+ or ru-proper noun+ template, fetch the arguments
# associated with it. May return None if an error occurred in template
# expansion.
def fetch_noun_args(t, expand_text, forms_only=False):
  generate_template = ("ru-generate-noun-forms" if forms_only else
    "ru-generate-noun-args")
  # Rewrite the headword template into the corresponding generate call.
  if str(t.name) == "ru-noun+":
    call = re.sub(r"^\{\{ru-noun\+",
      "{{%s" % generate_template, str(t))
  else:
    # Proper nouns default to singular.
    call = re.sub(r"^\{\{ru-proper noun\+",
      "{{%s|ndef=sg" % generate_template, str(t))
  generate_result = expand_text(call)
  if not generate_result:
    return None
  return blib.split_generate_args(generate_result)
def fetch_noun_lemma(t, expand_text):
  """Given an ru-noun+ or ru-proper noun+ template T, fetch its lemma:
  one or more comma-separated terms, each either Cyrillic word(s) or a
  CYRILLIC/LATIN combination with manual transliteration.  May return
  None if an error occurred in template expansion."""
  # FIXME, probably not necessary to specify forms_only=True
  args = fetch_noun_args(t, expand_text, forms_only=True)
  if args is None:
    return None
  if "nom_sg" in args:
    return args["nom_sg"]
  return args["nom_pl"]
def group_translits(formvals, pagemsg, verbose=False):
  """FORMVALS is a list of (RUSSIAN, TRANSLIT) tuples, where TRANSLIT may
  be None or "" (both treated as missing).  Group by RUSSIAN so multiple
  translits of one form are handled together: generate any missing
  translit and join them with commas.  Returns the form values in their
  original order, with multiple translits combined."""
  # Group the translit variants under each Russian form, keeping order.
  by_russian = OrderedDict()
  for ru, tr in formvals:
    by_russian.setdefault(ru, []).append(tr)
  result = []
  for russian, translits in by_russian.items():
    if len(translits) == 1:
      result.append((russian, translits[0]))
      continue
    # Multiple translits: fill in any missing one, then join by commas.
    manual_translits = []
    for translit in translits:
      if not translit:
        translit = xlit_text(russian, pagemsg, verbose)
        if not translit:
          pagemsg("WARNING: Error generating translit for %s" % russian)
          continue
      manual_translits.append(translit)
    joined_manual_translits = ", ".join(manual_translits)
    pagemsg("NOTE: For Russian %s, found multiple manual translits %s" %
      (russian, joined_manual_translits))
    result.append((russian, joined_manual_translits))
  return result
def check_for_alt_yo_terms(text, pagemsg):
  """True (with a log message) if TEXT contains any *-alt-ё headword
  template, i.e. the page is an alternative-ё entry and should be
  skipped."""
  alt_yo_templates = {"ru-adj-alt-ё", "ru-noun-alt-ё", "ru-proper noun-alt-ё",
    "ru-verb-alt-ё", "ru-pos-alt-ё"}
  for t in blib.parse_text(text).filter_templates():
    if str(t.name) in alt_yo_templates:
      pagemsg("Skipping alt-ё term")
      return True
  return False
def find_defns(text):
  # Delegate to blib's definition finder with the Russian langcode.
  return blib.find_defns(text, 'ru')
################################ Test code ##########################
# Global pass/fail counters, bumped by test() and reset by run_tests().
num_failed = 0
num_succeeded = 0
def test(actual, expected_ru, expected_tr):
  """Compare ACTUAL — either a (ru, tr) tuple or a bare ru string (tr
  taken as None) — against EXPECTED_RU/EXPECTED_TR; print the outcome
  and bump the global pass/fail counters."""
  global num_succeeded, num_failed
  if type(actual) is tuple:
    actual_ru, actual_tr = actual
  else:
    actual_ru, actual_tr = actual, None
  if (actual_ru, actual_tr) == (expected_ru, expected_tr):
    print("(%s, %s) == (%s, %s): TEST SUCCEEDED." %
      (actual_ru, actual_tr, expected_ru, expected_tr))
    num_succeeded += 1
  else:
    print("(%s, %s) != (%s, %s): TEST FAILED." %
      (actual_ru, actual_tr, expected_ru, expected_tr))
    num_failed += 1
def run_tests():
  """Self-test: exercise the make_unstressed* / make_*_stressed family
  against known Russian/translit pairs and print a pass/fail summary."""
  global num_succeeded, num_failed
  num_succeeded = 0
  num_failed = 0
  test(make_unstressed("де́лать"), "делать", None)
  test(make_unstressed("де́лать", decompose("délat")), "делать", "delat")
  test(make_unstressed("де́ла́ть"), "делать", None)
  test(make_unstressed("де́ла́ть", decompose("délát")), "делать", "delat")
  test(make_unstressed("дёлать"), "делать", None)
  test(make_unstressed("дёлать", decompose("djólat")), "делать", "delat")
  test(make_unstressed("дйо́лать"), "дйолать", None)
  test(make_unstressed("дйо́лать", decompose("djólat")), "дйолать", "djolat")
  test(make_unstressed_once("де́лать"), "делать", None)
  test(make_unstressed_once("де́лать", decompose("délat")), "делать", "delat")
  test(make_unstressed_once("дела́ть"), "делать", None)
  test(make_unstressed_once("дела́ть", decompose("delát")), "делать", "delat")
  test(make_unstressed_once("де́ла́ть"), "де́лать", None)
  test(make_unstressed_once("де́ла́ть", decompose("délát")), "де́лать", decompose("délat"))
  test(make_unstressed_once("дёлать"), "делать", None)
  test(make_unstressed_once("дёлать", decompose("djólat")), "делать", "delat")
  test(make_unstressed_once("дйо́лать"), "дйолать", None)
  test(make_unstressed_once("дйо́лать", decompose("djólat")), "дйолать", "djolat")
  test(make_unstressed_once("ко́е-как"), "ко́е-как", None)
  test(make_unstressed_once("ко́е-как", decompose("kóe-kak")), "ко́е-как", decompose("kóe-kak"))
  test(make_unstressed_once_at_beginning("де́лать"), "делать", None)
  test(make_unstressed_once_at_beginning("де́лать", decompose("délat")), "делать", "delat")
  test(make_unstressed_once_at_beginning("дела́ть"), "делать", None)
  test(make_unstressed_once_at_beginning("дела́ть", decompose("delát")), "делать", "delat")
  test(make_unstressed_once_at_beginning("де́ла́ть"), "дела́ть", None)
  test(make_unstressed_once_at_beginning("де́ла́ть", decompose("délát")), "дела́ть", decompose("delát"))
  test(make_unstressed_once_at_beginning("дёлать"), "делать", None)
  test(make_unstressed_once_at_beginning("дёлать", decompose("djólat")), "делать", "delat")
  test(make_unstressed_once_at_beginning("дйо́лать"), "дйолать", None)
  test(make_unstressed_once_at_beginning("дйо́лать", decompose("djólat")), "дйолать", "djolat")
  test(make_ending_stressed("де́лать"), "дела́ть", None)
  test(make_ending_stressed("де́лать", decompose("délat")), "дела́ть", decompose("delát"))
  test(make_ending_stressed("де́ла́ть"), "де́ла́ть", None)
  test(make_ending_stressed("де́ла́ть", decompose("délát")), "де́ла́ть", decompose("délát"))
  test(make_ending_stressed("да̀ла́лать"), "да̀лала́ть", None)
  test(make_ending_stressed("да̀ла́лать", decompose("dàlálat")), "да̀лала́ть", decompose("dàlalát"))
  test(make_ending_stressed("ко́е-как"), "ко́е-ка́к", None)
  test(make_ending_stressed("ко́е-как", decompose("kóe-kak")), "ко́е-ка́к", decompose("kóe-kák"))
  test(make_beginning_stressed("дела́ть"), "де́лать", None)
  test(make_beginning_stressed("дела́ть", decompose("delát")), "де́лать", decompose("délat"))
  test(make_beginning_stressed("де́ла́ть"), "де́ла́ть", None)
  test(make_beginning_stressed("де́ла́ть", decompose("délát")), "де́ла́ть", decompose("délát"))
  test(make_beginning_stressed("да̀ла́ть"), "да́лать", None)
  test(make_beginning_stressed("да̀ла́ть", decompose("dàlát")), "да́лать", decompose("dálat"))
  # Final results
  print("RESULTS: %s SUCCEEDED, %s FAILED." % (num_succeeded, num_failed))
if __name__ == "__main__":
  run_tests()