/
add_noun_decl.py
964 lines (887 loc) · 40.5 KB
/
add_noun_decl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Fix ru-noun headers to be ru-noun+ and ru-proper noun to ru-proper noun+
# for multiword nouns by looking up the individual declensions of the words.
# Example page:
#
# ==Russian==
#
# ===Pronunciation===
# * {{ru-IPA|са́харная ва́та}}
#
# ===Noun===
# {{ru-noun|[[сахарный|са́харная]] [[вата|ва́та]]|f-in}}
#
# # [[cotton candy]], [[candy floss]], [[fairy floss]]
#
# ====Declension====
# {{ru-decl-noun-see|сахарный|вата}}
#
# [[Category:ru:Foods]]
# FIXME:
#
# 1. (DONE, NEEDS TESTING) Warnings like this should be fixable:
# Page 99 Дедушка Мороз: WARNING: Can't sub word link [[мороз|Моро́з]] into decl lemma моро́з
# 2. (DONE) This warning should be fixable:
# Page 756 десертное вино: WARNING: case nom_sg, existing forms [[десе́ртный|десе́ртное]] [[вино́]] not same as proposed [[десертный|десе́ртное]] [[вино́]]
# 3. (DONE, DEFINITELY NEEDS TESTING) Plural nouns
# 4. (DONE, NEEDS TESTING) Multiple inflected nouns, esp. in hyphenated compounds
# 5. (DONE) Don't choke when found notes= as long as there's only one
# (choke if multiple because the footnote symbols might be duplicated),
# instead issue warning
# 6. (DONE) Check that all parts of ru-decl-noun-see are used, error if not
# 7. (DONE) Handle all_parts_declined
# 8. Check on гей-брак, do both parts decline?
# 9. If there's a loc with на or в or something similar, warn about it because
# it may not convert well as a single-word override, cf. ось зла
# 10. (DONE) Implement use_given_page_decl
# 11. (DONE) Adding declension to proper nouns, should use n=sg if proper noun
# is singular-only
import pywikibot, re, sys, argparse
import blib
from blib import getparam, rmparam, msg, site
import rulib
import runounlib
# Table used to recognize a plural form of a lemma. Each row is
# [singular ending, plural ending, gender, requires special case (1)].
# An empty singular ending is matched against consonant_re by the code
# that walks this table; rows are tried in order and the first match wins.
pl_data = [
    # singular ends in a consonant, й or ь
    ["", "ы", "m", False],
    ["", "и", "m", False],
    ["ь", "и", "mf", False],
    ["й", "и", "m", False],
    ["", "а", "m", True],
    # singular in -а/-я
    ["а", "ы", "f", False],
    ["а", "и", "f", False],
    ["я", "и", "f", False],
    # singular in -о/-е
    ["о", "а", "n", False],
    ["е", "а", "n", False],
    ["е", "я", "n", False],
    ["о", "и", "n", True],
]
# Pairs of [inflected-form suffix, lemma suffix]. When a lemma has to be
# guessed from an inflected word, the first pair whose inflected suffix
# matches the end of the word is used to rewrite that suffix into the
# lemma suffix, so the order of the rows matters.
infer_adj_lemma = [
    ["ая", "ый"],
    ["а́я", "о́й"],
    ["яя", "ий"],
    ["ое", "ый"],
    ["о́е", "о́й"],
    ["ее", "ий"],
]
# Regex character class matching one consonant letter. It stands in for
# the empty singular ending of pl_data when a lemma ending is checked.
consonant_re = "[бдфгклмнпрствхзшщчжц]"

# Words that are treated as uninflected particles when they appear in a
# headword with identical lemma and inflection.
particles = [
    # List of prepositions and particles, from ru-pron.lua
    "по", "в", "на", "до",
    "без", "близ", "в", "во", "до",
    "из-под", "из-за", "за", "из", "изо",
    "к", "ко", "меж",
    "на", "над", "надо",
    "о", "об", "обо", "от",
    "по", "под", "подо",
    "пред", "предо", "при", "про",
    "перед", "передо",
    "через", "с", "со", "у", "не",
    # Others
    "и", "де",
]
# Lemma -> declension template text to use when the lemma's page carries
# more than one declension. Be careful when adding entries: make sure the
# other declensions of the word are not actually needed by some
# multiword lemma.
use_given_decl = {
    "туз": "{{ru-noun-table|b}}",
    "род": "{{ru-noun-table|e}}",
    "лев": "{{ru-noun-table|b||*|a=an}}",
    "ключ": "{{ru-noun-table|b}}",
    "плата": "{{ru-noun-table|пла́та}}",
    "брак": "{{ru-noun-table}}",
}
# Page title -> {lemma -> declension template text}. Consulted when a
# lemma's page has several declensions and the lemma is not listed in
# use_given_decl: the entry for the page being processed names the
# declension to use for that lemma on that page only.
use_given_page_decl = {
    "двоюродный дед": {"дед": "{{ru-noun-table|a=an}}"},
    "двоюродный дядя": {"дядя": "{{ru-noun-table|дя́дя|(2)|or|c|дя́дя|-ья|a=an}}"},
    "шах и мат": {"мат": "{{ru-noun-table}}"},
    "ионический ордер": {"ордер": "{{ru-noun-table|о́рдер|or|c||(1)}}"},
    "ионический орден": {"орден": "{{ru-noun-table|c|о́рден|(1)}}"},
    "коринфский ордер": {"ордер": "{{ru-noun-table|о́рдер|or|c||(1)}}"},
    "коринфский орден": {"орден": "{{ru-noun-table|c|о́рден|(1)}}"},
    "корпус турбины": {"корпус": "{{ru-noun-table|ко́рпус}}"},
    "бронирование кабины": {"бронирование": "{{ru-noun-table|бронирова́ние}}"},
    "троюродный дядя": {"дядя": "{{ru-noun-table|дя́дя|(2)|or|c|дя́дя|-ья|a=an}}"},
    "половой орган": {"орган": "{{ru-noun-table|о́рган}}"},
    "вес нетто": {"вес": "{{ru-noun-table|c||(1)}}"},
    "древесный уголь": {"уголь": "{{ru-noun-table|a,b|у́голь|m*}}"},
    "ось зла": {"ось": "{{ru-noun-table|f''||f|loc=на +}}"},
    "свет очей": {"свет": "{{ru-noun-table|par=све́ту|loc=свету́|n=sg}}"},
    "дорожный чек": {"чек": "{{ru-noun-table}}"},
    "зелёный лук": {"лук": "{{ru-noun-table}}"},
    "воздушное судно": {"судно": "{{ru-noun-table|c|су́дно|(2)|суд}}"},
    "Пепельная среда": {"среда": "{{ru-noun-table|f|среда́}}"},
    "зелёный свет": {"свет": "{{ru-noun-table|par=све́ту|loc=свету́|n=sg}}"},
    "окружающая среда": {"среда": "{{ru-noun-table|d|среда́}}"},
    "парусное судно": {"судно": "{{ru-noun-table|c|су́дно|(2)|суд}}"},
    "барабанный бой": {"бой": "{{ru-noun-table|c|loc=бою́}}"},
    "ордер на арест": {"ордер": "{{ru-noun-table|c|о́рдер|(1)}}"},
    "чёрная американка": {"американка": "{{ru-noun-table|америка́нка|*|a=an}}"},
    "красный свет": {"свет": "{{ru-noun-table|par=све́ту|loc=свету́|n=sg}}"},
    "жёлтый свет": {"свет": "{{ru-noun-table|par=све́ту|loc=свету́|n=sg}}"},
    "амарантовый цвет": {"цвет": "{{ru-noun-table|c||(1)|par=+}}"},
    "противоположный пол": {"пол": "{{ru-noun-table|e}}"},
    "звуковая волна": {"волна": "{{ru-noun-table|f,d|волна́}}"},
    "ночной клуб": {"клуб": "{{ru-noun-table}}"},
    "правоохранительные органы": {"орган": "{{ru-noun-table|о́рган}}"},
    "негласное правило": {"правило": "{{ru-noun-table|пра́вило}}"},
    "степная рысь": {"рысь": "{{ru-noun-table||f|a=an}}"},
    "ход конём": {"ход": "{{ru-noun-table|c|n=sg|par=+|loc=в +,на +}}"},
    "Ростов-на-Дону": {"Ростов": "{{ru-noun-table|Росто́в|n=sg}}"},
}
# Page titles that are exempt from the "No inflected nouns" check, i.e.
# pages that are still converted even though none of their words was
# looked up as a non-adjectival (noun) declension.
allow_no_inflected_noun = [
    "крайний нападающий",
    "придыхательный согласный",
    "разрисованный Пикассо",
    "Пикассо прямоугольчатый",
    "сербско-хорватский",
]
# Lemmas that are treated as (short) adjectives: they count as inflected
# adjectival words in a headword, and are assumed adjectival when their
# lemma page does not exist.
is_short_adj = [
    "ахиллесов",
    "крокодилов",
]
# Lowercase lemmas that are not particles but must nevertheless be
# treated as uninflected when they occur unchanged in a headword.
is_uninflected = [
    "фибоначчи",
]
# Page titles on which every word is taken to decline: unlinked headword
# words are treated as inflected (with the lemma inferred from the form),
# and a second inflected noun is allowed instead of causing a skip.
all_parts_declined = [
    "э оборотное",
    "апельсиновый сок",
    "бульбоуретральная железа",
    "земляной волк",
    "отложительный падеж",
    "снежный человек",
    "крайний нападающий",
    "шапка-невидимка",
]
# Page titles for which a loc= override found in a word's declension is
# carried over to the multiword declension instead of being discarded.
keep_locative = [
    "социальная сеть",
    "Западный берег реки Иордан",
    "Западный берег",
]
def process_text_on_page(index, pagetitle, text):
global args
def pagemsg(txt):
    """Log TXT via msg(), prefixed with the index and title of the page."""
    labelled = "Page %s %s: %s" % (index, pagetitle, txt)
    msg(labelled)
def expand_text(tempcall):
    """Expand the template call TEMPCALL for the page being processed.

    Delegates to blib.expand_text(), handing it the page title, the
    pagemsg logger and the args.verbose flag, and returns its result.
    """
    expansion = blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
    return expansion
subpagetitle = re.sub("^.*:", "", pagetitle)
notes = []
parsed = blib.parse_text(text)
def find_decl_args(lemma, infl, wordind):
    """Find the declension arguments for one word of a multiword expression.

    LEMMA is the name of the page the declension is looked up on, INFL is
    the form of the word as it appears in the headword, and WORDIND is the
    number of the word within the expression (used in log messages and
    appended to the names of per-word named params that are kept).

    Returns None when no usable declension can be determined; a WARNING is
    logged through pagemsg() in each such case. Otherwise the return value
    is a tuple of four items: a list of (NAME, VALUE) tuples for the
    arguments, whether the word is an adjective, the value of n= (if given,
    else None), and the value of a= (if given, else None).
    """
    declpage = pywikibot.Page(site, lemma)
    # Only use a piped link when the accentless inflection differs from the
    # lemma page name.
    if rulib.remove_accents(infl) == lemma:
        wordlink = "[[%s]]" % infl
    else:
        wordlink = "[[%s|%s]]" % (lemma, infl)

    if not declpage.exists():
        if lemma in is_short_adj or re.search("(ий|ый|ой)$", lemma):
            pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" %
                    (wordind, lemma, infl))
            return [("1", wordlink), ("2", "+")], True, None, None
        pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" %
                (wordind, lemma, infl))
        return None

    # Collect the relevant templates found on the lemma's page.
    decl_parsed = blib.parse_text(declpage.text)
    decl_templates = []
    headword_templates = []
    decl_z_templates = []
    for t in decl_parsed.filter_templates():
        tname = str(t.name)
        if tname in ["ru-noun-table", "ru-decl-adj"]:
            pagemsg("find_decl_args: Found decl template: %s" % str(t))
            decl_templates.append(t)
        if tname in ["ru-noun", "ru-proper noun"]:
            pagemsg("find_decl_args: Found headword template: %s" % str(t))
            headword_templates.append(t)
        if tname in ["ru-decl-noun-z"]:
            pagemsg("find_decl_args: Found z-decl template: %s" % str(t))
            decl_z_templates.append(t)

    if not decl_templates:
        if decl_z_templates:
            # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
            # {{ru-decl-noun-z|ёж|m-inan|b}}
            if len(decl_z_templates) > 1:
                pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
                        (wordind, lemma, infl))
                return None
            decl_z_template = decl_z_templates[0]
            headword_template = None
            pagemsg("find_decl_args: Using z-decl template: %s" %
                    str(decl_z_template))
            # The conversion gets a headword template only when exactly one
            # is present on the lemma's page.
            if len(headword_templates) == 0:
                pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" %
                        (wordind, lemma, infl, str(decl_z_template)))
            elif len(headword_templates) > 1:
                pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" %
                        (wordind, lemma, infl, str(decl_z_template)))
            else:
                headword_template = headword_templates[0]
                pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" %
                        (wordind, lemma, infl, str(headword_template),
                         str(decl_z_template)))
            decl_template = runounlib.convert_zdecl_to_ru_noun_table(
                decl_z_template, subpagetitle, pagemsg,
                headword_template=headword_template)
            decl_templates = [decl_template]
        elif ("[[Category:Russian indeclinable nouns]]" in declpage.text or
              any(getparam(x, "3") == "-" for x in headword_templates)):
            # Indeclinable word: "$" marks it as invariable.
            return [("1", wordlink), ("2", "$")], False, None, None
        else:
            pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
                    (wordind, lemma, infl))
            return None

    if len(decl_templates) == 1:
        decl_template = decl_templates[0]
    else:
        # Multiple decl templates
        for t in decl_templates:
            if str(t.name) == "ru-decl-adj" and re.search("(ий|ый|ой)$", lemma):
                pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" %
                        (wordind, lemma, infl))
                decl_template = t
                break
        else:
            # No adjectival declension applies; fall back on the override
            # tables, first per lemma, then per page.
            if lemma in use_given_decl:
                overriding_decl = use_given_decl[lemma]
                pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
                        (wordind, overriding_decl, lemma, infl))
                decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
            elif pagetitle in use_given_page_decl:
                overriding_decl = use_given_page_decl[pagetitle].get(lemma)
                if not overriding_decl:
                    pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" %
                            (wordind, lemma, infl))
                    return None
                pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
                        (wordind, overriding_decl, lemma, infl))
                decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
            else:
                pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" %
                        (wordind, lemma, infl))
                return None

    pagemsg("find_decl_args: Using decl template: %s" % str(decl_template))
    if str(decl_template.name) == "ru-decl-adj":
        if re.search(r"\bь\b", getparam(decl_template, "2"), re.U):
            return [("1", wordlink), ("2", "+ь")], True, None, None
        return [("1", wordlink), ("2", "+")], True, None, None

    # ru-noun-table
    assert str(decl_template.name) == "ru-noun-table"

    # Split out the arg sets in the declension and check the
    # lemma of each one, taking care to handle cases where there is no lemma
    # (it would default to the page name).
    highest_numbered_param = 0
    for p in decl_template.params:
        pname = str(p.name)
        if re.search("^[0-9]+$", pname):
            highest_numbered_param = max(highest_numbered_param, int(pname))

    # Now gather the numbered arguments into arg sets. Code taken from
    # ru-noun.lua. The loop runs one step past the last numbered param so
    # that the final arg set is flushed as well.
    arg_sets = []
    arg_set = []
    for i in range(1, highest_numbered_param + 2):
        end_arg_set = False
        val = getparam(decl_template, str(i))
        if i == highest_numbered_param + 1:
            end_arg_set = True
        elif val == "_" or val == "-" or re.search("^join:", val):
            pagemsg("WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
                    (wordind, lemma, infl))
            return None
        elif val == "or":
            end_arg_set = True

        if end_arg_set:
            arg_sets.append(arg_set)
            arg_set = []
        else:
            arg_set.append(val)

    # If the inflection differs from the lemma, it has to be a plural that
    # pl_data recognizes; remember the gender and special case (1) it implies.
    canon_infl = rulib.remove_accents(infl).lower()
    canon_lemma = lemma.lower()
    ispl = False
    need_sc1 = False
    found_gender = None
    if canon_infl != canon_lemma:
        for sgend, plend, pl_gender, is_sc1 in pl_data:
            check_sgend = sgend if sgend else consonant_re
            if (re.search(check_sgend + "$", canon_lemma) and
                    canon_infl == re.sub(sgend + "$", plend, canon_lemma)):
                ispl = True
                found_gender = pl_gender
                need_sc1 = is_sc1
                break
        else:
            pagemsg("WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s" %
                    (wordind, lemma, infl))
            return None

    # Substitute the wordlink for any lemmas in the declension.
    # If plural, also add gender and verify special case (1) as necessary.
    # Concatenate all the numbered params, substituting the wordlink into
    # the lemma as necessary.
    numbered_params = []
    for arg_set in arg_sets:
        # The lemma is the second arg when the first one is a stress pattern.
        lemma_arg = 0
        if arg_set and runounlib.arg1_is_stress(arg_set[0]):
            lemma_arg = 1
        if len(arg_set) <= lemma_arg:
            arg_set.append("")
        arglemma = arg_set[lemma_arg]
        manualtr = ""
        if "//" in arglemma:
            # Keep any manual transliteration (the part from "//" on) so it
            # can be re-attached after the link.
            arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups()
        if (not arglemma or arglemma.lower() == infl.lower() or
                rulib.is_monosyllabic(infl) and rulib.remove_accents(arglemma).lower() ==
                rulib.remove_accents(infl).lower() or
                ispl and rulib.remove_accents(arglemma).lower() == lemma.lower()):
            arg_set[lemma_arg] = wordlink + manualtr
        else:
            pagemsg("WARNING: Can't sub word link %s into decl lemma %s%s" % (
                wordlink, arg_set[lemma_arg], ", skipping" if ispl else ""))
            if ispl:
                return None

        if ispl:
            # Add the gender
            if len(arg_set) <= lemma_arg + 1:
                arg_set.append("")
            declarg = arg_set[lemma_arg + 1]

            # First, sub in gender
            m = re.search("(3f|[mfn])", declarg)
            if found_gender == "mf":
                if not m:
                    pagemsg("WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s" %
                            (wordind, lemma, infl))
                    return None
                decl_gender = m.group(1)
                if decl_gender == "n":
                    pagemsg("WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s" %
                            (wordind, lemma, infl))
                    return None
                elif decl_gender in ["m", "3f"]:
                    pagemsg("Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                            (decl_gender, wordind, lemma, infl))
                else:
                    # The regex above only yields 3f, m, f or n, so f is
                    # the one value left at this point.
                    assert decl_gender == "f"
                    pagemsg("Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s" %
                            (wordind, lemma, infl))
                    declarg = re.sub("f", "3f", declarg, count=1)
            elif m:
                decl_gender = m.group(1)
                if decl_gender == found_gender:
                    pagemsg("Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                            (found_gender, wordind, lemma, infl))
                else:
                    pagemsg("WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s" %
                            (decl_gender, wordind, found_gender, lemma, infl))
                    declarg = re.sub("(3f|[mfn])", found_gender, declarg, count=1)
            else:
                pagemsg("No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s" %
                        (wordind, found_gender, lemma, infl))
                declarg = found_gender + declarg

            # Now check special case 1
            has_sc1 = "(1)" in declarg
            if need_sc1 and not has_sc1:
                pagemsg("WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s" % (
                    wordind, declarg, lemma, infl))
                return None
            if has_sc1 and not need_sc1:
                pagemsg("WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s" % (
                    wordind, declarg, lemma, infl))
                return None

            arg_set[lemma_arg + 1] = declarg

        # Arg sets are joined back together with "or".
        if numbered_params:
            numbered_params.append("or")
        numbered_params.extend(arg_set)

    # Now gather all params, including named ones.
    params = [(str(pos), val) for pos, val in enumerate(numbered_params, 1)]
    num = None
    anim = None
    for p in decl_template.params:
        pname = str(p.name)
        val = str(p.value)
        if pname == "a":
            anim = val
        elif pname == "n":
            num = val
        elif pname == "notes":
            params.append((pname, val))
        elif pname == "title":
            pagemsg("WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s" %
                    (wordind, lemma, infl, val))
        elif re.search("^[0-9]+$", pname):
            # Numbered params were already handled above.
            pass
        else:
            keepparam = True
            if pname == "loc":
                if pagetitle in keep_locative:
                    pagemsg("Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s" % (
                        wordind, val, lemma, infl))
                else:
                    pagemsg("WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s" % (
                        wordind, val, lemma, infl))
                    keepparam = False
            elif pname == "par":
                pagemsg("WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s" % (
                    wordind, val, lemma, infl))
                keepparam = False
            elif pname == "voc":
                pagemsg("WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s" % (
                    wordind, val, lemma, infl))
                keepparam = False
            if keepparam:
                if pname == "loc" and re.search(r"^(на|в)\b", val, re.U):
                    pagemsg("WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s" %
                            (wordind, val, lemma, infl))
                # Kept named params are made word-specific by appending the
                # word number to the param name.
                pname += str(wordind)
                params.append((pname, val))

    return params, False, num, anim
headword_template = None
see_template = None
for t in parsed.filter_templates():
tname = str(t.name)
if tname == "ru-decl-noun-see":
if see_template:
pagemsg("WARNING: Multiple ru-decl-noun-see templates, skipping")
return
see_template = t
if tname in ["ru-noun+", "ru-proper noun+"]:
pagemsg("Found %s, skipping" % tname)
return
if tname in ["ru-noun", "ru-proper noun"]:
if headword_template:
pagemsg("WARNING: Multiple ru-noun or ru-proper noun templates, skipping")
return
headword_template = t
if tname == "ru-pre-reform":
pagemsg("WARNING: Found ru-pre-reform template, skipping")
return
if not headword_template:
pagemsg("WARNING: Can't find headword template, skipping")
return
pagemsg("Found headword template: %s" % str(headword_template))
headword_is_proper = str(headword_template.name) == "ru-proper noun"
if getparam(headword_template, "3") == "-" or "[[Category:Russian indeclinable nouns]]" in page.text:
pagemsg("WARNING: Indeclinable noun, skipping")
return
headword_trs = blib.fetch_param_chain(headword_template, "tr", "tr")
if headword_trs:
pagemsg("WARNING: Found headword manual translit, skipping: %s" %
",".join(headword_trs))
return
headword = getparam(headword_template, "1")
for badparam in ["head2", "gen2", "pl2"]:
val = getparam(headword_template, badparam)
if val:
pagemsg("WARNING: Found extra param, can't handle, skipping: %s=%s" % (
badparam, val))
return
# Here we use a capturing split, and treat what we want to capture as
# the splitting text, backwards from what you'd expect. The separators
# will fall at 0, 2, ... and the headwords as 1, 3, ... There will be
# an odd number of items, and the first and last should be empty.
headwords_separators = re.split(r"(\[\[.*?\]\]|[^ \-]+)", headword)
if headwords_separators[0] != "" or headwords_separators[-1] != "":
pagemsg("WARNING: Found junk at beginning or end of headword, skipping")
return
headwords = []
# Separator at index 0 is the separator that goes after the first word
# and before the second word.
separators = []
wordind = 0
# FIXME, Here we try to handle hyphens, but we'll still have problems with
# words like изба́-чита́льня with conjoined nouns, both inflected, because
# we assume only one inflected noun (should be fixable without too much
# work). We'll also have problems with e.g. пистолет-пулемёт Томпсона,
# because the words are linked individually but the ru-decl-noun-see
# has пистолет-пулемёт given as a single entry. We have a check below
# to try to catch this case, because no inflected nouns will show up.
for i in range(1, len(headwords_separators), 2):
hword = headwords_separators[i]
separator = headwords_separators[i+1]
if i < len(headwords_separators) - 2 and separator != " " and separator != "-":
pagemsg("WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>" %
(wordind + 1, hword, separator))
return
# Canonicalize link in headword
m = re.search(r"^\[\[([^\[\]|]+)\|([^\[\]|]+)\]\]$", hword)
if m:
lemma, infl = m.groups()
lemma = rulib.remove_accents(re.sub("#Russian$", "", lemma))
if lemma == rulib.remove_accents(infl):
hword = "[[%s]]" % infl
else:
hword = "[[%s|%s]]" % (lemma, infl)
headwords.append(hword)
separators.append(separator)
wordind += 1
pagemsg("Found headwords: %s" % " @@ ".join(headwords))
# Get headword genders (includes animacy and number)
genders = blib.fetch_param_chain(headword_template, "2", "g")
genders_include_pl = len([x for x in genders if re.search(r"\bp\b", x)]) > 0
# Extract lemmas and inflections for each word in headword
lemmas_infls = []
saw_unlinked_word = False
for word in headwords:
m = re.search(r"^\[\[([^\[\]|]+)\|([^\[\]|]+)\]\]$", word)
if m:
lemma, infl = m.groups()
else:
m = re.search(r"^\[\[([^\[\]|]+)\]\]$", word)
if m:
infl = m.group(1)
lemma = rulib.remove_accents(infl)
elif pagetitle in all_parts_declined:
infl = word
lemma = rulib.remove_accents(infl)
for inflsuffix, lemmasuffix in infer_adj_lemma:
if re.search(inflsuffix + "$", infl):
lemma = rulib.remove_accents(re.sub(inflsuffix + "$", lemmasuffix, infl))
lemma = re.sub("([кгхшжчщ])ый$", r"\1ий", lemma)
pagemsg("WARNING: Inferring adjectival lemma from inflection, please check: lemma=%s, infl=%s" %
(lemma, infl))
break
else:
pagemsg("WARNING: Assuming word is inflected adj or noun, please check: lemma=%s, infl=%s" %
(lemma, infl))
else:
infl = word
lemma = rulib.remove_accents(infl)
saw_unlinked_word = True
lemmas_infls.append((lemma, infl))
if see_template:
pagemsg("Found decl-see template: %s" % str(see_template))
inflected_words = set(rulib.remove_accents(blib.remove_links(str(x.value)))
for x in see_template.params)
if saw_unlinked_word:
pagemsg("WARNING: Unlinked word(s) in headword, found decl-see template, proceeding, please check: %s" % headword)
else:
# Try to figure out which words are inflected and which words aren't
pagemsg("No ru-decl-noun-see template, inferring which headword words are inflected")
if saw_unlinked_word:
pagemsg("WARNING: Unlinked word(s) in headword, no decl-see template, skipping: %s" % headword)
return
inflected_words = set()
saw_noun = False
reached_uninflected = False
wordind = 0
for word, lemmainfl in zip(headwords, lemmas_infls):
wordind += 1
is_inflected = False
lemma, infl = lemmainfl
canon_infl = rulib.remove_accents(infl).lower()
canon_lemma = lemma.lower()
if lemma in is_short_adj:
is_inflected = True
pagemsg("Assuming word #%s is short adjectival, inflected: lemma=%s, infl=%s" %
(wordind, lemma, infl))
if saw_noun:
pagemsg("WARNING: Word #%s is adjectival inflected and follows inflected noun: lemma=%s, infl=%s" %
(wordind, lemma, infl))
elif re.search("(ый|ий|ой)$", lemma):
if re.search("(ый|ий|о́й|[ая]́?я|[ое]́?е|[ыи]́?е|ь[яеи])$", infl):
is_inflected = True
pagemsg("Assuming word #%s is adjectival, inflected: lemma=%s, infl=%s" %
(wordind, lemma, infl))
if saw_noun:
pagemsg("WARNING: Word #%s is adjectival inflected and follows inflected noun: lemma=%s, infl=%s" %
(wordind, lemma, infl))
else:
pagemsg("Assuming word #%s is adjectival, uninflected: lemma=%s, infl=%s" %
(wordind, lemma, infl))
elif canon_lemma == canon_infl:
if canon_lemma in particles:
pagemsg("Assuming word #%s is an uninflected particle: lemma=%s, infl=%s" %
(wordind, lemma, infl))
elif canon_lemma in is_uninflected:
pagemsg("Assuming word #%s is an uninflected non-particle because listed as uninflected: lemma=%s, infl=%s" %
(wordind, lemma, infl))
else:
is_inflected = True
pagemsg("Assuming word #%s is noun, inflected: lemma=%s, infl=%s" %
(wordind, lemma, infl))
if saw_noun:
if pagetitle in all_parts_declined:
pagemsg("Saw second apparently inflected noun at word #%s, allowed because pagetitle in all_parts_declined: lemma=%s, infl=%s" %
(wordind, lemma, infl))
else:
pagemsg("WARNING: Saw second apparently inflected noun at word #%s, skipping: lemma=%s, infl=%s" %
(wordind, lemma, infl))
return
else:
saw_noun = True
else:
# FIXME, be smarter about nouns conjoined with и, e.g. Адам и Ева,
# (might not be worth it, only five such nouns)
if genders_include_pl and not saw_noun and not reached_uninflected:
# Check for plural inflection
for sgend, plend, gender, is_sc1 in pl_data:
if sgend:
check_sgend = sgend
else:
check_sgend = consonant_re
if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(sgend + "$", plend, canon_lemma):
pagemsg("Assuming word #%s is plural noun, inflected: lemma=%s, infl=%s" %
(wordind, lemma, infl))
saw_noun = True
is_inflected = True
break
if not is_inflected:
pagemsg("Assuming word #%s is non-adjectival, uninflected: lemma=%s, infl=%s" %
(wordind, lemma, infl))
if not saw_noun:
pagemsg("WARNING: No inflected noun in headword, skipping: %s" %
headword)
return
if is_inflected:
if reached_uninflected:
if separators[wordind - 2] == "-":
# Cases like сербско-хорватский, Народно-Демократическая,
# Центрально-Африканская, военно-морские
pagemsg("WARNING: Word #%s is apparently inflected and follows uninflected word after hyphen, allowed, please check: lemma=%s, infl=%s" %
(wordind, lemma, infl))
else:
pagemsg("WARNING: Word #%s is apparently inflected and follows uninflected words, something might be wrong (or could be accusative after preposition), skipping: lemma=%s, infl=%s" %
(wordind, lemma, infl))
# FIXME, compile list where this is allowed
return
inflected_words.add(lemma)
else:
reached_uninflected = True
if lemma in inflected_words:
pagemsg("WARNING: Lemma appears both in inflected and uninflected words, can't handle skipping: lemma=%s (infl=%s at second appearance at word#%s)" %
(lemma, infl, wordind))
params = []
saw_noun = False
overall_num = None
overall_anim = None
wordind = 0
offset = 0
decl_notes = []
for word, lemmainfl in zip(headwords, lemmas_infls):
wordind += 1
lemma, infl = lemmainfl
# If not first word, add _ separator between words
if wordind > 1:
if separators[wordind - 2] == "-":
separator = "-"
elif separators[wordind - 2] == " ":
separator = "_"
else:
pagemsg("WARNING: Something wrong, separator for word #%2 isn't space or hyphen: <%s>" %
separators[wordind - 2])
return
params.append((str(offset + 1), separator))
offset += 1
if lemma in inflected_words:
inflected_words.remove(lemma)
pagemsg("Looking up declension for lemma %s, infl %s" % (lemma, infl))
retval = find_decl_args(lemma, infl, wordind)
if not retval:
pagemsg("WARNING: Can't get declension for %s, skipping" % headword)
return
wordparams, isadj, num, anim = retval
num_numbered_params = 0
if not isadj:
if saw_noun:
if wordind == 2 and len(headwords) == 2 and separator == "-":
pagemsg("WARNING: Found apparent coordinate noun headword A-B, using first noun for overall num and anim, please check")
elif see_template:
pagemsg("WARNING: Multiple inflected nouns with ru-decl-noun-see template, allowing but please check")
else:
pagemsg("WARNING: Multiple inflected nouns without ru-decl-noun-see template, can't handle, skipping")
return
else:
overall_num = num
overall_anim = anim
saw_noun = True
for name, val in wordparams:
if name == "notes":
decl_notes.append(val)
else:
if re.search("^[0-9]+$", name):
name = str(int(name) + offset)
num_numbered_params += 1
params.append((name, val))
offset += num_numbered_params
else:
# Invariable
if rulib.is_unstressed(infl):
word = "*" + word
if infl == "и":
pagemsg("WARNING: Found и, check number args")
params.append((str(offset + 1), word))
params.append((str(offset + 2), "$"))
offset += 2
if inflected_words:
pagemsg("WARNING: Some inflected words left over, something wrong, skipping: %s" %
", ".join(inflected_words))
return
if len(decl_notes) > 1:
pagemsg("WARNING: Found multiple notes=, can't handle, skipping: notes=%s" %
" // ".join("<%s>" % x for x in decl_notes))
return
elif len(decl_notes) == 1:
pagemsg("WARNING: Found notes=, need to check: notes=<%s>" % decl_notes[0])
params.append(("notes", decl_notes[0]))
if not saw_noun and not pagetitle in allow_no_inflected_noun:
pagemsg("WARNING: No inflected nouns, something might be wrong (e.g. the пистоле́т-пулемёт То́мпсона problem), can't handle, skipping")
return
if overall_anim in ["i", "in", "inan"] or not overall_anim:
overall_anim = "in"
elif overall_anim in ["a", "an", "anim"]:
overall_anim = "an"
elif overall_anim in ["b", "bi", "bian", "both"]:
overall_anim = "bi"
saw_in = -1
saw_an = -1
for i,g in enumerate(genders):
if re.search(r"\bin\b", g) and saw_in < 0:
saw_in = i
if re.search(r"\ban\b", g) and saw_an < 0:
saw_an = i
if saw_in >= 0 and saw_an >= 0 and saw_in < saw_an:
headword_anim = "ia"
elif saw_in >= 0 and saw_an >= 0:
headword_anim = "ai"
elif saw_an >= 0:
headword_anim = "an"
elif saw_in >= 0:
headword_anim = "in"
else:
headword_anim = overall_anim
if overall_anim != headword_anim:
pagemsg("WARNING: Overriding decl anim %s with headword anim %s" % (
overall_anim, headword_anim))
if headword_anim and headword_anim != "in":
params.append(("a", headword_anim))
if overall_num:
overall_num = overall_num[0:1]
canon_nums = {"s":"sg", "p":"pl", "b":"both"}
if overall_num in canon_nums:
overall_num = canon_nums[overall_num]
else:
pagemsg("WARNING: Bogus value for overall num in decl, skipping: %s" % overall_num)
return
if headword_is_proper:
plval = getparam(headword_template, "4")
if plval and plval != "-":
if overall_num != "both":
pagemsg("WARNING: Proper noun is apparently sg/pl but main noun not, skipping: %s" %
headword)
return
elif overall_num == "both":
pagemsg("WARNING: Proper noun has sg/pl main noun underlying it, assuming singular: %s" %
headword)
overall_num = None
elif overall_num == "sg":
overall_num = None
if overall_num:
params.append(("n", overall_num))
generate_template = (
blib.parse_text("{{ru-generate-noun-args}}").filter_templates()[0])
for name, value in params:
generate_template.add(name, value)
proposed_template_text = str(generate_template)
if headword_is_proper:
proposed_template_text = re.sub(r"^\{\{ru-generate-noun-args",
"{{ru-proper noun+", proposed_template_text)
else:
proposed_template_text = re.sub(r"^\{\{ru-generate-noun-args",
"{{ru-noun+", proposed_template_text)
proposed_decl = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
for param in generate_template.params:
proposed_decl.add(param.name, param.value)
def pagemsg_with_proposed(text):
pagemsg("Proposed new template (WARNING, omits explicit gender and params to preserve from old template): %s" % proposed_template_text)
pagemsg(text)
if headword_is_proper:
generate_template.add("ndef", "sg")
generate_result = expand_text(str(generate_template))
if not generate_result:
pagemsg_with_proposed("WARNING: Error generating noun args, skipping")
return
genargs = blib.split_generate_args(generate_result)
if headword_is_proper and genargs["n"] == "s" and not getparam(proposed_decl, "n"):
proposed_decl.add("n", "sg")
# This will check number mismatch (and animacy mismatch, but that shouldn't
# occur as we've taken the animacy directly from the headword)
new_genders = runounlib.check_old_noun_headword_forms(headword_template, genargs,
subpagetitle, pagemsg_with_proposed, laxer_comparison=True)
if new_genders == None:
return None
orig_headword_template = str(headword_template)
params_to_preserve = runounlib.fix_old_headword_params(headword_template,
params, new_genders, pagemsg_with_proposed)
if params_to_preserve == None:
return None
headword_template.params.extend(params_to_preserve)
notes = []
ru_noun_changed = 0
ru_proper_noun_changed = 0
if str(headword_template.name) == "ru-noun":
headword_template.name = "ru-noun+"
notes.append("convert multi-word ru-noun to ru-noun+ by looking up decls")
else:
headword_template.name = "ru-proper noun+"
notes.append("convert multi-word ru-proper noun to ru-proper noun+ by looking up decls")
pagemsg("Replacing headword %s with %s" % (orig_headword_template, str(headword_template)))
newtext = str(parsed)
if see_template:
orig_see_template = str(see_template)
del see_template.params[:]
see_template.name = "ru-noun-table"
for param in proposed_decl.params:
see_template.add(param.name, param.value)
pagemsg("Replacing see-template %s with decl %s" % (orig_see_template, str(see_template)))
notes.append("replace see-template with declension")
newtext = str(parsed)
else:
if "==Declension==" in newtext:
pagemsg("WARNING: No ru-decl-noun-see template, but found declension section, not adding new declension, proposed declension follows: %s" %
str(proposed_decl))
else:
nounsecs = re.findall("^===(?:Noun|Proper noun)===$", newtext, re.M)
if len(nounsecs) == 0:
pagemsg("WARNING: Found no noun sections, not adding new declension, proposed declension follows: %s" %
str(proposed_decl))
elif len(nounsecs) > 1:
pagemsg("WARNING: Found multiple noun sections, not adding new declension, proposed declension follows: %s" %
str(proposed_decl))
else:
text = newtext
newtext = re.sub(r"\n*$", "\n\n", newtext)
# Sub in after Noun or Proper noun section, before a following section
# (====Synonyms====) or a wikilink ([[pl:гонка вооружений]]) or
# a category ([[Category:...]]).
newtext = re.sub(r"^(===(?:Noun|Proper noun)===$.*?)^(==|\[\[|\Z)",
r"\1====Declension====\n%s\n\n\2" % str(proposed_decl), newtext,
1, re.M|re.S)
if text == newtext:
pagemsg("WARNING: Something wrong, can't sub in new declension, proposed declension follows: %s" %
str(proposed_decl))
else:
pagemsg("Subbed in new declension: %s" % str(proposed_decl))
notes.append("create declension from headword")
if args.verbose:
pagemsg("Replaced <%s> with <%s>" % (text, newtext))
return newtext, notes
# Script entry point: parse the command line, assemble the default list of
# tracking pages, and hand everything to blib's page-processing driver.
parser = blib.create_argparser(
  "Convert ru-noun to ru-noun+, ru-proper noun to ru-proper noun+ for multiword nouns",
  include_pagefile=True,
  include_stdin=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

# Default reference pages, passed as `default_refs` below. For each part of
# speech we list the "space in headword" tracking page first and then the
# "hyphen but no space" one. To restrict a run to proper nouns only, narrow
# the part-of-speech list to ["proper nouns"].
refs = [
  tracking_page % pos
  for pos in ["nouns", "proper nouns"]
  for tracking_page in [
    "Template:tracking/ru-headword/space-in-headword/%s",
    "Template:tracking/ru-headword/hyphen-no-space-in-headword/%s",
  ]
]

blib.do_pagefile_cats_refs(
  args, start, end, process_text_on_page,
  edit=True, stdin=True, default_refs=refs)