def base_word(word):
    """Return *word* reduced to its plain base spelling.

    Hyphens, pipes, dots and middle dots are dropped, the stress marks are
    removed via ``remove_stress``, surrounding whitespace is trimmed, and any
    ``<u>..</u>`` wrapper around one or two characters is unwrapped.
    """
    cleaned = word
    # Separator marks that are stripped from the word before stress removal.
    for mark in ('-', '|', '.', u'·'):
        cleaned = cleaned.replace(mark, '')
    cleaned = remove_stress(cleaned).strip()
    # Keep the underlined character(s) but drop the markup itself.
    return re.sub('<u>(.{1,2})</u>', '\\1', cleaned)
def print_items(items): print for value, items in results.items(): print "— value:", remove_stress(value) for i in range(len(items)): item = items[i] if item: print "— ", i, item.get(u"лицо", "-") else: print "— ", i, item print
def tpl_action(self, page, tpl, title, morph, lang, params): title = title.strip() # empty_templates = [ # u"сущ ru m ina", u"сущ ru f ina", u"сущ ru n ina", # u"сущ ru m a", u"сущ ru f a", u"сущ ru n a", # ] # if title in empty_templates: # print page.title #if morph != u'сущ': # continue # print '-' * 80 # output # self.counter += 1 # if not self.counter % 50: # save_wiki_page(u"Участник:Vitalik/Словоформы/v2/А1/%s" % self.n, # self.content, u"Получение списка словоформ") # self.n += 1 # self.content = '' # # print "\n\n== [[%s]] ==" % page.title # self.content += "\n\n== [[%s]] ==\n" % page.title # print title # return #continue call_params, call_numeric = process_call_params(params) try: template = TemplateInflection.objects.get(title=title) except ObjectDoesNotExist: return # todo: process templates redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ") if not template.forms: print title, '- maybe wrong template' return print title tpl_forms = get_dict_from_text(template.forms) tpl_params = get_dict_from_text(template.params) for key, value in tpl_params.items(): tpl_params[key] = universal_process_template(value, call_params) for key, value in tpl_forms.items(): value = universal_process_template(value, call_params) value = process_template(value, tpl_params, key, morph) value = divide_words(value) tpl_forms[key] = value form_results = dict() for key, values in tpl_forms.items(): if morph == u'сущ': if key == 'nom-sg': value = values[0] if remove_stress(value) and page.title != remove_stress(value): pass # todo: мсправить их все-таки # print # print "https://ru.wiktionary.org/wiki/%s" % urllib.quote_plus(page.title.encode('utf-8')) # print page.title # print remove_stress(value) # print repr(page.title) # print repr(remove_stress(value)) for key, values in tpl_forms.items(): # if re.search('[a-z]', ' '.join(values)): #todo! find them! 
# print values if not values or len(values) == 1 and not values[0]: continue form_params = get_form_params(morph, key, tpl_params) if not form_params: continue for value in values: if remove_stress(value) == page.title: continue form_results.setdefault(value, list()) form_results[value].append(form_params.copy()) # todo: если полностью совпали, то тоже удалять join_form_results(morph, form_results) db_forms = list() for value, items in form_results.items(): for form_params in items: if not form_params: continue form_template = u"{{Форма-%s\n|язык=ru\n|база=%s\n" % (morph, page.title) for param_name, param_value in form_params.items(): form_template += "|%s=%s\n" % (param_name, param_value) form_template += u"|слоги={{по-слогам|%s}}\n" % value form_template += "}}" # output # # print "'''[[%s]]'''" % remove_stress(value) # # print form_template # self.content += "'''[[%s]]'''\n" % remove_stress(value) # self.content += "%s\n" % form_template # # print # db_form, created = WordForm.objects.get_or_create( # # if created: # WordForm.objects.create( # title=remove_stress(value), # base=page.title, # value=value, # template=form_template # ) db_form = WordForm( title=remove_stress(value), base=page.title, value=value, template=form_template ) db_forms.append(db_form) # db_counter += 1 # if len(db_forms) > 1000: # WordForm.objects.bulk_create(db_forms) # print dt(), '> forms added:', db_counter # db_forms = [] WordForm.objects.bulk_create(db_forms)
def check_header(self, page, before, header, after):
    """Return the raw header line when the header looks wrong, else ``None``.

    Args:
        page: page object; only ``page.title`` is read.
        before: the run of "=" characters preceding the header text.
        header: the header text between the markers.
        after: the run of "=" characters following the header text.

    Returns:
        ``before + header + after`` when the header is reported:
        * deeper than level 2 and not present in ``header_levels``;
        * level 2, but neither a known part of speech, nor a
          ``{{заголовок|`` template, nor "<title>[ <roman numeral>][ (<part of speech>)]".
        ``None`` in every other case, including mismatched markers and wrong
        levels, which are treated as different kinds of error.
    """
    base_morphs = [
        u"прилагательное",
        u"наречие",
        u"существительное",
        u"междометие",
        u"частица",
        u"глагол",
        u"местоимение",
        u"причастие",
        u"деепричастие",
        u"союз",
        u"числительное",
        u"предлог",
    ]
    # Both the lowercase and the capitalized spelling are accepted.
    morphs = base_morphs + [name.capitalize() for name in base_morphs]

    title = page.title
    line = "%s%s%s" % (before, header, after)
    header = header.strip()

    if before != after:
        return  # a different error (syntax)

    need_level = header_levels.get(header)
    level = len(before)

    if level > 2:
        if not need_level:
            return line
    elif level == 2:
        if need_level:
            return  # a different error (level)
        if header in morphs:
            return
        if u"{{заголовок|" in header:
            return  # using the {{заголовок}} template is fine

        # re.escape covers every regex metacharacter; the previous manual
        # escaping missed "+", "[", "]", "{", "}", "|", "^" and "\", so such
        # titles produced a wrong or even invalid pattern.
        pattern = u"^%s(?P<num> [IV]+)?( \\((?P<morph>[а-я]+)\\))?$" % re.escape(title)
        # TODO: liquide: == существительное II ==
        # TODO: list: ==list IV(глагол)==

        cleaned_header = remove_stress(header)
        # Latin vowels with a macron are compared as their plain counterparts.
        latin_replacer = {u"ā": u"a", u"ē": u"e", u"ī": u"i", u"ō": u"o", u"ū": u"u"}
        for key, value in latin_replacer.items():
            cleaned_header = cleaned_header.replace(key, value)

        m = re.match(pattern, cleaned_header, flags=re.UNICODE)
        if not m:
            return line
        morph = m.group("morph")
        if morph and morph not in morphs:
            return line
    return
def try_to_get_infinitive(): print "try_to_get_infinitive()" i = 0 results = dict() for page in Page.objects.iterate(): title = page.title i += 1 # if i > 10000: # break if not i % 1000: print dt(), "processed pages:", i content = page.content p = re.compile( u"""(\{\{ (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+) # заголовок (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?) # параметры \}\})""", flags=re.UNICODE + re.DOTALL + re.VERBOSE, ) parts = p.findall(content) for part in parts: m = p.search(part[0]) if m: template_title = m.group("title").strip() morph = m.group("morph") if morph != u"гл": continue # print # print page.title # print template_title call_params = process_call_params(m.group("params")) base_params = dict() for key, value in call_params.items(): if key.startswith(u"основа"): base_params[key] = (value, remove_stress(value)) # print # print title for key in sorted(base_params.keys()): value, value_no_stress = base_params[key] if not value: # print '###', key break if title.startswith(value_no_stress): # print value_no_stress, replace_stress(value) suffix = title[len(value_no_stress) :] temp_replaced = replace_stress(value) if "'" not in temp_replaced: # print repr(temp_replaced) # print repr(suffix) if u"ё" not in title: suffix = re.sub(u"([аеиоуыэюя])", u"\g<1>́", suffix, count=1, flags=re.UNICODE) # print repr(suffix) # print temp_replaced + '|' + replace_stress(suffix), '' if key == u'основа' else '$' * 80 results.setdefault(template_title, list()) results[template_title].append((suffix, key)) break for template_title, data in results.items(): data = set(data) print template_title for suffix, key in data: print suffix, "" if key == u"основа" else "$" * 80 + key
def process_slovoforms_old(): print "process_slovoforms()" i = 0 keys = set() for page in Page.objects.iterate(): i += 1 # if i > 100: # break if not i % 10000: print dt(), i try: content = PageContent.objects.get(page=page).content except ObjectDoesNotExist: print u"× does not exist" continue # morph = u'сущ' # m = re.search(u'(\{\{(?P<title>гл ru [^|]+)(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)\}\})', content, flags=re.UNICODE + re.DOTALL) # todo: этих блоков (шаблонов словоизменения) может быть несколько!! m = re.search( u"(\{\{(?P<title>(?P<morph>сущ|гл) ru [^|]+)(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)\}\})", content, flags=re.UNICODE + re.DOTALL, ) # print page.pk, m if m: title = m.group("title").strip() morph = m.group("morph") source_params = m.group("params") source_params = re.sub(u"\{\{по[- ]слогам[^}]+\}\}", "", source_params) source_params = source_params.split("|") source_params = map(lambda x: x.strip(), source_params) source_params = filter(len, source_params) # print # print '=' * 20 # print page.title # print title source_params = get_dict_from_lines(source_params) keys |= set(source_params.keys()) # for key, value in source_params.items(): # print key, '=', value # print '-' * 20 try: template = TemplateInflectionData.objects.get(title=title) except ObjectDoesNotExist: # print ('#' * 120 + '\n') * 20 continue # todo: process templates redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ") forms = get_dict_from_text(template.forms) params = get_dict_from_text(template.params) # print template.forms # print '-' * 20 # for key, value in forms.items(): # print key, '=', value # print '-' * 20 for param, value in params.items(): params[param] = process_template(value, source_params) # for key, value in params.items(): # print key, '=', value # print '-' * 20 for form, value in forms.items(): value = process_template(value, source_params) forms[form] = divide_words(value) # todo: буду — всегда инфинитив? # todo: не добавлять их в результаты!! 
future = [ u"буду/будешь… ", u"буду/будешь... ", u"буду, будешь, будем… ", u"буду, будешь, будет… ", u"буду, будешь, будет ", u"будет ", u"будет… ", u"буду ", ] value = value.strip() for prefix in future: if value[: len(prefix)] == prefix: value = value[len(prefix) :] if page.title != value.replace(u"́", ""): pass # print # print page.title # print value.replace(u'́', '') # print repr(page.title) # print repr(value.replace(u'́', '')) sv = "" if morph == u"гл": try: sv = params[u"вид"][0] except KeyError: continue # todo: how it is possible? results = dict() for key, values in forms.items(): # if re.search('[a-z]', ' '.join(values)): #todo! find them! # print values if not values or len(values) == 1 and not values[0]: continue # print key, '=', ' | '.join(values).replace(u'́', '') dest_params = dict() if morph == u"гл": # todo: if "Будущее" -> continue # todo: 123 лицо # todo: род по порядку следования (в прошлом времени) if key in [u"Прич", u"ПричНаст", u"ПричПрош", u"ПричСтрад", u"ПричСтрадПрош"]: # todo: время тоже учитывать? dest_params[u"прич"] = 1 if key in [u"Деепр", u"ДеепрНаст", u"ДеепрПрош", u"ДеепрНастПрош"]: # todo: время тоже учитывать? dest_params[u"деепр"] = 1 if key in [u"Будущее"]: pass if u"(прош.)" in key: dest_params[u"время"] = u"пр" if u"(повел.)" in key: dest_params[u"накл"] = u"п" if u"Мы" in key or u"Вы" in key or u"Они" in key: dest_params[u"число"] = u"мн" if u"Я" in key or u"Ты" in key or u"она" in key: dest_params[u"число"] = u"ед" if u"Я" in key or u"Мы" in key: dest_params[u"лицо"] = u"1" if u"Ты" in key or u"Вы" in key: dest_params[u"лицо"] = u"2" if u"Он" in key: dest_params[u"лицо"] = u"3" if params.get(u"возвратный", ""): dest_params[u"залог"] = u"возвр" if key in [u"Я", u"Мы", u"Ты", u"Вы", u"Он/она/оно", u"Они"]: if sv == u"н": dest_params[u"время"] = u"наст" elif sv == u"с": dest_params[u"время"] = u"буд" else: pass # todo: вопрос - а если там "2"? dest_params[u"вид"] = sv # todo: вопрос - а если там "2"? 
elif morph == u"сущ": dest_params[u"число"] = u"ед" special_cases = {u"П": u"притяжательного", u"Пр": u"превратительного", u"Сч": u"(счетная форма?)"} if key in special_cases: dest_params[u"падеж"] = special_cases[key] else: cases = { "nom": u"именительного", "gen": u"родительного", "dat": u"дательного", "acc": u"винительного", "ins": u"творительного", "prp": u"предложного", "loc": u"местного", "voc": u"звательного", "prt": u"разделительного", } plurals = ["pl", "pl2"] case, plural = key.split("-") if plural in plurals: dest_params[u"число"] = u"мн" dest_params[u"падеж"] = cases[case] if key == "nom-sg": value = values[0] if page.title != remove_stress(value): pass # ошибки! # print # print page.title # print remove_stress(value) # print repr(page.title) # print repr(remove_stress(value)) for value in values: if remove_stress(value) == page.title: continue results.setdefault(value, list()) results[value].append(dest_params.copy()) exceptions = {u"сущ": [u"падеж"], u"гл": [u"лицо"]} def print_items(items): print for value, items in results.items(): print "— value:", remove_stress(value) for i in range(len(items)): item = items[i] if item: print "— ", i, item.get(u"лицо", "-") else: print "— ", i, item print for value, items in results.items(): # print 'value =', remove_stress(value) for i in range(len(items)): # print i, 'start' # print_items(items) dest_params = items[i] # print ' dest_params', i # print '', get_text_from_dict(dest_params).replace('\n', ' ') for j in range(len(items)): if i == j: continue another_params = items[j] if not another_params: continue # print ' another_params', i # print ' ', get_text_from_dict(another_params).replace('\n', ' ') found = True for key in set(dest_params.keys() + another_params.keys()): if key not in exceptions[morph]: if dest_params.get(key) != another_params.get(key): found = False break if found: if morph == u"гл": if u"лицо" in dest_params: another_params[u"лицо"] += dest_params[u"лицо"] elif morph == u"сущ": 
another_params[u"падеж"] += ", " + dest_params[u"падеж"] items[i] = None break # print i, 'finish' # print_items(items) for value, items in results.items(): for dest_params in items: if not dest_params: continue form_template = u"{{Форма-%s\n|база=%s\n" % (morph, page.title) if morph == u"гл": if u"лицо" in dest_params: v = dest_params[u"лицо"] if len(v) == 3: if "1" in v and "2" in v and "3" in v: dest_params[u"лицо"] = "123" else: print "### BAD VALUE FOR u'лицо':", v elif len(v) != 1: print "### BAD VALUE FOR u'лицо':", v print page.title print remove_stress(value) print form_template elif morph == u"сущ": pass # todo: отсортировать падежи! for param_name, param_value in dest_params.items(): form_template += "|%s=%s\n" % (param_name, param_value) form_template += "}}"
def process_slovoforms_new(): print "process_slovoforms()" i = 0 db_forms = [] db_counter = 0 for page in Page.objects.iterate(): i += 1 if not i % 1000: print dt(), "processed pages:", i # try: # content = PageContent.objects.get(page=page).content # except ObjectDoesNotExist: # print u'× does not exist' # continue content = page.content p = re.compile( u"""(\{\{ (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+) # заголовок (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?) # параметры \}\})""", flags=re.UNICODE + re.DOTALL + re.VERBOSE, ) parts = p.findall(content) # if page.title != u'житься': # continue # if page.title != u'плавни': # continue # print page.title # for part in parts: # print '===' # for i in part: # print i # print '---' # print '===' for part in parts: # print part[0] m = p.search(part[0]) if m: title = m.group("title").strip() empty_templates = [ u"сущ ru m ina", u"сущ ru f ina", u"сущ ru n ina", u"сущ ru m a", u"сущ ru f a", u"сущ ru n a", ] if title in empty_templates: print page.title morph = m.group("morph") # if morph != u'сущ': # continue # print page.title # continue call_params = process_call_params(m.group("params")) try: template = TemplateInflectionData.objects.get(title=title) except ObjectDoesNotExist: continue # todo: process templates redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ") tpl_forms = get_dict_from_text(template.forms) tpl_params = get_dict_from_text(template.params) for key, value in tpl_params.items(): tpl_params[key] = universal_process_template(value, call_params) for key, value in tpl_forms.items(): value = universal_process_template(value, call_params) value = process_template(value, tpl_params, key, morph) value = divide_words(value) tpl_forms[key] = value form_results = dict() for key, values in tpl_forms.items(): if morph == u"сущ": if key == "nom-sg": value = values[0] if remove_stress(value) and page.title != remove_stress(value): pass # todo: мсправить их все-таки # print # print "https://ru.wiktionary.org/wiki/%s" % 
urllib.quote_plus(page.title.encode('utf-8')) # print page.title # print remove_stress(value) # print repr(page.title) # print repr(remove_stress(value)) for key, values in tpl_forms.items(): # if re.search('[a-z]', ' '.join(values)): #todo! find them! # print values if not values or len(values) == 1 and not values[0]: continue form_params = get_form_params(morph, key, tpl_params) if not form_params: continue for value in values: if remove_stress(value) == page.title: continue form_results.setdefault(value, list()) form_results[value].append(form_params.copy()) # todo: если полностью совпали, то тоже удалять join_form_results(morph, form_results) for value, items in form_results.items(): for form_params in items: if not form_params: continue form_template = u"{{Форма-%s\n|база=%s\n" % (morph, page.title) for param_name, param_value in form_params.items(): form_template += "|%s=%s\n" % (param_name, param_value) form_template += "}}" # print remove_stress(value) # print form_template # print # db_form, created = WordForm.objects.get_or_create( db_form = WordForm( title=remove_stress(value), base=page.title, value=value, template=form_template ) # if created: db_forms.append(db_form) db_counter += 1 if len(db_forms) > 1000: WordForm.objects.bulk_create(db_forms) print dt(), "> forms added:", db_counter db_forms = [] WordForm.objects.bulk_create(db_forms)