def lang_action(self, page, lang, content):
    title = page.title
    if title.startswith('-') or title.endswith('-'):
        return content
    if title.startswith('*'):
        return content

    # Split the language section into level-2 ("== ... ==") subsections.
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                     flags=re.MULTILINE)
    sections = [
        {'header2': '', 'content': parts.pop(0)}
    ]
    sections += [
        {'header2': part[0], 'content': part[1]}
        for part in chunks(parts, 2)
    ]

    # Bail out if any level-2 header is not a {{з|…}} / {{заголовок|…}} header.
    for data in sections:
        # print '-' * 80
        header2 = data['header2']
        # print header2
        if not header2:
            continue
        p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        m = p.match(header2)
        if not m:
            # print header2, '==', '#' * 120
            return content

    new_content = content
    for data in sections:
        section_content = data['content']
        headers = convert_headers(self.get_headers(section_content))
        if not headers:
            continue
        if ' ' not in title and u'=== Морфологические и синтаксические свойства ===' not in headers:
            # print u'# [[{}]] (секция "{}")'.format(title, lang)
            # print '=' * 120
            # print section_content
            # Insert the missing morphology header before the first bold form
            # or the {{падежи …}} template.
            if section_content.strip().startswith(u'<b>'):
                new_section_content = \
                    re.sub(u'^\s*<b>',
                           u'\n=== Морфологические и синтаксические свойства ===\n<b>',
                           section_content)
                new_content = new_content.replace(section_content,
                                                  new_section_content)
            elif section_content.strip().startswith(u'{{падежи '):
                new_section_content = \
                    re.sub(u'^\s*\{\{падежи ',
                           u'\n=== Морфологические и синтаксические свойства ===\n{{падежи ',
                           section_content)
                new_content = new_content.replace(section_content,
                                                  new_section_content)
    return new_content
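# Note: the chunks() helper used above to pair up the results of re.split()
# is not defined in this section. A minimal sketch of what it is assumed to
# do (group a flat list into consecutive fixed-size tuples):
#
# def chunks(seq, size):
#     # chunks(['== h1 ==', 'body1', '== h2 ==', 'body2'], 2)
#     # -> [('== h1 ==', 'body1'), ('== h2 ==', 'body2')]
#     return [tuple(seq[i:i + size]) for i in range(0, len(seq), size)]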
def section_action(self, page, lang, section_content): title = page.title if title.startswith('-') or title.endswith('-'): return if title.startswith('*'): return headers = convert_headers(self.get_headers(section_content)) if not headers: return m = re.search(u'\n(=== *Семантические свойства *===\n+' u'==== *Значение *====\n(?P<mining>.*?)' u'==== *Синонимы *====\n(?P<syn>.*?)' u'==== *Антонимы *====\n(?P<ant>.*?)' u'==== *Гиперонимы *====\n(?P<gyper>.*?)' u'==== *Гипонимы *====(?P<gyp>.*?))' u'\n===[^=]', section_content, re.UNICODE | re.DOTALL) if m: mining = m.group('mining').strip() syn = m.group('syn').strip() ant = m.group('ant').strip() gyper = m.group('gyper').strip() gyp = m.group('gyp').strip() m2 = re.search(u"^(?P<first_line># *\{\{(?P<label>(авиац|австрал|автомоб|автомоб. жарг|агрон|алхим|альп|амер|анат|антроп|артилл|археол|архит|астрол|астрон|безл|библейск|биол|биохим|бирж|болг|ботан|браз|бранн|брит|бухг|вет|вин|военн|военн. жарг|вульг|высок|гастрон|генет|геогр|геод|геол|геометр|геофиз|геральд|гидрол|горн|грам|груб|детск|диал|дигорск|дипл|дисфм|доминик|дор|ед. ч|ест|ж.-д|жарг|живоп|зоол|игр|интернет|информ|ион|ирл|ирон|искаж|искусств|исп|истор|исч|ихтиол|йогич|канадск|канц|карт|картеж. жарг|керам|кинол|книжн|комп|комп. жарг|косм|космет|крим|крим. жарг|кубан|кулин|культурол|лес|лингв|лог|матем|машин|мед|металл|метеорол|метон|мех|микол|микробиол|милиц. жарг|минер|мифол|мол|морск|муз|муз. жарг|нар.-поэт|нар.-разг|научн|неисч|нем|неодобр|неодуш|неол|неофиц|неперех|неправ|нескл|нефтегаз|нидерл|нов.-зел|нумизм|образ|обсц|одобр|одуш|океан|оккульт|опт|орнитол|оскорб|офиц|охотн|палеонт|паразит|парикмах|перен|перех|плотн|полигр|полит|полит. жарг|полиц. жарг|порт|портн|поэт|презр|пренебр|прогр|прост|проф|психиатр|психол|публиц|разг|редк|рекл|религ|ритор|рыбол|с.-х|сад|сексол|сниж|собир|совет|социол|спелеол|спец|спорт|старин|стат|статив|стекловарн|стих|столярн|строит|студ. жарг|тайв|театр|текст|техн|техн. жарг|тж|типогр|тлв|торг|торж|трад.-поэт|тракторн|трансп|уважит|укр|управл|усеч|устар|фам|фант|фарм|физ|физиол|филат|филол|филос|фин|фолькл|фотогр|хим|хоз|хореогр|худ.пр|худож|церк|церк.-слав|цирк|цитол|шахм|швейн|школьн|шотл|шутл|эвф|экол|экон|эл.-техн|эл.-энерг|энтомол|эол|этногр|этнолог|ювел|юр)" u"\.)(\|(?P<lang>[a-z]{2,3}))\}\} *" u"(?P<mining>[^{}'<>\n]*) *(?P<example>\{\{пример\|[^}]*\}\}))\n#$", mining) if not m2: return # print '=' * 40 # print m.group(1).strip() # print '-' * 40 # print mining # print '-' * 40 # print m2.group(0) # print '=' * 40 for block in [syn, ant, gyper, gyp]: # if not re.search("^#[^{}'<>\n]*\n#$", block): # if not re.search("^#\s*\n#$", block) \ # and not re.search("^# *(\[\[[^]]+\]\]([,;] )?)+\n#$", block) : if not re.search("^# *(?P<value>(\[\[[^]]+\]\]([,;] )?)*)\n#$", block) : return # if m2.group('lang') != lang: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, m2.group('first_line')) # # raise Exception(title) print '=' * 40 print m.group(1).strip() print '-' * 40
def section_action(self, page, lang, section_content):
    title = page.title
    if title.startswith('-') or title.endswith('-'):
        return
    if title.startswith('*'):
        return
    headers = convert_headers(self.get_headers(section_content))
    if not headers:
        return
    if ' ' not in title and u'=== Морфологические и синтаксические свойства ===' not in headers:
        print u'# [[{}]] (секция "{}")'.format(title, lang)
def section_action(self, page, lang, section_content): title = page.title if title.startswith('-') or title.endswith('-'): return if title.startswith('*'): return headers = convert_headers(self.get_headers(section_content)) if not headers: return p = re.compile(u'\n(=== *Семантические свойства *===\n+' u'==== *Значение *====\n\s*(?P<first_line># *\{\{(?P<label>(авиац|австрал|автомоб|автомоб. жарг|агрон|алхим|альп|амер|анат|антроп|артилл|археол|архит|астрол|астрон|безл|библейск|биол|биохим|бирж|болг|ботан|браз|бранн|брит|бухг|вет|вин|военн|военн. жарг|вульг|высок|гастрон|генет|геогр|геод|геол|геометр|геофиз|геральд|гидрол|горн|грам|груб|детск|диал|дигорск|дипл|дисфм|доминик|дор|ед. ч|ест|ж.-д|жарг|живоп|зоол|игр|интернет|информ|ион|ирл|ирон|искаж|искусств|исп|истор|исч|ихтиол|йогич|канадск|канц|карт|картеж. жарг|керам|кинол|книжн|комп|комп. жарг|косм|космет|крим|крим. жарг|кубан|кулин|культурол|лес|лингв|лог|матем|машин|мед|металл|метеорол|метон|мех|микол|микробиол|милиц. жарг|минер|мифол|мол|морск|муз|муз. жарг|нар.-поэт|нар.-разг|научн|неисч|нем|неодобр|неодуш|неол|неофиц|неперех|неправ|нескл|нефтегаз|нидерл|нов.-зел|нумизм|образ|обсц|одобр|одуш|океан|оккульт|опт|орнитол|оскорб|офиц|охотн|палеонт|паразит|парикмах|перен|перех|плотн|полигр|полит|полит. жарг|полиц. жарг|порт|портн|поэт|презр|пренебр|прогр|прост|проф|психиатр|психол|публиц|разг|редк|рекл|религ|ритор|рыбол|с.-х|сад|сексол|сниж|собир|совет|социол|спелеол|спец|спорт|старин|стат|статив|стекловарн|стих|столярн|строит|студ. жарг|тайв|театр|текст|техн|техн. жарг|тж|типогр|тлв|торг|торж|трад.-поэт|тракторн|трансп|уважит|укр|управл|усеч|устар|фам|фант|фарм|физ|физиол|филат|филол|филос|фин|фолькл|фотогр|хим|хоз|хореогр|худ.пр|худож|церк|церк.-слав|цирк|цитол|шахм|швейн|школьн|шотл|шутл|эвф|экол|экон|эл.-техн|эл.-энерг|энтомол|эол|этногр|этнолог|ювел|юр)' u'\.)(\|(?P<lang>[a-z]{2,3}))\}\} *' u"(?P<mining>[^{}'<>\n]*) *(?P<example>\{\{пример\|[^}]*\}\}))\n#\s*" u'==== *Синонимы *====\n\s*# *(?P<syn>(\[\[[^]]+\]\]([,;] )?)*)\n#\s*' u'==== *Антонимы *====\n\s*# *(?P<ant>(\[\[[^]]+\]\]([,;] )?)*)\n#\s*' u'==== *Гиперонимы *====\n\s*# *(?P<hyper>(\[\[[^]]+\]\]([,;] )?)*)\n#\s*' u'==== *Гипонимы *====\s*# *(?P<hyp>(\[\[[^]]+\]\]([,;] )?)*)\n#\s*)' u'\n===(?P<tail>[^=])', re.UNICODE | re.DOTALL) m = p.search(section_content) if m: old = m.group(1) new = p.sub(u""" === Семантические свойства === # {{значение |определение = \g<mining> |пометы = [\g<label>] |примеры = \g<example> |синонимы = \g<syn> |конверсивы = |антонимы = \g<ant> |гиперонимы = \g<hyper> |гипонимы = \g<hyp> |согипонимы = |холонимы = |меронимы = |управление = |категории = |якорь = |язык = \g<lang> }}') """, u"\n{}\n=== ".format(old)) print title
def section_action(self, page, lang, section_content):
    super(NoMorphology, self).section_action(page, lang, section_content)
    title = page.title
    if title.startswith('-') or title.endswith('-'):
        return
    if title.startswith('*'):
        return
    if lang in ['INT', 'mul', 'Zmth', 'Hani', 'hani', 'hanzi']:
        return
    headers = convert_headers(self.get_headers(section_content))
    if not headers:
        return
    if ' ' not in title and u'=== Морфологические и синтаксические свойства ===' not in headers:
        # The morphology header is missing: record the page, then bucket it
        # by what the section starts with.
        append_dict_list(self.no_morphology[u'Все результаты'], title, lang)
        if section_content.strip().startswith(u'Существительное') \
                or section_content.strip().startswith(u'Прилагательное') \
                or section_content.strip().startswith(u'Глагол') \
                or section_content.strip().startswith(u'Наречие'):
            append_dict_list(self.no_morphology[u'Часть речи'], title, lang)
        elif re.search('^\{\{(сущ|прил|гл|adv|падежи|нар|interj) ',
                       section_content.strip(), re.UNICODE):
            append_dict_list(self.no_morphology[u'Шаблон часть речи'], title, lang)
        elif section_content.strip().startswith(u'{{Форма'):
            append_dict_list(self.no_morphology[u'Шаблон форма'], title, lang)
        elif section_content.strip().startswith(u'{{длина слова'):
            append_dict_list(self.no_morphology[u'Шаблон длина слова'], title, lang)
        elif section_content.strip().startswith(u'<b>'):
            append_dict_list(self.no_morphology[u'Жирность1'], title, lang)
        elif section_content.strip().startswith(u"'''"):
            append_dict_list(self.no_morphology[u'Жирность2'], title, lang)
        elif u'Тип и синтаксические свойства сочетания' in section_content:
            append_dict_list(self.no_morphology[u'Словосочетания'], title, lang)
        elif section_content.strip().startswith(u'==='):
            append_dict_list(self.no_morphology[u'Пусто'], title, lang)
        else:
            append_dict_list(self.no_morphology[u'Остальное'], title, lang)
        if settings.ALLOW_CYR_PRINT:
            print u'absent # [[{}]] (секция "{}")'.format(title, lang)
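# Note: append_dict_list() is defined elsewhere; the reporting code above
# assumes it appends a value to a per-key list, creating the list on first
# use. A minimal sketch of that assumed behaviour:
#
# def append_dict_list(d, key, value):
#     # e.g. append_dict_list(self.no_morphology[u'Пусто'], title, lang)
#     d.setdefault(key, []).append(value)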
def lang_action(self, page, lang, content): title = page.title if title.startswith('-') or title.endswith('-'): return content if title.startswith('*'): return content new_content = content parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() sections = [ {'header2': '', 'content': parts.pop(0)} ] sections += [ {'header2': part[0], 'content': part[1]} for part in chunks(parts, 2) ] for data in sections: # print '-' * 80 header2 = data['header2'] # print header2 if not header2: continue p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE) m = p.match(header2) if not m: # print header2, '==', '#' * 120 return content # print '-' * 40 # print data['content'] # print '-' * 80 # return content # print '=' * 100 # print content # print '=' * 100 # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content, # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content, # flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() # return content # sections = [ # {'lang': '', 'content': parts.pop(0)} # ] # sections += [ # {'lang': part[0], 'content': part[1]} # for part in chunks(parts, 2) # ] if lang == 'ru': if ' ' in title: template = templates['ru']['phrase'] else: template = templates['ru']['word'] else: if ' ' in title: template = templates['xx']['phrase'] else: template = templates['xx']['word'] for data in sections: section_content = data['content'] headers = convert_headers(self.get_headers(section_content)) if headers == []: # print 'EMPTY' pass # elif headers == template: # # print 'OK' # pass else: wrong_order = False has_unknown_header = False absent = list() for header in template: if header not in headers: # print header, ' -> ABSENT WARNING' # append_dict_list(current_absent, lang, header) # self.all_absent_headers.add(header) absent.append(header) pass t = 0 s = 0 wrong_order_error = None while t < len(template) and s < len(headers): if template[t] == headers[s]: t += 1 s += 1 else: if template[t] in absent: t += 1 else: # print headers[s], ' -> WRONG ORDER ERROR' wrong_order_error = headers[s] wrong_order = True break unknown_headers = list() for header in headers: if header not in template: # print header, ' -> UNKNOWN ERROR' unknown_headers.append(header) has_unknown_header = True # append_dict_list(current_absent, lang, header) absent_semantic_headers = False for h in semantic_headers: if h in absent: absent_semantic_headers = True # if absent_semantic_headers or wrong_order_error: # # print u'{} #{} {}'.format(title, lang, data['header2']) # # print '\n'.join(headers) # # if absent: # # print "\n".join([u"{} -> ABSENT WARNING".format(header) # # for header in absent]) # # if wrong_order_error: # # print u"{} -> WRONG ORDER ERROR".format(wrong_order_error) # # if unknown_headers: # # print "\n".join([u"{} -> UNKNOWN ERROR".format(header) # # for header in unknown_headers]) # # print # pass # elif unknown_headers: # pass # else: if True: m = re.search(u'(==== *Значение *==== *(.*?)' u'\n)===', section_content, re.UNICODE | re.DOTALL) if not m: # print '#' * 100 # print u'title={}, lang={}'.format(title, lang) # print '#' * 100 continue # raise Exception(u'title={}, lang={}'.format(title, lang)) semantic_section = m.group(1) new_semantic_section = semantic_section mining = m.group(2) # if mining.strip() == u'[[]]\n{{Нужен перевод}}': # print section_content # print '=' * 120 has_strange = False for 
line in mining.split('\n'): line = line.strip() # items = re.findall('\{\{[^}]+\}\}', line) # for item in items: # if not item.startswith(u'{{пример|'): # print item # items = re.findall(u'\{\{помета\|[^}]+\}\}', line) # items = re.findall(u'\{\{помета\|[^}|]*\|[^}]*\}\}', line) # for item in items: # print item # items = re.findall(u'\{\{спорт.\|[^}]*вид[^}]*\}\}', line) # for item in items: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, item) # items = re.findall(u'\{\{субстантивир\.\|[^}]*\|[^}]*\}\}', line) # for item in items: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, item) # items = re.findall(u'\{\{ласк\..*\}\}', line) # for item in items: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, item) if line.startswith('#'): print line if not line.strip(): continue if re.match('^#', line): continue if re.match(u'^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$', line, re.UNICODE): continue if re.match(u'^\{\{(длина слова|илл\.?)\|[^}]+\}\}$', line, re.UNICODE): # if re.match(u'^\{\{(длина слова)\|[^}]+\}\}$', line, # re.UNICODE): # if re.match(u'^\{\{(илл\.?)\|[^}]+\}\}$', line, # re.UNICODE): # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) continue # todo: [[Файл: и прочие IMG if re.match(u'^\[\[\]\]$', line, re.UNICODE): continue if re.match(u'^(\[\[[^]]*\]\] )?\{\{Нужен перевод *(\|\w+)?\}\}$', line, re.UNICODE): continue ok = False if re.match(u'^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): ok = True if re.match(u'^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): ok = True if ok: # new_semantic_section = \ # new_semantic_section.replace( # u"\n{}\n".format(line), # u'\n# {}\n'.format(line) # ) continue has_strange = True if lang == 'la': # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif "''" in line: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif "<i>" in line: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif "{" in line: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif "[" in line: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass else: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass if re.match(u'^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): continue if re.match(u'^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): continue # if not has_strange: # new_content = new_content.replace(semantic_section, # new_semantic_section) return new_content
def lang_action(self, page, lang, content):
    title = page.title
    if title.startswith('-') or title.endswith('-'):
        return content
    if title.startswith('*'):
        return content
    new_content = content
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                     flags=re.MULTILINE)
    sections = [
        {'header2': '', 'content': parts.pop(0)}
    ]
    sections += [
        {'header2': part[0], 'content': part[1]}
        for part in chunks(parts, 2)
    ]
    for data in sections:
        header2 = data['header2']
        if not header2:
            continue
        p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        m = p.match(header2)
        if not m:
            print header2, '==', '#' * 120
            return content

    # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
    # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
    #                  flags=re.MULTILINE)
    # for part in parts:
    #     print '-' * 100
    #     print part
    #     print '-' * 100
    # self.stop()
    # return content
    # sections = [
    #     {'lang': '', 'content': parts.pop(0)}
    # ]
    # sections += [
    #     {'lang': part[0], 'content': part[1]}
    #     for part in chunks(parts, 2)
    # ]

    if lang == 'ru':
        if ' ' in title:
            template = templates['ru']['phrase']
        else:
            template = templates['ru']['word']
    else:
        if ' ' in title:
            template = templates['xx']['phrase']
        else:
            template = templates['xx']['word']

    for data in sections:
        section_content = data['content']
        headers = convert_headers(self.get_headers(section_content))
        if headers == []:
            # print 'EMPTY'
            pass
        elif headers == template:
            # print 'OK'
            pass
        else:
            wrong_order = False
            has_unknown_header = False
            absent = list()
            for header in template:
                if header not in headers:
                    # print header, ' -> ABSENT WARNING'
                    # append_dict_list(current_absent, lang, header)
                    # self.all_absent_headers.add(header)
                    absent.append(header)
                    pass
            t = 0
            s = 0
            wrong_order_error = None
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                else:
                    if template[t] in absent:
                        t += 1
                    else:
                        # print headers[s], ' -> WRONG ORDER ERROR'
                        wrong_order_error = headers[s]
                        wrong_order = True
                        break
            unknown_headers = list()
            for header in headers:
                if header not in template:
                    # print header, ' -> UNKNOWN ERROR'
                    unknown_headers.append(header)
                    has_unknown_header = True
                    # append_dict_list(current_absent, lang, header)

            c = 0
            for header in headers:
                if header == u'==== Значение ====':
                    c += 1
            if c > 1:
                print title, '$' * 200
                return content
                # raise Exception('c > 1')

            if u'=== Семантические свойства ===' not in headers \
                    and u'==== Значение ====' in headers:
                # print title, '/', lang
                # print '\n'.join(headers)
                page_content = page.content
                lst = re.findall(u'==== Значение ====', page_content)
                # if len(lst) == 1:
                #     print title
                if len(lst) > 1:
                    new_section_content = section_content.replace(
                        u'\n==== Значение ====\n',
                        u'\n=== Семантические свойства ===\n\n==== Значение ====\n',
                    )
                    new_content = new_content.replace(section_content,
                                                      new_section_content)
    return new_content
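# The header check above walks the expected template and the actual headers
# with two indices, skipping expected headers that are merely absent and
# flagging the first actual header that appears out of order. A standalone
# sketch of that logic (the function name is illustrative, not part of the bot):
#
# def find_wrong_order(template, headers):
#     absent = [h for h in template if h not in headers]
#     t = s = 0
#     while t < len(template) and s < len(headers):
#         if template[t] == headers[s]:
#             t += 1
#             s += 1
#         elif template[t] in absent:
#             t += 1
#         else:
#             return headers[s]  # first header breaking the expected order
#     return None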
def lang_action(self, page, lang, content): title = page.title if title.startswith('-') or title.endswith('-'): return content if title.startswith('*'): return content new_content = content # print page.title, '=' * 100 # print content # print '=' * 100 parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() sections = [ {'header2': '', 'content': parts.pop(0)} ] sections += [ {'header2': part[0], 'content': part[1]} for part in chunks(parts, 2) ] for data in sections: # print '-' * 80 header2 = data['header2'] # print header2 if not header2: continue p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE) m = p.match(header2) if not m: # print header2, '==', '#' * 120 return content # print '-' * 40 # print data['content'] # print '-' * 80 # return content # print '=' * 100 # print content # print '=' * 100 # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content, # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content, # flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() # return content # sections = [ # {'lang': '', 'content': parts.pop(0)} # ] # sections += [ # {'lang': part[0], 'content': part[1]} # for part in chunks(parts, 2) # ] if lang == 'ru': if ' ' in title: template = templates['ru']['phrase'] else: template = templates['ru']['word'] else: if ' ' in title: template = templates['xx']['phrase'] else: template = templates['xx']['word'] for data in sections: section_content = data['content'] headers = convert_headers(self.get_headers(section_content)) if headers == []: # print 'EMPTY' pass elif headers == template: # print 'OK' pass else: wrong_order = False has_unknown_header = False absent = list() for header in template: if header not in headers: # print header, ' -> ABSENT WARNING' # append_dict_list(current_absent, lang, header) # self.all_absent_headers.add(header) absent.append(header) pass t = 0 s = 0 wrong_order_error = None while t < len(template) and s < len(headers): if template[t] == headers[s]: t += 1 s += 1 else: if template[t] in absent: t += 1 else: # print headers[s], ' -> WRONG ORDER ERROR' wrong_order_error = headers[s] wrong_order = True break unknown_headers = list() for header in headers: if header not in template: # print header, ' -> UNKNOWN ERROR' unknown_headers.append(header) has_unknown_header = True # append_dict_list(current_absent, lang, header) absent_semantic_headers = False for h in semantic_headers: if h in absent: absent_semantic_headers = True if absent_semantic_headers or wrong_order_error: # print u'{} #{} {}'.format(title, lang, data['header2']) # print '\n'.join(headers) # if absent: # print "\n".join([u"{} -> ABSENT WARNING".format(header) # for header in absent]) # if wrong_order_error: # print u"{} -> WRONG ORDER ERROR".format(wrong_order_error) # if unknown_headers: # print "\n".join([u"{} -> UNKNOWN ERROR".format(header) # for header in unknown_headers]) # print pass elif unknown_headers: pass else: m = re.search(u'(==== *Значение *==== *\n(.*?)' u'==== *Синонимы *==== *\n(.*?)' u'==== *Антонимы *==== *\n(.*?)' u'==== *Гиперонимы *==== *\n(.*?)' u'==== *Гипонимы *==== *(.*?)' u'\n)===[^=]', section_content, re.UNICODE | re.DOTALL) if not m: print '#' * 200 print u'title={}, lang={}'.format(title, lang) print '#' * 200 continue # raise Exception(u'title={}, lang={}'.format(title, lang)) # print title, '|', lang semantic_section = m.group(1) 
new_semantic_section = semantic_section # print '=' * 40 # print semantic_section # print '-' * 40 mining = m.group(2)#.strip().split('\n') # mining = filter(lambda x: x not in ['#', '# '], mining) # mining_len = len(mining) mining_len = 0 for line in mining.split('\n'): line = line.strip() if not line.strip(): continue if line.strip() in ['#']: continue if re.match('^#', line): mining_len += 1 continue if re.match(u'^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$', line, re.UNICODE): continue if re.match(u'^\{\{(длина слова|илл)\|[^}]+\}\}$', line, re.UNICODE): # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) continue mining_len += 1 if re.match(u'^\[\[\]\]$', line, re.UNICODE): continue if re.match(u'^(\[\[[^]]*\]\] )?\{\{Нужен перевод(\|\w+)?\}\}$', line, re.UNICODE): continue ok = False if re.match(u'^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): ok = True if re.match(u'^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): ok = True if ok: # new_semantic_section = \ # new_semantic_section.replace( # u"\n{}\n".format(line), # u'\n# {}\n'.format(line) # ) continue has_strange = True print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ format(title, lang, line) # print line if re.match(u'^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): continue if re.match(u'^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): continue onim_lens = [0, 0, 0, 0] bodies = [m.group(3), m.group(4), m.group(5), m.group(6), ] for i, body in enumerate(bodies): # onim_lens[i] = len(lines) # print body body = body.strip() lines = body.split('\n') fake = False for line in lines: if not line.strip(): continue if line.strip() in ['#']: continue onim_lens[i] += 1 if re.match('^#', line): continue # print title, '|', lang, u' -> "{}"'.format(line) if re.match('^[*:]', line): continue if line in ['-', '?', ]: continue if re.match('^\[\[[^]]+\]\]$', line): continue if re.match('^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$', line): continue if re.match(u'^[a-zа-я !]+$', line, re.UNICODE | re.IGNORECASE): pass if re.match(u'^([a-zа-я !]+([,;] )?)+$', line, re.UNICODE | re.IGNORECASE): pass fake = True # print title, '|', lang # print '->', line # print line # break for i, onim_len in enumerate(onim_lens): if onim_len > mining_len: onim_type = [u'синонимов', u'антонимов', u'гиперонимов', u'гипонимов', ] print u"# [[{}]] (секция \"{}\"): '''{}''' значений, '''{}''' {}".\ format(title, lang, mining_len, onim_len, onim_type[i]) return new_content
def lang_action(self, page, lang, content): title = page.title if title.startswith('-') or title.endswith('-'): return content if title.startswith('*'): return content new_content = content parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() sections = [ {'header2': '', 'content': parts.pop(0)} ] sections += [ {'header2': part[0], 'content': part[1]} for part in chunks(parts, 2) ] for data in sections: # print '-' * 80 header2 = data['header2'] # print header2 if not header2: continue p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE) m = p.match(header2) if not m: # print header2, '==', '#' * 120 return content # print '-' * 40 # print data['content'] # print '-' * 80 # return content # print '=' * 100 # print content # print '=' * 100 # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content, # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content, # flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() # return content # sections = [ # {'lang': '', 'content': parts.pop(0)} # ] # sections += [ # {'lang': part[0], 'content': part[1]} # for part in chunks(parts, 2) # ] if lang == 'ru': if ' ' in title: template = templates['ru']['phrase'] else: template = templates['ru']['word'] else: if ' ' in title: template = templates['xx']['phrase'] else: template = templates['xx']['word'] for data in sections: section_content = data['content'] headers = convert_headers(self.get_headers(section_content)) if headers == []: # print 'EMPTY' pass # elif headers == template: # # print 'OK' # pass else: wrong_order = False has_unknown_header = False absent = list() for header in template: if header not in headers: # print header, ' -> ABSENT WARNING' # append_dict_list(current_absent, lang, header) # self.all_absent_headers.add(header) absent.append(header) pass t = 0 s = 0 wrong_order_error = None while t < len(template) and s < len(headers): if template[t] == headers[s]: t += 1 s += 1 else: if template[t] in absent: t += 1 else: # print headers[s], ' -> WRONG ORDER ERROR' wrong_order_error = headers[s] wrong_order = True break unknown_headers = list() for header in headers: if header not in template: # print header, ' -> UNKNOWN ERROR' unknown_headers.append(header) has_unknown_header = True # append_dict_list(current_absent, lang, header) absent_semantic_headers = False for h in semantic_headers: if h in absent: absent_semantic_headers = True # if absent_semantic_headers or wrong_order_error: # # print u'{} #{} {}'.format(title, lang, data['header2']) # # print '\n'.join(headers) # # if absent: # # print "\n".join([u"{} -> ABSENT WARNING".format(header) # # for header in absent]) # # if wrong_order_error: # # print u"{} -> WRONG ORDER ERROR".format(wrong_order_error) # # if unknown_headers: # # print "\n".join([u"{} -> UNKNOWN ERROR".format(header) # # for header in unknown_headers]) # # print # pass # elif unknown_headers: # pass # else: if title in [u'օժանդակ բայ']: return content if True: m = re.search(u'(==== *Значение *==== *(.*?)' u'\n)=', # u'\n)===', section_content, re.UNICODE | re.DOTALL) if not m: continue # raise Exception(u'title={}, lang={}'.format(title, lang)) semantic_section = m.group(1) new_semantic_section = semantic_section mining = m.group(2) # if mining.strip() == u'[[]]\n{{Нужен перевод}}': # print section_content # print '=' * 120 has_strange = False for line in mining.split('\n'): 
line = line.strip() if not line.strip(): continue if re.match('^#', line): continue if "''" in line: new_line = \ re.sub(u"''(авиац|австрал|автомоб|автомоб. жарг|агрон|алхим|альп|амер|анат|антроп|артилл|археол|архит|астрол|астрон|безл|библейск|биол|биохим|бирж|болг|ботан|браз|бранн|брит|бухг|вет|вин|военн|военн. жарг|вульг|высок|гастрон|генет|геогр|геод|геол|геометр|геофиз|геральд|гидрол|горн|грам|груб|детск|диал|дигорск|дипл|дисфм|доминик|дор|ед. ч|ест|ж.-д|жарг|живоп|зоол|игр|интернет|информ|ион|ирл|ирон|искаж|искусств|исп|истор|исч|ихтиол|йогич|канадск|канц|карт|картеж. жарг|керам|кинол|книжн|комп|комп. жарг|косм|космет|крим|крим. жарг|кубан|кулин|культурол|лес|лингв|лог|матем|машин|мед|металл|метеорол|метон|мех|микол|микробиол|милиц. жарг|минер|мифол|мол|морск|муз|муз. жарг|нар.-поэт|нар.-разг|научн|неисч|нем|неодобр|неодуш|неол|неофиц|неперех|неправ|нескл|нефтегаз|нидерл|нов.-зел|нумизм|образ|обсц|одобр|одуш|океан|оккульт|опт|орнитол|оскорб|офиц|охотн|палеонт|паразит|парикмах|перен|перех|плотн|полигр|полит|полит. жарг|полиц. жарг|порт|портн|поэт|презр|пренебр|прогр|прост|проф|психиатр|психол|публиц|разг|редк|рекл|религ|ритор|рыбол|с.-х|сад|сексол|сниж|собир|совет|социол|спелеол|спец|спорт|старин|стат|статив|стекловарн|стих|столярн|строит|студ. жарг|тайв|театр|текст|техн|техн. жарг|тж|типогр|тлв|торг|торж|трад.-поэт|тракторн|трансп|уважит|укр|управл|усеч|устар|фам|фант|фарм|физ|физиол|филат|филол|филос|фин|фолькл|фотогр|хим|хоз|хореогр|худ.пр|худож|церк|церк.-слав|цирк|цитол|шахм|швейн|школьн|шотл|шутл|эвф|экол|экон|эл.-техн|эл.-энерг|энтомол|эол|этногр|этнолог|ювел|юр)\.''", u'{{{{\\1.|{}}}}}'.format(lang), line) if line != new_line: if "''" not in new_line: new_line = u'# ' + new_line print line print new_line print new_semantic_section = \ new_semantic_section.replace(line, new_line) if "{" in line or "''" in line or "<i>" in line or line.startswith("|"): continue # if re.match('^\*', line): # new_semantic_section = \ # new_semantic_section.replace( # u"\n{}\n".format(line), # u'\n#{}\n'.format(line[1:]) # ) if re.match(u'^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$', line, re.UNICODE): continue # if re.match(u'^\{\{(длина слова|илл\.?)\|[^}]+\}\}$', line, # re.UNICODE): if re.match(u'^\[\[\]\]$', line, re.UNICODE): # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) continue if re.match(u'^(\[\[[^]]*\]\] )?\{\{Нужен перевод *(\|\w+)?\}\}$', line, re.UNICODE): # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) continue if re.match(u'^\{\{(длина слова)\|[^}]+\}\}$', line, re.UNICODE): # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) continue if re.match(u'^\{\{(илл\.?)\|[^}]+\}\}$', line, re.UNICODE): # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) continue if re.match(u'^\[\[(Файл:|File:|Image:|Изображение:)[^]]+\]\]$', line, re.UNICODE): # todo: [[Файл: и прочие IMG # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) continue ok = False # if re.match(u'^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, # re.UNICODE): # ok = True if re.match(u'^(\[\[[^]]+\]\]([,;] )?)+( *\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): ok = True # if not re.match(u'^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE) \ # and re.match(u'^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): # print u'# 
[[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) if ok: # new_semantic_section = \ # new_semantic_section.replace( # u"\n{}\n".format(line), # u'\n# {}\n'.format(line) # ) continue has_strange = True if lang == 'la': # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif "''" in line: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif "<i>" in line: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif line.startswith(u'}} {{пример') \ or line.startswith(u'{{списки семантических связей')\ or line.startswith(u'|'): # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif "{" in line: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif "[" in line: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass elif u'Аналогично русскому' in line: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass else: # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) pass if re.match(u'^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): continue if re.match(u'^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE): continue # if not has_strange: # new_content = new_content.replace(semantic_section, # new_semantic_section) new_content = new_content.replace(semantic_section, new_semantic_section) return new_content
def lang_action(self, page, lang, content): title = page.title if title.startswith("-") or title.endswith("-"): return content if title.startswith("*"): return content new_content = content # print page.title, '=' * 100 # print content # print '=' * 100 parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() sections = [{"header2": "", "content": parts.pop(0)}] sections += [{"header2": part[0], "content": part[1]} for part in chunks(parts, 2)] for data in sections: # print '-' * 80 header2 = data["header2"] # print header2 if not header2: continue p = re.compile(u"^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$", re.UNICODE) m = p.match(header2) if not m: # print header2, '==', '#' * 120 return content # print '-' * 40 # print data['content'] # print '-' * 80 # return content # print '=' * 100 # print content # print '=' * 100 # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content, # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content, # flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() # return content # sections = [ # {'lang': '', 'content': parts.pop(0)} # ] # sections += [ # {'lang': part[0], 'content': part[1]} # for part in chunks(parts, 2) # ] if lang == "ru": if " " in title: template = templates["ru"]["phrase"] else: template = templates["ru"]["word"] else: if " " in title: template = templates["xx"]["phrase"] else: template = templates["xx"]["word"] for data in sections: section_content = data["content"] headers = convert_headers(self.get_headers(section_content)) if headers == []: # print 'EMPTY' pass elif headers == template: # print 'OK' pass else: wrong_order = False has_unknown_header = False absent = list() for header in template: if header not in headers: # print header, ' -> ABSENT WARNING' # append_dict_list(current_absent, lang, header) # self.all_absent_headers.add(header) absent.append(header) pass t = 0 s = 0 wrong_order_error = None while t < len(template) and s < len(headers): if template[t] == headers[s]: t += 1 s += 1 else: if template[t] in absent: t += 1 else: # print headers[s], ' -> WRONG ORDER ERROR' wrong_order_error = headers[s] wrong_order = True break unknown_headers = list() for header in headers: if header not in template: # print header, ' -> UNKNOWN ERROR' unknown_headers.append(header) has_unknown_header = True # append_dict_list(current_absent, lang, header) absent_semantic_headers = False for h in semantic_headers: if h in absent: absent_semantic_headers = True if absent_semantic_headers or wrong_order_error: # print u'{} #{} {}'.format(title, lang, data['header2']) # print '\n'.join(headers) # if absent: # print "\n".join([u"{} -> ABSENT WARNING".format(header) # for header in absent]) # if wrong_order_error: # print u"{} -> WRONG ORDER ERROR".format(wrong_order_error) # if unknown_headers: # print "\n".join([u"{} -> UNKNOWN ERROR".format(header) # for header in unknown_headers]) # print pass elif unknown_headers: pass else: m = re.search( u"(==== *Значение *==== *\n(.*?)" u"==== *Синонимы *==== *\n(.*?)" u"==== *Антонимы *==== *\n(.*?)" u"==== *Гиперонимы *==== *\n(.*?)" u"==== *Гипонимы *==== *(.*?)" u"\n)===[^=]", section_content, re.UNICODE | re.DOTALL, ) if not m: print "#" * 200 print u"title={}, lang={}".format(title, lang) print "#" * 200 # continue raise Exception(u"title={}, lang={}".format(title, lang)) # print title, '|', lang semantic_section = m.group(1) 
new_semantic_section = semantic_section # print '=' * 40 # print semantic_section # print '-' * 40 mining = m.group(2) for line in mining.split("\n"): line = line.strip() if not line.strip(): continue if re.match("^#", line): continue if re.match(u"^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$", line, re.UNICODE): continue if re.match(u"^\{\{(длина слова|илл\.?)\|[^}]+\}\}$", line, re.UNICODE): print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.format(title, lang, line) continue if re.match(u"^(\[\[[^]]*\]\] )?\{\{Нужен перевод(\|\w+)?\}\}$", line, re.UNICODE): continue if re.match(u"^\[\[\]\]$", line, re.UNICODE): continue if re.match(u"^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$", line, re.UNICODE): continue if re.match( u"^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$", line, re.UNICODE ): continue # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\ # format(title, lang, line) # print line if re.match(u"^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$", line, re.UNICODE): continue if re.match( u"^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$", line, re.UNICODE, ): continue bodies = [m.group(3), m.group(4), m.group(5), m.group(6)] for body in bodies: lines = body.split("\n") fake = False for line in lines: if not line.strip(): continue if re.match("^#", line): continue # print title, '|', lang, ' -> ', line if re.match("^[*:]", line): new_semantic_section = new_semantic_section.replace( u"\n{}\n".format(line), u"\n#{}\n".format(line[1:]) ) elif not line.startswith("<!--"): new_semantic_section = new_semantic_section.replace( u"\n{}\n".format(line), u"\n# {}\n".format(line) ) if re.match("^[*:]", line): continue if line in ["-", "?"]: continue if re.match("^\[\[[^]]+\]\]$", line): continue if re.match("^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$", line): continue if re.match(u"^[a-zа-я !]+$", line, re.UNICODE | re.IGNORECASE): pass if re.match(u"^([a-zа-я !]+([,;] )?)+$", line, re.UNICODE | re.IGNORECASE): pass fake = True # print title, '|', lang # print '->', line # print line break return new_content
def lang_action(self, page, lang, content):
    title = page.title
    if title.startswith('-') or title.endswith('-'):
        return content
    if title.startswith('*'):
        return content
    new_content = content
    # print page.title, '=' * 100
    # print content
    # print '=' * 100
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                     flags=re.MULTILINE)
    # for part in parts:
    #     print '-' * 100
    #     print part
    #     print '-' * 100
    # self.stop()
    sections = [
        {'header2': '', 'content': parts.pop(0)}
    ]
    sections += [
        {'header2': part[0], 'content': part[1]}
        for part in chunks(parts, 2)
    ]
    for data in sections:
        # print '-' * 80
        header2 = data['header2']
        # print header2
        if not header2:
            continue
        p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        m = p.match(header2)
        if not m:
            print header2, '==', '#' * 120
            return content
        # print '-' * 40
        # print data['content']
        # print '-' * 80
        # return content

    # print '=' * 100
    # print content
    # print '=' * 100

    # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
    # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
    #                  flags=re.MULTILINE)
    # for part in parts:
    #     print '-' * 100
    #     print part
    #     print '-' * 100
    # self.stop()
    # return content
    # sections = [
    #     {'lang': '', 'content': parts.pop(0)}
    # ]
    # sections += [
    #     {'lang': part[0], 'content': part[1]}
    #     for part in chunks(parts, 2)
    # ]

    if lang == 'ru':
        if ' ' in title:
            template = templates['ru']['phrase']
        else:
            template = templates['ru']['word']
    else:
        if ' ' in title:
            template = templates['xx']['phrase']
        else:
            template = templates['xx']['word']

    for data in sections:
        section_content = data['content']
        headers = convert_headers(self.get_headers(section_content))
        if headers == []:
            # print 'EMPTY'
            pass
        elif headers == template:
            # print 'OK'
            pass
        else:
            wrong_order = False
            has_unknown_header = False
            absent = list()
            for header in template:
                if header not in headers:
                    # print header, ' -> ABSENT WARNING'
                    # append_dict_list(current_absent, lang, header)
                    # self.all_absent_headers.add(header)
                    absent.append(header)
                    pass
            t = 0
            s = 0
            wrong_order_error = None
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                else:
                    if template[t] in absent:
                        t += 1
                    else:
                        # print headers[s], ' -> WRONG ORDER ERROR'
                        wrong_order_error = headers[s]
                        wrong_order = True
                        break
            unknown_headers = list()
            for header in headers:
                if header not in template:
                    # print header, ' -> UNKNOWN ERROR'
                    unknown_headers.append(header)
                    has_unknown_header = True
                    # append_dict_list(current_absent, lang, header)
            absent_semantic_headers = False
            for h in semantic_headers:
                if h in absent:
                    absent_semantic_headers = True
            if absent_semantic_headers or wrong_order_error:
                # print u'{} #{} {}'.format(title, lang, data['header2'])
                # print '\n'.join(headers)
                # if absent:
                #     print "\n".join([u"{} -> ABSENT WARNING".format(header)
                #                      for header in absent])
                # if wrong_order_error:
                #     print u"{} -> WRONG ORDER ERROR".format(wrong_order_error)
                # if unknown_headers:
                #     print "\n".join([u"{} -> UNKNOWN ERROR".format(header)
                #                      for header in unknown_headers])
                # print
                pass
            else:
                m = re.search(u'==== *Значение *==== *\n(.*?)'
                              u'==== *Синонимы *==== *\n(.*?)'
                              u'==== *Антонимы *==== *\n(.*?)'
                              u'==== *Гиперонимы *==== *\n(.*?)'
                              u'==== *Гипонимы *==== *(.*?)'
                              u'\n===[^=]',
                              section_content, re.UNICODE | re.DOTALL)
                if not m:
                    # print title, '|', lang, '=' * 40
                    # print section_content
                    # print '-' * 80
                    if lang == 'ru':
                        if ' ' in title:
                            tail_contains = "\n".join([
                                template_contents[u'Этимология/phrase'],
                                template_contents[u'Перевод'],
                                template_contents[u'Библиография'],
                            ])
                        else:
                            tail_contains = "\n".join([
                                template_contents[u'Родственные слова'],
                                template_contents[u'Этимология/ru'],
                                template_contents[u'Фразеологизмы'],
                                template_contents[u'Перевод'],
                                template_contents[u'Библиография'],
                            ])
                    else:
                        if ' ' in title:
                            tail_contains = "\n".join([
                                template_contents[u'Этимология/phrase'],
                                template_contents[u'Библиография'],
                            ])
                        else:
                            tail_contains = "\n".join([
                                template_contents[u'Родственные слова'],
                                template_contents[u'Этимология/xx'].format(lang),
                                template_contents[u'Фразеологизмы'],
                                template_contents[u'Библиография'],
                            ])
                    p = re.compile(u'(==== *Гипонимы *====\n[^[{]*)')
                    m2 = re.search(u'(==== *Гипонимы *====(.*))', section_content,
                                   flags=re.DOTALL | re.UNICODE)
                    if m2:
                        if '# [' in m2.group(1):
                            print title, '%' * 200
                            print m2.group(1)
                            continue
                    else:
                        print title, '!' * 100
                    new_section_content = p.sub('\\1' + '\n' + tail_contains + '\n',
                                                section_content)
                    new_section_content = new_section_content.replace('\n\n\n', '\n\n')
                    new_content = new_content.replace(section_content,
                                                      new_section_content)
                    # print new_content
                    # print '-' * 120
                    # print '\n'.join(headers)
                    # print '-' * 120
                    # print section_content
                    # print '-' * 120
                    # print

            # if has_unknown_header or wrong_order:
            #     return content
            # if u'=== Морфологические и синтаксические свойства ===' in absent:
            #     # print u'{} #{}'.format(title, lang)
            #     # print '\n'.join(headers)
            #     # print
            #     return content
            # if absent and headers and absent[0] == headers[0]:
            #     # print u'{} #{}'.format(title, lang)
            #     # print '\n'.join(headers)
            #     # print
            #     return content
            # print '-' * 40
    return new_content
def lang_action(self, page, lang, content): title = page.title if title.startswith("-") or title.endswith("-"): return content if title.startswith("*"): return content new_content = content # print title, lang # if title == u'высокопарность': # print 'ok' # print page.title, '=' * 100 # print content # print '=' * 100 parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() sections = [{"header2": "", "content": parts.pop(0)}] sections += [{"header2": part[0], "content": part[1]} for part in chunks(parts, 2)] for data in sections: # print '-' * 80 header2 = data["header2"] # print header2 if not header2: continue p = re.compile(u"^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$", re.UNICODE) m = p.match(header2) if not m: # print header2, '==', '#' * 120 return content # print '-' * 40 # print data['content'] # print '-' * 80 # return content # print '=' * 100 # print content # print '=' * 100 # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content, # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content, # flags=re.MULTILINE) # for part in parts: # print '-' * 100 # print part # print '-' * 100 # self.stop() # return content # sections = [ # {'lang': '', 'content': parts.pop(0)} # ] # sections += [ # {'lang': part[0], 'content': part[1]} # for part in chunks(parts, 2) # ] if lang == "ru": if " " in title: template = templates["ru"]["phrase"] else: template = templates["ru"]["word"] else: if " " in title: template = templates["xx"]["phrase"] else: template = templates["xx"]["word"] for data in sections: section_content = data["content"] headers = convert_headers(self.get_headers(section_content)) if headers == []: # print 'EMPTY' pass elif headers == template: # print 'OK' pass else: wrong_order = False has_unknown_header = False absent = list() for header in template: if header not in headers: # print header, ' -> ABSENT WARNING' # append_dict_list(current_absent, lang, header) # self.all_absent_headers.add(header) absent.append(header) pass t = 0 s = 0 wrong_order_error = None while t < len(template) and s < len(headers): if template[t] == headers[s]: t += 1 s += 1 else: if template[t] in absent: t += 1 else: # print headers[s], ' -> WRONG ORDER ERROR' wrong_order_error = headers[s] wrong_order = True break unknown_headers = list() for header in headers: if header not in template: # print header, ' -> UNKNOWN ERROR' unknown_headers.append(header) has_unknown_header = True # append_dict_list(current_absent, lang, header) absent_semantic_headers = False for h in semantic_headers: if h in absent: absent_semantic_headers = True if absent_semantic_headers or wrong_order_error: # print u'{} #{} {}'.format(title, lang, data['header2']) # print '\n'.join(headers) # if absent: # print "\n".join([u"{} -> ABSENT WARNING".format(header) # for header in absent]) # if wrong_order_error: # print u"{} -> WRONG ORDER ERROR".format(wrong_order_error) # if unknown_headers: # print "\n".join([u"{} -> UNKNOWN ERROR".format(header) # for header in unknown_headers]) # print pass elif unknown_headers: pass else: m = re.search( u"(==== *Значение *==== *\n(.*?)" u"==== *Синонимы *==== *\n(.*?)" u"==== *Антонимы *==== *\n(.*?)" u"==== *Гиперонимы *==== *\n(.*?)" u"==== *Гипонимы *==== *(.*?)" u"\n)===[^=]", section_content, re.UNICODE | re.DOTALL, ) if not m: # print title, '|', lang, '=' * 120 # print '\n'.join(headers) # print # raise Exception('!!!') continue # print title, '|', lang 
semantic_section = m.group(1) new_semantic_section = semantic_section # print '=' * 40 # print semantic_section # print '-' * 40 bodies = [m.group(3), m.group(4), m.group(5), m.group(6)] for body in bodies: # print body # body = body.strip() lines = body.split("\n") fake = False for line in lines: if not line.strip(): continue if re.match("^#", line): continue # print title, '|', lang, u' -> "{}"'.format(line) if re.match("^[*:]", line): new_semantic_section = new_semantic_section.replace( u"\n{}\n".format(line), u"\n#{}\n".format(line[1:]) ) elif not line.startswith("<!--"): new_semantic_section = new_semantic_section.replace( u"\n{}\n".format(line), u"\n# {}\n".format(line) ) if re.match("^[*:]", line): continue if line in ["-", "?"]: continue if re.match("^\[\[[^]]+\]\]$", line): continue if re.match("^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$", line): continue # if re.match(u'^[a-zа-я !]+$', line, re.UNICODE | re.IGNORECASE): # pass # if re.match(u'^([a-zа-я !]+([,;] )?)+$', line, re.UNICODE | re.IGNORECASE): # pass fake = True print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.format(title, lang, line) # print title, '|', lang # print '->', line # print line break # if not fake: # or True: # if semantic_section != new_semantic_section: # print title, '|', lang # print '=' * 100 # print semantic_section # print '-' * 100 # print new_semantic_section # print '-' * 100 # print # new_content = \ # new_content.replace(semantic_section, # new_semantic_section) # if fake and body: # print title, '|', lang # print '=' * 120 # # print '"{}"'.format(tail) # print body # print '-' * 120 # print # if has_unknown_header or wrong_order: # return content # if u'=== Морфологические и синтаксические свойства ===' in absent: # # print u'{} #{}'.format(title, lang) # # print '\n'.join(headers) # # print # return content # if absent and headers and absent[0] == headers[0]: # # print u'{} #{}'.format(title, lang) # # print '\n'.join(headers) # # print # return content # print '-' * 40 return new_content