def fix_for_soshial():  # remove the НП switch from ПричСтрад
    total_count = 0
    categories = [
        u"Категория:Шаблоны словоизменений/Глаголы/Несовершенный вид",
        u"Категория:Шаблоны словоизменений/Глаголы/Совершенный вид",
    ]
    for category_name in categories:
        category = pywikibot.Category(site, category_name)
        for article in category.articles():
            title = article.title()
            # print title
            content = article.get()
            p = re.compile(u'(?P<source>(?P<prefix>\|ПричСтрад *= *)\{\{#if:\{\{\{НП\|\}\}\}\|(?P<if_yes>[^|]*)\|(?P<if_no>.*)\}\})', re.UNICODE)
            if not total_count % 50:
                print dt(), total_count
            total_count += 1
            m = p.search(content)
            # if title in [u'Участник:Soshial/sandbox2']:
            #     continue
            if m:
                print title
                # print '=' * 100
                print m.group('source').strip()
                # print '=' * 100
                # print content
                content = p.sub(u'\g<prefix>\g<if_no>', content)
                # print '=' * 100
                # print content
                # article.put(content, u'Добавление параметра "Инфинитив" к шаблонам (на основании будущей формы)', minorEdit=False)
                break
    print total_count
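
# dt() is used for log prefixes throughout this file but is not defined here.
# A minimal sketch, assuming it just formats the current time (the exact
# format string is an assumption) and that datetime is imported at module
# level, as run_after_checkers() below already requires:
def dt():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')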
def load_contents():
    pages = []
    i = 0
    j = 0
    for page in Page.objects.iterate():
        # print dt(), page.title
        try:
            # j += 1
            # if j > 84000:
            PageContent.objects.get(page_id=page.id)
            # print 'exists'
        except ObjectDoesNotExist:
            print dt(), page.title
            # print repr(page.title)
            title = page.title
            # try:
            #     title = page.title.encode('utf-8')
            # except UnicodeDecodeError:
            #     print '#####'
            #     continue
            # if i > 700:
            url = "http://dump.a-lib.net/wikt/%s" % urllib.quote_plus(title)
            url = url.replace('+', '%20')
            content = urllib.urlopen(url).read()
            # print repr(content)
            content = remove_utf8mb4(content.decode('utf-8'))
            PageContent.objects.create(pk=page.pk, page=page, content=content)
            # pages.append(PageContent(page=page, content=content))
            i += 1
            if not i % 100:
                print dt(), i
                # PageContent.objects.bulk_create(pages)
                pages = []
            sleep(1)
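
# remove_utf8mb4() is called above but defined elsewhere. A sketch under the
# assumption that it strips code points outside the Basic Multilingual Plane,
# which MySQL's 3-byte "utf8" charset cannot store; the pattern also removes
# lone surrogates, so it behaves the same on narrow and wide Python 2 builds:
OUTSIDE_BMP = re.compile(u'[^\u0000-\uD7FF\uE000-\uFFFF]', re.UNICODE)

def remove_utf8mb4(text):
    return OUTSIDE_BMP.sub(u'', text)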
def action(self, page, **kwargs):
    content = kwargs['content']
    parts = re.findall('(^|\n)(=[^=\n]+=)\n', content)
    for part in parts:
        found = part[1]
        if found in [u'= Буква (латиница) =', u'= Буква (кириллица) =']:
            continue
        # print "* [[%s]]: <code><nowiki>%s</nowiki></code>" % (page.title, found)
        m = re.match(u'^= *\{\{-(?P<lang>[-a-z]+|Праславянский)-(?P<remove>\|([^}]+|\{\{PAGENAME\}\}|))?\}\} *=$', found, re.IGNORECASE)
        if not m:
            print found
        else:
            remove = m.group('remove')
            # if remove:
            #     print page.title, remove
            if remove == '|nocat':
                continue
            lang = m.group('lang')
            # print "* %s: %s" % (page.title, lang)
            if lang != 'ru':
                continue
            old_header = m.group(0)
            new_header = "= {{-%s-}} =" % lang
            if old_header == new_header:
                continue
            self.changed += 1
            print dt(), 'changed:', self.changed
            wiki_content = get_wiki_page_content(page.title)
            new_wiki_content = wiki_content.replace(old_header, new_header)
            save_wiki_page(page.title, new_wiki_content, u"викификация заголовка первого уровня", wait=5)
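
# get_wiki_page_content() and save_wiki_page() are helpers defined elsewhere.
# A plausible sketch over pywikibot, assuming the module-level site object
# used by the other functions here and treating wait as a post-save throttle
# in seconds:
def get_wiki_page_content(title):
    return pywikibot.Page(site, title).get()

def save_wiki_page(title, content, summary, wait=0):
    pywikibot.Page(site, title).put(content, summary, minorEdit=False)
    if wait:
        time.sleep(wait)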
def perfect_verbs():
    total_count = 0
    category = pywikibot.Category(site, u"Категория:Шаблоны_словоизменений/Глаголы/Совершенный вид")
    for article in category.articles():
        title = article.title()
        # print title
        content = article.get()
        p = re.compile(u'(?P<source>\|hide-text=\{\{\{hide-text\|\}\}\})', re.UNICODE)
        if not total_count % 50:
            print dt(), total_count
        total_count += 1
        m = p.search(content)
        # if title in [u'Участник:Soshial/sandbox2']:
        #     continue
        if not m:
            print title
        # if m:
        #     print title
        #     print m.group('inf').strip()
        #     content = p.sub(u'\g<source>|Инфинитив = \g<inf>', content)
        #     article.put(content, u'Добавление параметра "Инфинитив" к шаблонам (на основании будущей формы)', minorEdit=False)
        #     break
    print total_count
def find_numeric_params():
    print "find_numeric_params()"
    i = 0
    for page in Page.objects.iterate():
        i += 1
        if not i % 1000:
            print dt(), "processed pages:", i
        content = page.content
        p = re.compile(
            u"""(\{\{
            (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+)        # template name
            (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)     # parameters
            \}\})""",
            flags=re.UNICODE + re.DOTALL + re.VERBOSE,
        )
        parts = p.findall(content)
        for part in parts:
            # print part[0]
            m = p.search(part[0])
            if m:
                title = m.group("title").strip()
                morph = m.group("morph")
                call_params, call_numeric = process_call_params(m.group("params"))
                # for key, value in call_params.items():
                #     print key, value
                if call_numeric:
                    print
                    print page.title
                    for numeric in call_numeric:
                        print "-", numeric
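
# process_call_params() is defined elsewhere. A sketch, assuming it splits a
# template's parameter text into named and positional (numeric) parameters,
# which matches how its two return values are used above. Splitting on '|' is
# a simplification: it would mis-handle '|' inside nested templates or links.
def process_call_params(params_text):
    named = {}
    numeric = []
    for raw in params_text.split(u'|'):
        raw = raw.strip()
        if not raw:
            continue
        if u'=' in raw:
            key, value = raw.split(u'=', 1)
            named[key.strip()] = value.strip()
        else:
            numeric.append(raw)
    return named, numeric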
def replacement():
    total_count = 0
    categories = [
        u"Категория:Шаблоны_словоизменений/Глаголы/Совершенный вид",
        u"Категория:Шаблоны_словоизменений/Глаголы/Несовершенный_вид",
    ]
    site = pywikibot.Site('ru')
    for category_name in categories:
        category = pywikibot.Category(site, category_name)
        for article in category.articles():
            title = article.title()
            print
            print title, u'→ get'
            if title in [u'Участник:Soshial/sandbox2']:
                continue
            content = article.get()
            p = re.compile(u'(?P<source>\|(Я|Ты) \(прош\.\) *= *(?P<value>[^|]+))', re.UNICODE)
            # p = re.compile(u'(?P<source>\|Будущее)', re.UNICODE)
            if not total_count % 50:
                print dt(), total_count
            total_count += 1
            parts = p.findall(content)
            new_content = content
            for part in parts:
                # print title, u'→ try'
                print part[0].strip()
                new_part = part[0].replace(', ', '<br />')
                new_content = new_content.replace(part[0], new_part)
            # content = p.sub(u'\g<source>|Инфинитив = \g<inf>', content)
            # article.put(content, u'Добавление параметра "Инфинитив" к шаблонам (на основании будущей формы)', minorEdit=False)
            # break
            if content != new_content:
                print u'→ changed!', '!' * 80
                article.put(new_content, u'Замена ", " на "<br />" в параметрах "Я (прош.)" и "Ты (прош.)"', minorEdit=False)
    print total_count
def extract_change_templates():
    print "extract_change_templates()"
    i = 0
    data = []
    # for page in Page.objects.iterate():
    for page in Page.objects.iterate(prefetch=["page_content"]):
        i += 1
        if not i % 1000:
            print dt(), "processed pages:", i
        content = page.content
        p = re.compile(
            u"""(\{\{
            (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+)        # template name
            (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)     # parameters
            \}\})""",
            flags=re.UNICODE + re.DOTALL + re.VERBOSE,
        )
        parts = p.findall(content)
        # print parts
        # continue
        titles = []
        for part in parts:
            m = p.search(part[0])
            if m:
                title = m.group("title").strip()
                titles.append(title)
        data.append(Page_ChangeTemplates(page=page, change_templates=" | ".join(titles)))
        if len(data) > 1000:
            Page_ChangeTemplates.objects.bulk_create(data)
            print dt(), "> data added:", len(data)
            data = []
    Page_ChangeTemplates.objects.bulk_create(data)
def get_data_from_that_site():
    print "get_data_from_that_site()"
    i = 0
    data = []
    for page in Page.objects.iterate(prefetch=["page_changetemplates", "page_starlingzaliznyak"]):
        i += 1
        if not i % 100:
            print dt(), "processed pages:", i
        if u"" in page.title:
            continue
        change_templates = page.page_changetemplates.change_templates
        if u"сущ" in change_templates or u"гл" in change_templates:
            try:
                page.page_starlingzaliznyak
            except ObjectDoesNotExist:
                # print page.title
                base, info, morph = get_page(page.title)
                data.append(
                    Page_StarlingZaliznyak(
                        page=page,
                        word=page.title,
                        base=" | ".join(base),
                        info=" | ".join(info),
                        morph=" | ".join(morph),
                    )
                )
                if len(data) > 1000:
                    Page_StarlingZaliznyak.objects.bulk_create(data)
                    print dt(), "> data added:", len(data)
                    data = []
    Page_StarlingZaliznyak.objects.bulk_create(data)
def find_empty_words():
    print "find_empty_words()"
    i = 0
    count = 0
    empty_templates = [
        u"сущ ru m ina",
        u"сущ ru f ina",
        u"сущ ru n ina",
        u"сущ ru m a",
        u"сущ ru f a",
        u"сущ ru n a",
    ]
    for page in Page.objects.iterate(prefetch=["page_content"]):
        i += 1
        if not i % 1000:
            print dt(), "processed pages:", i
        content = page.content
        p = re.compile(
            u"""(\{\{
            (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+)        # template name
            (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)     # parameters
            \}\})""",
            flags=re.UNICODE + re.DOTALL + re.VERBOSE,
        )
        parts = p.findall(content)
        for part in parts:
            m = p.search(part[0])
            if m:
                title = m.group("title").strip()
                if title in empty_templates:
                    count += 1
                    print count, page.title
def get_dictionary_words(dictionary_id):
    print dt(), 'Loading words (%s) - started' % dictionary_id
    words = list()
    for item in Word_Value.objects.filter(dictionary_id=dictionary_id):
        words.append(item.value)
    print dt(), 'Loading words - finished'
    return words
def action(self, page, **kwargs):
    # print '=' * 80
    # print dt(), page.title
    content = kwargs['content']
    m = re.search(u'=== Смотреть также ===\n([^={]*)', content, flags=re.MULTILINE | re.DOTALL)
    if m:
        block_content = m.group(1)
        # print block_content.strip()
        for remove in removings:
            if remove in block_content:
                print '=' * 80
                print dt(), page.title
                print block_content.strip()
                old_content = get_wiki_page_content(page.title)
                new_content = re.sub(
                    # re.escape() covers '*', '[' and ']' along with any other metacharacters
                    u'=== Смотреть также ===\n\s*%s\n' % re.escape(remove),
                    u'',
                    old_content)
                if old_content != new_content:
                    desc = u'Удаление "Смотреть также" со списком имён'
                    save_wiki_page(page.title, new_content, desc, wait=5)
                    print 'saved'
                else:
                    print 'not changed'
    else:
        print u'×××'
def download(url):
    for tries in range(3):
        try:
            return urllib.urlopen(url).read()
        except IOError:
            print dt(), '#', 'Download failed, tries:', tries
            time.sleep(5)
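
# Note: after three failed attempts download() falls through and implicitly
# returns None, so callers should be prepared to handle a None result.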
def bulk(self, items, model=None, chunk_size=1000):
    if not model:
        model = self.model
    processed = 0
    for chunk in chunks(items, chunk_size):
        processed += len(model.objects.bulk_create(chunk))
        print dt(), '-> Processed:', processed
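
# chunks() is not defined in this file. A minimal sketch, assuming it yields
# successive slices of at most chunk_size items; it expects a sequence, so a
# generator input would need an itertools-based variant:
def chunks(items, chunk_size):
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]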
def save_db_to_file():
    i = 0
    print dt(), i
    words = dict()
    # for item in Word_Description.objects.all()[:10000]:  # .prefetch_related('details')
    # for item in Word_Description.objects.prefetch_related('word', 'word__details').all()[:10000]:
    # for item in Word_Description.objects.iterate(100000):
    # for item in Word_Description.objects.iterate(100000, ['word', 'word__details']):
    # for item in Word_Description.objects.iterate(100):
    output = list()
    for item in Word_Value.objects.iterate(100000):
        # word = "%s " % item.value
        # word = "%s %s" % (item.value, item.word.details.in_academic_lopatin)
        # print word
        # if item.dictionary_id in [12, 13, 16]:
        #     continue
        i += 1
        # print repr(item.value)
        output.append("%s|%s" % (item.value, item.dictionary_id))
        if not i % 10000:
            print dt(), i
            # break
        # if i > 300000:
        #     break
    save_file('db_data.txt', '\n'.join(output), 'utf-8')
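
# save_file() is defined elsewhere. A minimal sketch, assuming it simply
# writes the text to disk in the given encoding:
import codecs

def save_file(path, text, encoding):
    with codecs.open(path, 'w', encoding=encoding) as f:
        f.write(text)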
def action(self, page, **kwargs):
    # print '=' * 80
    # print dt(), page.title
    content = kwargs['content']
    m = re.search(u'=== Иноязычные аналоги ===\n(.*)\n=', content, flags=re.MULTILINE | re.DOTALL)
    if m:
        block_content = m.group(1)
        if u'=== Перевод ===' in content:
            # print u'перевод есть :)'
            # print '=' * 80
            # print dt(), page.title
            m2 = re.search(u'=== Перевод ===(.*)\n=== Иноязычные аналоги ===\n(.*?)\n=', content, flags=re.MULTILINE | re.DOTALL)
            if m2:
                block1 = m2.group(1)
                block2 = m2.group(2)
                if "\n=" in block1 or "\n=" in block2:
                    print u'×' * 200
                    return
                mb1 = big_empty.search(block1)
                mb2 = big_empty.search(block2)
                # if mb1 and mb2:
                if mb2:
                    # print dt(), page.title
                    # old_content = content
                    # old_content = get_wiki_page_content(page.title)
                    # new_content = old_content.replace(
                    #     u'=== Иноязычные аналоги ===\n%s\n' % block2,
                    #     u'')
                    # # new_block2 = re.sub(u'\{\{перев-блок\|*\n', u'{{перев-блок|Иноязычные аналоги|\n', block2)
                    # # new_block1 = big_empty.sub(new_block2, block1)
                    # new_block1 = re.sub(u'\{\{перев-блок\|*\n', u'{{перев-блок|Иноязычные аналоги|\n', block1)
                    # new_content = new_content.replace(
                    #     u'=== Перевод ===%s' % block1,
                    #     u'=== Перевод ===\n%s\n' % new_block1.strip(),
                    # )
                    # if old_content != new_content:
                    #     # desc = u'Удаление пустого блока "Перевод" и добавление "Иноязычные аналоги"'
                    #     desc = u'Удаление пустого блока "Иноязычные аналоги"'
                    #     save_wiki_page(page.title, new_content, desc, wait=5)
                    #     # self.stop()
                    print
                    print '=' * 100
                    print dt(), page.title
                    print '-' * 100
                    print block1
                    print '-' * 100
                    print block2
                    print '-' * 100
            else:
                # print dt(), page.title, u'×××', u'неподходящее взаимное расположение'
                pass
        else:
            # print dt(), page.title, u'×××', u'перевода нет :('
            pass
    else:
        # print dt(), page.title, u'×××', u'нет заголовка после аналогов?'
        pass
def process_items(self):
    i = 0
    site = pywikibot.Site('ru')
    print dt(), 'processing some pages in database'
    for page in Page.objects.filter(title__in=titles):
        i += 1
        item = pywikibot.Page(site, page.title)
        self.process_item(item, i)
def get_dictionaries_words(dictionaries_data):
    dictionary_ids = dictionaries_data.keys()
    print dt(), 'Loading words (%s) - started' % dictionary_ids
    words = list()
    for item in Word_Value.objects.filter(dictionary_id__in=dictionary_ids):
        dictionary_name = dictionaries_data[item.dictionary_id]
        words.append("%s|%s" % (item.value, dictionary_name))
    print dt(), 'Loading words - finished'
    return words
def process_items(self):
    print dt(), 'processing created pages with unknown language in the database'
    site = pywikibot.Site('ru')
    i = 0
    for page_created in PageCreated.objects.filter(lang='?'):
        i += 1
        title = page_created.page.title
        item = pywikibot.Page(site, title)
        self.process_item(item, i)
def just_go_through():
    print "just_go_through()"
    i = 0
    count = 0
    # for page in Page.objects.iterate():
    for page in Page.objects.iterate(prefetch=["page_content"]):
        i += 1
        if not i % 1000:
            print dt(), "processed pages:", i
        content = page.content
def run(self):
    if self.dont_start():
        print dt(), "don't need to start"
        return
    self.before()
    for item in self.iterator():
        kwargs = self.get_kwargs(item)
        self.action(item, **kwargs)
        if self.stopped:
            break
    self.after()
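
# run() is a template method: subclasses supply dont_start(), before(),
# iterator(), get_kwargs(), action() and after(), and can set self.stopped to
# abort early. A hypothetical minimal subclass (PrintTitles and the Checker
# base class name are illustrative, not part of this codebase):
class PrintTitles(Checker):
    def iterator(self):
        return Page.objects.iterate(prefetch=["page_content"])

    def get_kwargs(self, item):
        return {'content': item.content}

    def action(self, page, **kwargs):
        print dt(), page.title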
def tpl_action(self, page, tpl, title, morph, lang, params):
    word = page.title
    template_title = title.strip()
    print dt(), word, u'—', template_title, self.groups.setdefault(template_title, list())
    self.groups[template_title].append(page.title)
    kind, gender, num = parse_template_title(template_title)
    WordInflection.objects.bulk_add(
        WordInflection(word=word, template=template_title, content=tpl, gender=gender, kind=kind, num=num)
    )
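
# parse_template_title() is defined elsewhere. A sketch under the assumption
# that it reads the part of speech, gender and number out of template names
# such as u'сущ ru f ina 5a'; the exact marker sets are guesses:
def parse_template_title(template_title):
    tokens = template_title.split()
    kind = tokens[0] if tokens else u''  # u'сущ' (noun) or u'гл' (verb)
    gender = u''
    num = u''
    for token in tokens[1:]:
        if token in (u'm', u'f', u'n'):
            gender = token
        elif token in (u'sg', u'pl'):
            num = token
    return kind, gender, num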
def get_recent_generator(self):
    print
    print '=' * 120
    print dt(), 'processing recent pages'
    print '=' * 120
    print
    end = self.get_end_date()
    if end:
        # truncate to whole minutes
        end = datetime(end.year, end.month, end.day, end.hour, end.minute)
        print 'updating until:', end
    return RecentChangesPageGenerator(start=self.start_date, end=end)
def run_after_checkers():
    s = datetime.now()
    checkers = [
        CfLatinCandidates,
        CfCaseCandidates,
        CfEYoCandidates,
        CfAllCandidates,
        CfSpecialPages,
    ]
    CheckerRunner(checkers).run()
    print dt(), datetime.now() - s