def load_templates(): for category_name in category_names: category = pywikibot.Category(site, category_name) for article in category.articles(): prefix = u'Шаблон:' title = article.title() if not title.startswith(prefix): print title, '-', 'BAD!', '#' * 40 continue title = title[len(prefix):] content = article.get() print title edited = convert_wiki_date(article.editTime()) edited = make_aware(edited, pytz.UTC) # tmp try: template = TemplateLabel.objects.get(title=title) template.edited = edited template.category = category_name template.content = content template.save() except ObjectDoesNotExist: TemplateLabel.objects.create( title=title, edited=edited, category=category_name, content=content, )
def load_redirects(): prefix = u'Шаблон:' for category_name in category_names: category = pywikibot.Category(site, category_name) for article in category.articles(): redirect = article.title()[len(prefix):] for page in article.backlinks(filterRedirects=True): # print page.title(), '->', article.title() title = page.title() if not title.startswith(prefix): print title, '-', 'BAD!', '#' * 40 continue title = title[len(prefix):] content = page.get(get_redirect=True) print title edited = convert_wiki_date(page.editTime()) edited = make_aware(edited, pytz.UTC) # tmp try: template = TemplateLabel.objects.get(title=title) template.edited = edited template.category = category_name template.content = content template.redirect = redirect template.save() except ObjectDoesNotExist: TemplateLabel.objects.create( title=title, edited=edited, category=category_name, content=content, redirect=redirect, )
def get_data(self, item):
    """Fetch text, DB-safe text, edit time and redirect flag for a wiki page.

    Returns a 4-tuple ``(content, content_to_db, edited, redirect)`` where
    ``content_to_db`` is ``content`` passed through ``remove_utf8mb4`` and
    ``edited`` is the page edit time passed through ``convert_wiki_date`` and
    ``aware``.  When any of the page calls raises ``NoPage`` the result is
    four ``None`` values.
    """
    try:
        raw_text = item.get(get_redirect=True)
        db_text = remove_utf8mb4(raw_text)
        wiki_stamp = item.editTime()
        last_edit = aware(convert_wiki_date(wiki_stamp))
        is_redirect = item.isRedirectPage()
    except NoPage:
        return (None,) * 4
    return raw_text, db_text, last_edit, is_redirect
def load_templates_contents(): # for template in TemplateInflection.objects.all(): for template in TemplateInflection.objects.filter(content__isnull=True): title = template.title print title # continue article = pywikibot.Page(site, u"Шаблон:%s" % title) content = article.get() edited = convert_wiki_date(article.editTime()) edited = make_aware(edited, pytz.UTC) template.content = content template.edited = edited template.save()
def load_templates(): category_names = [ u"Категория:Шаблоны словоизменений/Глаголы/Возвратные глаголы", u"Категория:Шаблоны словоизменений/Глаголы/Невозвратные глаголы", u"Категория:Шаблоны словоизменений/Глаголы/Несовершенный вид", u"Категория:Шаблоны словоизменений/Глаголы/Совершенный вид", u"Категория:Шаблоны словоизменений/Существительные/Одушевлённые", u"Категория:Шаблоны словоизменений/Существительные/Одушевлённые/Мужской род", u"Категория:Шаблоны словоизменений/Существительные/Одушевлённые/Женский род", u"Категория:Шаблоны словоизменений/Существительные/Одушевлённые/Средний род", u"Категория:Шаблоны словоизменений/Существительные/Неодушевлённые", u"Категория:Шаблоны словоизменений/Существительные/Неодушевлённые/Мужской род", u"Категория:Шаблоны словоизменений/Существительные/Неодушевлённые/Женский род", u"Категория:Шаблоны словоизменений/Существительные/Неодушевлённые/Средний род", u"Категория:Шаблоны словоизменений/Прилагательные", u"Категория:Шаблоны словоизменений/Причастия", u"Категория:Шаблоны словоизменений/Числительные", u"Категория:Шаблоны словоизменений/Фамилии", u"Категория:Шаблоны словоизменений/Местоимения", ] for category_name in category_names: morph = category_name.split('/')[1] category = pywikibot.Category(site, category_name) for article in category.articles(): prefix = u'Шаблон:' # Шаблон:прил ru title = article.title() if not title.startswith(prefix): print title, '-', 'BAD!' continue title = title[len(prefix):] content = article.get() print title edited = convert_wiki_date(article.editTime()) edited = make_aware(edited, pytz.UTC) # tmp try: TemplateInflection.objects.get(title=title, content=content, edited=edited, morph=morph) print '#' * 20, '-> ALREADY EXISTS' except ObjectDoesNotExist: TemplateInflection.objects.create(title=title, content=content, edited=edited, category=category_name, morph=morph)
def process_potentially_new_item(self, item, i): try: title = item.title() except InvalidTitle: print 'Wrong title', '#' * 120 return if ':' in title: return # try: # page = Page.objects.get(title=title) # except (Page.DoesNotExist, _mysql_exceptions.Warning): # return content, content_to_db, edited, redirect = self.get_data(item) if not edited: return if not self.output_interval or not i % self.output_interval: print dt(), if ':' in title: print ':', else: print '#', print edited, '-', i, '-', if ':' in title: print ':', else: print '#', print transliterate(title), ' // pk=%s' % '?' # page.pk # if item.previous_revision_id == -1 and u'{{-ru-' in content: if u'{{-ru-' in content: created_at = convert_wiki_date(item.oldest_revision.timestamp) print transliterate(title), created_at # if created_at + timedelta(hours=5) > datetime.now(): if created_at + timedelta(days=7) > datetime.now(): if title not in self.titles and item.oldest_revision.user != 'CinBot': print print '-' * 100 print '|', transliterate(item.oldest_revision.user), edited print '|', transliterate(title) print '-' * 100 print self.items.append((title, item.oldest_revision.user, created_at)) self.titles.append(title) if ':' in title: return return edited
def process_template(article, lang): prefix = u'Шаблон:' title = article.title() if not title.startswith(prefix): print title, '-', 'BAD!', 'BAD!', '#' * 100 return title = title[len(prefix):] print title article = pywikibot.Page(site, u"Шаблон:%s" % title) content = article.get() edited = convert_wiki_date(article.editTime()) edited = make_aware(edited, pytz.UTC) words = title.split(' ') morph = words[0] if morph not in [u'adv', u'conj', u'interj', u'гл', u'глагол', u'мест', u'прил', u'сущ', u'числ', u'падежи', u'prep', u'affix', u'intro', u'phrase', u'suffix', u'predic', u'склонение', u'part', u'артикль', u'article', u'арт', u'деепр', u'onomatop', u'interj1', u'прич', u'герундий', u'склон', u'степени', u'междом', u'спряжения', u'спряжение', u'словоизм', u'сущ2', u'принад', u'palat', u'abbrev', u'measure', u'morph', u'prefix', u'ein', u'союз', u'словоформы', u'глаг', u'послел', u'послелог', u'падежи-мест', u'нар', u'морфема', u'межд', ]: print u'm →', title return prefix = u"%s %s" % (morph, lang) if not title.startswith(prefix): print u'e →', title, '(%s)' % lang return info = title[len(prefix):].strip() kind, gender, num = parse_template_title(title) return TemplateInflection( title=title, content=content, edited=edited, lang=lang, morph=morph, info=info, kind=kind, gender=gender, num=num, )
def load_templates(): import pywikibot site = pywikibot.Site('ru') category_names = [ u"Категория:Викисловарь:Шаблоны:Языки", u"Категория:Шаблоны:Языковые_заголовки", ] for category_name in category_names: category = pywikibot.Category(site, category_name) for article in category.articles(): title = article.title() m = re.match(u'^Шаблон:-([^-].*)-$', title) if m: lang = m.group(1) print lang content = article.get(get_redirect=True) edited = aware(convert_wiki_date(article.editTime())) LanguageTemplate.objects.get_or_create(lang_code=lang, edited=edited, content=content)
def load_templates(): import pywikibot site = pywikibot.Site('ru') category_names = [ u"Категория:Шаблоны:Названия_языков", # u"Категория:Шаблоны:Языковые_заголовки", ] for category_name in category_names: category = pywikibot.Category(site, category_name) for article in category.articles(): title = article.title() m = re.match(u'^Шаблон:(?P<case>[Ll])ang-(?P<lang>.+)$', title) # if not m: # print title if m: lang = m.group('lang') case = m.group('case') lower = case == 'l' print lang content = article.get(get_redirect=True) edited = aware(convert_wiki_date(article.editTime())) TemplateLangNames.objects.get_or_create( lang=lang, lower=lower, edited=edited, content=content)
continue title = orig_title[len(prefix) :] print title if title == u"сущ": continue if not re.match(u"^сущ/[fmn]\d/\d+$", title): continue # print 'ok' # continue old = WordInflectionMassEdit.objects.get(title=title) old_content = old.content new_content = remove_utf8mb4(item.get(get_redirect=True)) edited = aware(convert_wiki_date(item.editTime())) print edited update_content = new_content if new_content != old_content: if u"{{пишу}}" in new_content or u"{{message box" in new_content: print u"×" * 20, u"{{пишу}} detected" first_edited = None break changing_started = False old_items, old_reports = parse_mass_edit(old_content) new_items, new_reports = parse_mass_edit(new_content) if len(old_items) != len(new_items) or set(old_items.keys()) != set(new_items.keys()): print u"×" * 20, "WRONG DIFFERENCE IN TITLES (COUNTS OR VALUES)" continue for word in sorted(old_items.keys(), key=lambda x: x[::-1]):
def load_missed_pages(): print 'load_missed_pages()' g = AllpagesPageGenerator(start=u"!") i = 0 j = 0 redirects = [] edits = [] contents = [] # pages = [] print dt(), 'starting cycle?' for item in g: j += 1 if not j % 1000: print dt(), 'j =', j if j < 178000: continue #print item title = item.title() if ':' in title: print dt(), ':' * 20, title # continue # print title try: # print dt(), title if title != remove_utf8mb4(title): print dt(), title, '#' * 30, 'UTF8-MB4' continue Page.objects.get(title=title) # except MultipleObjectsReturned: # print '#' * 10, 'MULTIPLE' except ObjectDoesNotExist: print dt(), title, '(ok)' try: content = item.get(get_redirect=True) except NoPage: print dt(), '#' * 30, 'NO_PAGE' continue content = remove_utf8mb4(content) redirect = item.isRedirectPage() # edited = item.editTime() edited = convert_wiki_date(item.editTime()) # edited = make_naive(wiki_page.editTime(), pytz.UTC) edited = make_aware(edited, pytz.UTC) # edited = make_aware(item.editTime(), pytz.UTC) # edited = make_aware(item.editTime(), None) # print edited page = Page.objects.create(title=title) PageContent.objects.create(page=page, content=content) PageRedirect.objects.create(page=page, redirect=redirect) PageEdited.objects.create(page=page, edited=edited) # contents.append(PageContent(page=page, content=content)) # if len(contents) > 100: # PageContent.objects.bulk_create(contents) # contents = [] # print '#' * 30, 'contents added' # # redirects.append(PageRedirect(page=page, redirect=redirect)) # if len(redirects) > 1000: # PageRedirect.objects.bulk_create(redirects) # redirects = [] # print '#' * 30, 'redirects added' # # edits.append(PageEdited(page=page, edited=edited)) # if len(edits) > 1000: # PageEdited.objects.bulk_create(edits) # edits = [] # print '#' * 30, 'edits added' # break i += 1 if not i % 100: print dt(), '*' * 20, i # PageContent.objects.bulk_create(pages) # pages = [] sleep(1)
def update_missed_data():
    """Backfill missing content / edit-time / redirect rows for stored pages.

    Iterates ``Page.objects.iterate()`` (skipping the first 34999 pages) and
    checks, per page, whether ``PageContent``, ``PageEdited`` and
    ``PageRedirect`` rows exist.  Whatever is missing is fetched from the
    ``'ru'`` wiki and created.  If the wiki raises ``NoPage`` the page is
    copied into ``PageDeleted`` and removed locally.
    """
    print 'update_missed_data()'
    i = 0
    # These lists are never filled (the code that appended to them is
    # commented out below), so the bulk_create calls at the end get [].
    redirects = []
    edits = []
    for page in Page.objects.iterate():
        i += 1
        # Hard-coded resume offset: pages before the 35000th are skipped.
        if i < 35000:
            continue
        if not i % 1000:
            print dt(), 'i =', i
        need_content = need_redirect = need_edited = False
        # Probe each related table; a missing row sets the matching flag.
        try:
            PageContent.objects.get(page=page)
        except ObjectDoesNotExist:
            need_content = True
        try:
            PageEdited.objects.get(page=page)
        except ObjectDoesNotExist:
            need_edited = True
        try:
            PageRedirect.objects.get(page=page)
        except ObjectDoesNotExist:
            need_redirect = True
        if need_content or need_edited or need_redirect:
            print dt(), page.title
            site = pywikibot.Site('ru')
            # wiki_page = pywikibot.Page(site, page.title.decode('utf-8'))
            wiki_page = pywikibot.Page(site, page.title)
            try:
                if need_content:
                    content = wiki_page.get(get_redirect=True)
                    content = remove_utf8mb4(content)
                    PageContent.objects.create(page=page, content=content)
                    print '- contents added'
                if need_redirect:
                    redirect = wiki_page.isRedirectPage()
                    PageRedirect.objects.create(page=page, redirect=redirect)
                    print '- redirect added'
                    # redirects.append(PageRedirect(page=page,
                    #                               redirect=redirect))
                    # if len(redirects) > 1000:
                    #     PageRedirect.objects.bulk_create(redirects)
                    #     redirects = []
                    #     print '#' * 30, 'redirects added'
                if need_edited:
                    edited = convert_wiki_date(wiki_page.editTime())
                    # edited = make_naive(wiki_page.editTime(), pytz.UTC)
                    # edited = make_aware(wiki_page.editTime(), None)
                    edited = make_aware(edited, pytz.UTC)
                    PageEdited.objects.create(page=page, edited=edited)
                    print '- edited added'
                    # edits.append(PageEdited(page=page, edited=edited))
                    # if len(edits) > 1000:
                    #     PageEdited.objects.bulk_create(edits)
                    #     edits = []
                    #     print '#' * 30, 'edits added'
            except NoPage:
                print dt(), '#' * 20, page.title, 'DELETED?'
                # todo: remove?
                # Archive whatever was already stored locally; values that
                # were missing fall back to '' / None.
                content = page.content if not need_content else ''
                redirect = page.page_redirect.redirect if not need_redirect else None
                edited = page.page_edited.edited if not need_edited else None
                PageDeleted.objects.create(page=page.pk, title=page.title, content=content, redirect=redirect, edited=edited)
                # if not need_content:
                #     page.page_content.delete()
                page.delete()
    PageRedirect.objects.bulk_create(redirects)
    PageEdited.objects.bulk_create(edits)
def process_recent():
    """Apply recent wiki changes to the local page tables.

    Takes the newest ``PageEdited`` timestamp, truncates it to the hour and
    passes it as ``end`` to ``RecentChangesPageGenerator``.  For every
    generated page without ``':'`` in its title, the ``Page`` row is fetched
    or created and its content, redirect flag and edit time are updated when
    they differ.  Pages for which the wiki raises ``NoPage`` are copied into
    ``PageDeleted`` and removed locally.
    """
    print 'process_recent()'
    i = 0
    # Newest stored edit; the name `edited` is rebound inside the loop below.
    edited = PageEdited.objects.order_by('-edited')[0]
    print 'updating untill:', edited.edited
    # Minutes and smaller units are dropped (truncated to the hour).
    end = datetime(edited.edited.year, edited.edited.month, edited.edited.day, edited.edited.hour,
                   # edited.edited.minute,
                   )
    g = RecentChangesPageGenerator(end=end)
    for item in g:
        title = item.title()
        if ':' in title:
            continue
        i += 1
        # print title
        # print repr(title)
        # print repr(title.encode('utf-8'))
        # print repr(title.encode('utf-16be'))
        # print repr(title.encode('utf-32be'))
        # # print repr(title.encode('utf-16').decode('utf-16'))
        # page = Page.objects.create(title=title)
        # page = Page.objects.create(title=title.encode('utf-16be'))
        # page = Page.objects.create(title=title.encode('utf-16'))
        # page = Page.objects.create(title=title.encode('utf-8'))
        # # page = Page.objects.create(title=title.encode('utf-8'))
        # # page, created = Page.objects.get_or_create(title=title)
        # continue
        try:
            page, created = Page.objects.get_or_create(title=title)
        except Exception:
            # Any failure here is reported as a bad title and the item skipped.
            print dt(), title, '-', '@' * 30, 'bad title'
            continue
        try:
            edited = convert_wiki_date(item.editTime())
            edited = make_aware(edited, pytz.UTC)
        except NoPage:
            print dt(), title, '-', 'DELETED', '#' * 20
            # todo: What if the page has not been added to the DB yet?
            # I.e. it was created and deleted within a single check.
            # Note: `content`, `edited`, `redirect` are model rows here, and
            # `created` is overwritten by each get_or_create call.
            content, created = PageContent.objects.get_or_create(page=page)
            edited, created = PageEdited.objects.get_or_create(page=page)
            redirect, created = PageRedirect.objects.get_or_create(page=page)
            PageDeleted.objects.create(page=page.pk, title=page.title, content=content.content, edited=edited.edited, redirect=redirect.redirect)
            page.delete()
            continue
        content = item.get(get_redirect=True)
        content = remove_utf8mb4(content)
        redirect = item.isRedirectPage()
        print dt(), edited, '-', i, '-', title, '| pk =', page.pk
        # `created` still holds the result of the Page get_or_create above.
        if created:
            print '- page CREATED: %s' % page.pk
        # try:
        #     page = Page.objects.get(title__exact=title)
        # except ObjectDoesNotExist:
        #     page = Page.objects.create(title=title)
        # Each related row is saved only when its stored value differs.
        data, created = PageContent.objects.get_or_create(page=page)
        if data.content != content:
            data.content = content
            data.save()
            print '- contents updated'
        data, created = PageRedirect.objects.get_or_create(page=page)
        if data.redirect != redirect:
            data.redirect = redirect
            data.save()
            print '- redirect updated'
        data, created = PageEdited.objects.get_or_create(page=page)
        if data.edited != edited:
            data.edited = edited
            data.save()
            print '- edited updated'
def process_item(self, item, i):
    """Process one wiki page: store its data and guess its creation language.

    Titles that are invalid, or that contain ``':'`` without starting with
    ``u"Шаблон:"``, are ignored.  In read-only mode the work is delegated to
    ``self.process_item_readonly``.  Otherwise the page row is obtained via
    ``self.get_page``; if ``self.get_data`` yields no edit time the page is
    logged as deleted and ``page.delete_and_log()`` is called.  For existing
    pages the oldest revision is inspected to derive creation time, author
    and a language code, and everything is passed to ``self.update_data``.

    Args:
        item: wiki page object providing ``title()`` and ``revisions()``.
        i: running counter of processed items, used for progress output.

    Returns:
        The edit time for a processed title without ``':'``; ``None`` in all
        early-exit cases and for titles containing ``':'``; in read-only mode
        whatever ``self.process_item_readonly`` returns.
    """
    # todo: create external mechanism of pausing work (actual for big processors)
    try:
        title = item.title()
    except InvalidTitle:
        print 'Wrong title', '#' * 120
        return
    if ':' in title:
        # todo: we need this only for RecentProcessor
        if title.startswith(u"Шаблон:"):
            # if title.startswith(u"Категория:") or title.startswith(u"Шаблон:"):
            # print '-' * 40
            pass
        else:
            return
    if self.readonly:
        return self.process_item_readonly(item, title, i)
    # if Page.objects.filter(title=title):
    #     print dt(), title, '- exists'
    #     return
    page = self.get_page(title, i)
    if not page:
        return
    content, content_to_db, edited, redirect = self.get_data(item)
    if not edited:
        # No edit time is treated as "page was deleted on the wiki".
        # print dt(), title.encode('cp1251'), '-', 'DELETED', '#' * 20
        print dt(), '& PAGE WAS DELETED - %d - & %s // pk=%d' \
            % (i, transliterate(title), page.pk)
        # print dt(), transliterate(title), '-', 'DELETED', '#' * 10
        # print dt(), '-', 'DELETED', '#' * 20
        page.delete_and_log()
        return
    # print dt(), edited, '-', i, '-', title.encode('cp1251'), '| pk =', page.pk
    # Progress line; ':' marks titles containing a colon, '#' all others.
    if not self.output_interval or not i % self.output_interval:
        print dt(),
        if ':' in title:
            print ':',
        else:
            print '#',
        print edited, '-', i, '-',
        if ':' in title:
            print ':',
        else:
            print '#',
        print transliterate(title), ' // pk=%s' % page.pk
    # print dt(), edited, '-', i, '-', '| pk =', page.pk
    log = transliterate(title)
    # First item yielded with reverseOrder=True; used below as the creating
    # revision (timestamp, user, text).
    oldest = next(item.revisions(reverseOrder=True, total=1, content=True))
    created_at = aware(convert_wiki_date(oldest.timestamp))
    created_author = oldest.user
    # '?' is the initial value; '??' means the oldest revision has no text.
    created_lang = '?'
    if oldest.text is None:
        created_lang = '??'
    else:
        # print oldest.text
        # print repr(oldest.text)
        # print
        m = re.search(u'\{\{-([-\w]+|Праславянский)-(?:\|[^}]+)?\}\}', oldest.text,
                      flags=re.MULTILINE | re.UNICODE)
        m2 = re.search(u'\{\{заголовок\|(en|tr|jbo|la|it|es|fo|da|de|pt|hr|pl|fi|lv|nl|sv|no|io|gd|az|ms|id|nv|nds|nah|hu|nrm|vls|fur|ga|hu|lb|hsb|li|tpi|gv|fr|cy|fy|sc|fo|zu|sw|mg|oc|ca|qu|ln|eo|so|cs|uz|et|vo|ku|su|sk|mi|kw|bar|br|an|sq|bs|af)\|add=\w*\}\}', oldest.text)
        m_new = re.search(u'\{\{NEW\|lang=([-a-z]+)\|cat=', oldest.text,
                          flags=re.UNICODE | re.IGNORECASE)
        # The checks are tried in order and the first match wins.
        if m:
            created_lang = m.group(1)
        elif re.search(u'^= *Праславянский', oldest.text,
                       flags=re.MULTILINE | re.UNICODE):
            created_lang = u'Праславянский'
        elif re.search(u'=<div style="background-color0?:#\{\{h1c\}\}">Эсперанто</div>=', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = u'eo'
        elif m2:
            created_lang = m2.group(1)
        elif re.search(u'== *\[\[?(:w)?:en:[^|]+\|английский\]\]? *== *\n', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'en'
        elif re.search(u'== *\[\[?(:w)?:de:[^|]+\|немецкий\]\]? *== *\n', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'de'
        elif re.search(u'== *\[?\[?(английский|english)\]?\]? *== *\n', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'en'
        elif re.search(u'== *\[?\[?(французский)\]?\]? *== *\n', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'fr'
        elif re.search(u'== *\[?\[?(италь?янский)\]?\]? *== *\n', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'it'
        elif re.search(u'== *\[?\[?(Нидерландский)\]?\]? *== *\n', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'nl'
        elif re.search(u'\{\{(английский|en)\}\}', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'en'
        elif re.search(u'\{\{(Нидерландский|nl)\}\}', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'nl'
        elif re.search(u'\{\{(немецкий|de)\}\}', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'de'
        elif re.search(u'\{\{(it)\}\}', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'it'
        elif m_new:
            created_lang = m_new.group(1)
        elif re.search(u'#(redirect|перенаправление)', oldest.text,
                       flags=re.MULTILINE | re.UNICODE | re.IGNORECASE):
            # Redirect pages get '-' as their language.
            created_lang = u'-'
        else:
            # Nothing matched: dump the text for inspection; created_lang
            # stays '?'.
            save_file(settings.FILES_PATH + "/errors/created_lang/%s.txt" % page.pk,
                      oldest.text.encode('utf-8'))
    # print
    # print transliterate(title), created_at
    # print transliterate(oldest.user), transliterate(created_lang)
    # print
    self.update_data(page, content, content_to_db, edited, redirect, log, created_at, created_author, created_lang)
    # Titles containing ':' are stored but yield no edit time to the caller.
    if ':' in title:
        return
    return edited