def get_wiki_from_iso_redirect(): import pywikibot site = pywikibot.Site('en', 'wikipedia') for item in LanguageIsoWiki.objects.all(): content = item.content # print content m = re.search('^#redirect ?\[\[(?P<redirect>[^]]+)\]\]', content, re.IGNORECASE) # if not m: # print content if m: redirect = m.group('redirect') print redirect if item.redirect: print "×", "already exists" continue page = pywikibot.Page(site, redirect) try: content_redirect = page.get(get_redirect=True) content_redirect = remove_utf8mb4(content_redirect) print "→", "ok" except pywikibot.exceptions.NoPage: content_redirect = "" print "#", "doesn't exits!", "#" * 200 item.redirect = redirect item.content_redirect = content_redirect item.save()
def get_or_load(self, title): try: return self.get(title=title) except ObjectDoesNotExist: import pywikibot site = pywikibot.getSite(self.lang, "wikipedia") page = pywikibot.Page(site, title) try: content = page.get(get_redirect=True) content = remove_utf8mb4(content) print "creating NEW wiki:", title wiki = self.create(title=title, exists=True) if page.isRedirectPage(): print " process redirect:", content m = re.search( u"^#(перенаправление|redirect)[:\s]*\[\[(?P<redirect>[^]]+)\]\]", content.strip(), re.IGNORECASE ) if not m: raise Exception("Can't parse redirect") redirect = m.group("redirect") wiki.redirect_to = self.get_or_load(redirect) wiki.save() self.content_model.objects.create(wiki=wiki, content=content) except pywikibot.exceptions.NoPage: print "creating EMPTY wiki:", title wiki = self.create(title=title, exists=False) self.content_model.objects.create(wiki=wiki) return wiki
def get_data(self, item):
    """Collect the text and metadata of the wiki page ``item``.

    :param item: page object providing ``get``, ``editTime`` and
        ``isRedirectPage``.
    :returns: a 4-tuple ``(content, content_to_db, edited, redirect)``:
        the page text, that text passed through ``remove_utf8mb4``, the
        edit time run through ``convert_wiki_date`` and ``aware``, and the
        result of ``item.isRedirectPage()``.  All four values are ``None``
        when ``NoPage`` is raised.
    """
    try:
        raw_text = item.get(get_redirect=True)
        db_text = remove_utf8mb4(raw_text)
        edit_time = aware(convert_wiki_date(item.editTime()))
        is_redirect = item.isRedirectPage()
    except NoPage:
        return (None,) * 4
    return raw_text, db_text, edit_time, is_redirect
def get_eng_languages_from_wiki(): import pywikibot site = pywikibot.Site('en', 'wikipedia') category = pywikibot.Category(site, u"Category:Languages_with_ISO_639-2_code") for item in category.articles(): title = item.title() print title content = item.get() content = remove_utf8mb4(content) LanguageRuWiki.objects.create(ru_cat=title, wiki_lang='en', content=content)
def get_wiki_from_iso(): import pywikibot site = pywikibot.Site('en', 'wikipedia') for item in LanguageIso.objects.all(): lang = item.lang print lang page = pywikibot.Page(site, u"ISO_639:%s" % lang) try: content = page.get(get_redirect=True) content = remove_utf8mb4(content) LanguageIsoWiki.objects.create(lang=lang, content=content) print "→", "created" except pywikibot.exceptions.NoPage: LanguageIsoWiki.objects.create(lang=lang, content="") print "#", "doesn't exits!", "#" * 200
def get_rus_languages_from_wiki(): import pywikibot site = pywikibot.Site('ru', 'wikipedia') category = pywikibot.Category(site, u"Категория:Языки_и_диалекты_по_алфавиту") skip = True for item in category.articles(): title = item.title() if title == u"Готский язык": skip = False if skip: print u'×', title continue print title content = item.get() content = remove_utf8mb4(content) LanguageRuWiki.objects.create(ru_cat=title, content=content)
def get_eng_languages_from_wikt_wiki(): import pywikibot site = pywikibot.Site('en', 'wikipedia') for item in LanguageData.objects.all(): title = item.en_cat print title data, created = LanguageRuWiki.objects.get_or_create(ru_cat=title, wiki_lang='en') if created: page = pywikibot.Page(site, title) try: content = page.get(get_redirect=True) except pywikibot.exceptions.NoPage: print "# doesn't exits!" continue content = remove_utf8mb4(content) data.content = content data.save() print '→ created!' else: print '× already exist'
if not orig_title.startswith(prefix): continue title = orig_title[len(prefix) :] print title if title == u"сущ": continue if not re.match(u"^сущ/[fmn]\d/\d+$", title): continue # print 'ok' # continue old = WordInflectionMassEdit.objects.get(title=title) old_content = old.content new_content = remove_utf8mb4(item.get(get_redirect=True)) edited = aware(convert_wiki_date(item.editTime())) print edited update_content = new_content if new_content != old_content: if u"{{пишу}}" in new_content or u"{{message box" in new_content: print u"×" * 20, u"{{пишу}} detected" first_edited = None break changing_started = False old_items, old_reports = parse_mass_edit(old_content) new_items, new_reports = parse_mass_edit(new_content) if len(old_items) != len(new_items) or set(old_items.keys()) != set(new_items.keys()): print u"×" * 20, "WRONG DIFFERENCE IN TITLES (COUNTS OR VALUES)" continue
def load_missed_pages(): print 'load_missed_pages()' g = AllpagesPageGenerator(start=u"!") i = 0 j = 0 redirects = [] edits = [] contents = [] # pages = [] print dt(), 'starting cycle?' for item in g: j += 1 if not j % 1000: print dt(), 'j =', j if j < 178000: continue #print item title = item.title() if ':' in title: print dt(), ':' * 20, title # continue # print title try: # print dt(), title if title != remove_utf8mb4(title): print dt(), title, '#' * 30, 'UTF8-MB4' continue Page.objects.get(title=title) # except MultipleObjectsReturned: # print '#' * 10, 'MULTIPLE' except ObjectDoesNotExist: print dt(), title, '(ok)' try: content = item.get(get_redirect=True) except NoPage: print dt(), '#' * 30, 'NO_PAGE' continue content = remove_utf8mb4(content) redirect = item.isRedirectPage() # edited = item.editTime() edited = convert_wiki_date(item.editTime()) # edited = make_naive(wiki_page.editTime(), pytz.UTC) edited = make_aware(edited, pytz.UTC) # edited = make_aware(item.editTime(), pytz.UTC) # edited = make_aware(item.editTime(), None) # print edited page = Page.objects.create(title=title) PageContent.objects.create(page=page, content=content) PageRedirect.objects.create(page=page, redirect=redirect) PageEdited.objects.create(page=page, edited=edited) # contents.append(PageContent(page=page, content=content)) # if len(contents) > 100: # PageContent.objects.bulk_create(contents) # contents = [] # print '#' * 30, 'contents added' # # redirects.append(PageRedirect(page=page, redirect=redirect)) # if len(redirects) > 1000: # PageRedirect.objects.bulk_create(redirects) # redirects = [] # print '#' * 30, 'redirects added' # # edits.append(PageEdited(page=page, edited=edited)) # if len(edits) > 1000: # PageEdited.objects.bulk_create(edits) # edits = [] # print '#' * 30, 'edits added' # break i += 1 if not i % 100: print dt(), '*' * 20, i # PageContent.objects.bulk_create(pages) # pages = [] sleep(1)
def update_missed_data(): print 'update_missed_data()' i = 0 redirects = [] edits = [] for page in Page.objects.iterate(): i += 1 if i < 35000: continue if not i % 1000: print dt(), 'i =', i need_content = need_redirect = need_edited = False try: PageContent.objects.get(page=page) except ObjectDoesNotExist: need_content = True try: PageEdited.objects.get(page=page) except ObjectDoesNotExist: need_edited = True try: PageRedirect.objects.get(page=page) except ObjectDoesNotExist: need_redirect = True if need_content or need_edited or need_redirect: print dt(), page.title site = pywikibot.Site('ru') # wiki_page = pywikibot.Page(site, page.title.decode('utf-8')) wiki_page = pywikibot.Page(site, page.title) try: if need_content: content = wiki_page.get(get_redirect=True) content = remove_utf8mb4(content) PageContent.objects.create(page=page, content=content) print '- contents added' if need_redirect: redirect = wiki_page.isRedirectPage() PageRedirect.objects.create(page=page, redirect=redirect) print '- redirect added' # redirects.append(PageRedirect(page=page, # redirect=redirect)) # if len(redirects) > 1000: # PageRedirect.objects.bulk_create(redirects) # redirects = [] # print '#' * 30, 'redirects added' if need_edited: edited = convert_wiki_date(wiki_page.editTime()) # edited = make_naive(wiki_page.editTime(), pytz.UTC) # edited = make_aware(wiki_page.editTime(), None) edited = make_aware(edited, pytz.UTC) PageEdited.objects.create(page=page, edited=edited) print '- edited added' # edits.append(PageEdited(page=page, edited=edited)) # if len(edits) > 1000: # PageEdited.objects.bulk_create(edits) # edits = [] # print '#' * 30, 'edits added' except NoPage: print dt(), '#' * 20, page.title, 'DELETED?' # todo: remove? 
content = page.content if not need_content else '' redirect = page.page_redirect.redirect if not need_redirect else None edited = page.page_edited.edited if not need_edited else None PageDeleted.objects.create(page=page.pk, title=page.title, content=content, redirect=redirect, edited=edited) # if not need_content: # page.page_content.delete() page.delete() PageRedirect.objects.bulk_create(redirects) PageEdited.objects.bulk_create(edits)
def process_recent(): print 'process_recent()' i = 0 edited = PageEdited.objects.order_by('-edited')[0] print 'updating untill:', edited.edited end = datetime(edited.edited.year, edited.edited.month, edited.edited.day, edited.edited.hour, # edited.edited.minute, ) g = RecentChangesPageGenerator(end=end) for item in g: title = item.title() if ':' in title: continue i += 1 # print title # print repr(title) # print repr(title.encode('utf-8')) # print repr(title.encode('utf-16be')) # print repr(title.encode('utf-32be')) # # print repr(title.encode('utf-16').decode('utf-16')) # page = Page.objects.create(title=title) # page = Page.objects.create(title=title.encode('utf-16be')) # page = Page.objects.create(title=title.encode('utf-16')) # page = Page.objects.create(title=title.encode('utf-8')) # # page = Page.objects.create(title=title.encode('utf-8')) # # page, created = Page.objects.get_or_create(title=title) # continue try: page, created = Page.objects.get_or_create(title=title) except Exception: print dt(), title, '-', '@' * 30, 'bad title' continue try: edited = convert_wiki_date(item.editTime()) edited = make_aware(edited, pytz.UTC) except NoPage: print dt(), title, '-', 'DELETED', '#' * 20 # todo: А если страница еще не была добавлена в БД? Т.е. 
создание/удаление в пределах одной проверки content, created = PageContent.objects.get_or_create(page=page) edited, created = PageEdited.objects.get_or_create(page=page) redirect, created = PageRedirect.objects.get_or_create(page=page) PageDeleted.objects.create(page=page.pk, title=page.title, content=content.content, edited=edited.edited, redirect=redirect.redirect) page.delete() continue content = item.get(get_redirect=True) content = remove_utf8mb4(content) redirect = item.isRedirectPage() print dt(), edited, '-', i, '-', title, '| pk =', page.pk if created: print '- page CREATED: %s' % page.pk # try: # page = Page.objects.get(title__exact=title) # except ObjectDoesNotExist: # page = Page.objects.create(title=title) data, created = PageContent.objects.get_or_create(page=page) if data.content != content: data.content = content data.save() print '- contents updated' data, created = PageRedirect.objects.get_or_create(page=page) if data.redirect != redirect: data.redirect = redirect data.save() print '- redirect updated' data, created = PageEdited.objects.get_or_create(page=page) if data.edited != edited: data.edited = edited data.save() print '- edited updated'