def get_page(self, title, i):
    """Fetch or create the ``Page`` row for *title*.

    On creation, emits the ``page_created`` signal and logs a marker line.
    Returns the ``Page`` instance, or ``None`` when the title cannot be
    stored (the broad catch deliberately skips bad titles instead of
    aborting the whole crawl).

    :param title: wiki page title (unicode)
    :param i: sequential item counter, used only in log output
    """
    try:
        page, created = Page.objects.get_or_create(title=title)
        if created:
            page_created.send(page)
            print dt(), '& PAGE WAS CREATED - %d - & %s // pk=%d' \
                % (i, transliterate(title), page.pk)
        return page
    except Exception:
        # Best-effort: log the offending title and move on.
        # todo: save them as repr(string) in this case?
        # print dt(), title.encode('cp1251'), '-', '@' * 30, 'bad title'
        print dt(), transliterate(title), '-', '@' * 30, 'bad title'
        # print dt(), '-', '@' * 30, 'bad title'
        return None
def process_item_readonly(self, item, title, i):
    """Dry-run variant of item processing: log what WOULD happen, touch nothing.

    Logs a deletion marker (to stdout and stderr) when the item has no edit
    data; otherwise logs a progress line.  Returns the last-edit marker, or
    ``None`` for deleted or namespaced titles.
    """
    content, content_to_db, edited, redirect = self.get_data(item)
    if not edited:
        # pk is printed as 0 because no DB row is looked up in readonly mode.
        # print dt(), item.title().encode('cp1251'), '-', 'DELETED', '#' * 20
        print dt(), '& PAGE WAS DELETED - %d - & %s // pk=%d' \
            % (i, transliterate(title), 0)
        print >> sys.stderr, dt(), '&&& Deleted: %s // pk=%d' \
            % (transliterate(title), 0)
        # print dt(), '-', 'DELETED', '#' * 20
        return
    # print dt(), edited, '-', i, '-', item.title().encode('cp1251')
    print dt(), edited, '-', i, '-'#, item.title().encode('cp1251')
    if ':' in title:
        # Namespaced pages (Template:, Category:, ...) yield no marker.
        return
    return edited
def remove_mistake(self, page):
    """Delete the Mistake record for *page*; a missing record is not an error."""
    try:
        mistake = self.mistakes.get(word=page.title)
        mistake.delete()
    except ObjectDoesNotExist:
        # This happened, for example, when a page was first deleted and then
        # protected (probably against re-creation?).
        print '### already deleted', transliterate(page.title)
def change_content_action(self, page, content, **kwargs): if page.title in [u'Заглавная страница']: return content all_values = W.cf_values(content) patterns = P.template_cf.findall(content) for value in page.new_values: if value in all_values: print u'In "%s" value "%s" already exist!' % \ (transliterate(page.title), transliterate(value)), \ '!' * 20 continue all_values.append(value) new_tpl = u'{{Cf|%s}}' % ', '.join(all_values) if patterns: new_content = content.replace(patterns[0], new_tpl) for pattern in patterns[1:]: new_content = new_content.replace(pattern, '') else: new_content = new_tpl + '\n\n' + content return new_content
def receive(self, signal, page):
    """Signal handler keeping the Mistake table in sync with *page*.

    ``page_content_changed``: re-verify the page and drop, create or update
    its Mistake record accordingly.  ``page_deleted``: drop the record if
    one was known.  ``self.old_titles`` holds titles that already have a
    Mistake record.
    """
    if signal == 'page_content_changed':
        pass
        # print '@ signal changing received', transliterate(page.title)
        # NOTE(review): check(page) appears to return True when the page now
        # passes verification -- confirm against the checker implementation.
        if self.check(page):
            # Page is clean: remove the stale mistake, if we had one.
            if page.title in self.old_titles:
                print '@@ remove mistake', transliterate(page.title)
                self.remove_mistake(page)
        else:
            display = self.get_display(page)
            if page.title not in self.old_titles:
                print '@@ create mistake', transliterate(page.title)
                Mistake.objects.create(verification=self.verification,
                                       word=page.title, display=display)
            else:
                # Known mistake: refresh its display text.
                print '@@ change mistake', transliterate(page.title)
                mistake = self.mistakes.get(word=page.title)
                mistake.display = display
                mistake.save()
    elif signal == 'page_deleted':
        pass
        # print '@ signal deletion received', transliterate(page.title)
        if page.title in self.old_titles:
            print '@@ remove mistake', transliterate(page.title)
            self.remove_mistake(page)
def process_sub_tree(self, prefix, sub_tree): for sub_title, value in sub_tree: title = u"%s/%s" % (prefix, sub_title) if type(value) == list: self.process_sub_tree(title, value) elif issubclass(value, BaseChecker): print print u'-' * 79 print u'Path:', transliterate(title) module = value.__module__.\ replace(u'checkers.tree.', '') print u'Class:', module, value.__name__ print u'-' * 79 self.build_report(title, value())
def process_potentially_new_item(self, item, i): try: title = item.title() except InvalidTitle: print 'Wrong title', '#' * 120 return if ':' in title: return # try: # page = Page.objects.get(title=title) # except (Page.DoesNotExist, _mysql_exceptions.Warning): # return content, content_to_db, edited, redirect = self.get_data(item) if not edited: return if not self.output_interval or not i % self.output_interval: print dt(), if ':' in title: print ':', else: print '#', print edited, '-', i, '-', if ':' in title: print ':', else: print '#', print transliterate(title), ' // pk=%s' % '?' # page.pk # if item.previous_revision_id == -1 and u'{{-ru-' in content: if u'{{-ru-' in content: created_at = convert_wiki_date(item.oldest_revision.timestamp) print transliterate(title), created_at # if created_at + timedelta(hours=5) > datetime.now(): if created_at + timedelta(days=7) > datetime.now(): if title not in self.titles and item.oldest_revision.user != 'CinBot': print print '-' * 100 print '|', transliterate(item.oldest_revision.user), edited print '|', transliterate(title) print '-' * 100 print self.items.append((title, item.oldest_revision.user, created_at)) self.titles.append(title) if ':' in title: return return edited
def process_items(self):
    """Walk all wiki pages from the last checkpoint and process each one.

    The resume point is persisted in ``AllPagesLastProcessed`` every 100
    items so an interrupted run continues where it stopped; a full pass
    resets it to u'!' (the first page).  Hard-stops after 100000 items,
    keeping the checkpoint for the next invocation.
    """
    i = 0
    print dt(), 'processing all pages on wiktionary'
    info = AllPagesLastProcessed.objects.all().first()
    if not info:
        # First run ever: start from the very beginning of the page list.
        info = AllPagesLastProcessed.objects.create(title=u'!')
    last = info.title
    for item in AllpagesPageGenerator(start=last):
    # for item in AllpagesPageGenerator(start=u"!"):  # namespace=10,14
    # for item in AllpagesPageGenerator(start=u"!", namespace=14):
    # for item in AllpagesPageGenerator(start=u"этимология:апп", namespace=10):
        i += 1
        self.process_item(item, i)
        if not i % 100:
            # Checkpoint; a title the DB cannot store is logged and skipped.
            try:
                info.title = item.title()
                info.save()
            except Exception:
                print >> sys.stderr, dt(), transliterate(info.title), '-', '#' * 10, 'bad title'
        if i == 100000:
            # Batch limit: keep the checkpoint so the next run resumes here.
            return
    # Completed a full pass -- reset the checkpoint to the start.
    info.title = u'!'
    info.save()
def process_slug(self, slug, title, checker):
    """Render and save the wiki report(s) for one checker entry *slug*.

    Three layouts, chosen by the shape of the first item:
    - items carry a ``key``: group by key, bucket groups by frequency
      (single / medium / frequent / very_frequent) and save one sub-report
      per bucket;
    - items carry ``sub_items``: one section per item with its unpacked
      sub-items;
    - otherwise: a flat list of item titles.
    An empty result saves a stub report.
    """
    print u'Report:', transliterate(title)
    items = CheckerItem.objects.filter(checker_entry__slug=slug).\
        order_by('title')
    # locale.setlocale(locale.LC_ALL, settings.LOCALE_FOR_INDEX_BUILDER)
    # sort(cmp=wiki_cmp)
    items = sorted(items, key=lambda x: x.title)
    count = len(items)
    description = checker.description
    if count:
        if items[0].key is not None:
            # Group items by their key.
            data = dict()
            for item in items:
                data.setdefault(item.key, list())
                data[item.key].append(item)
            # if items[0].key_desc is not None:
            sorting = {
                # todo: make this things optional and customizable
                'single': dict(),
                'medium': dict(),
                'frequent': dict(),
                'very_frequent': dict(),
            }
            # Bucket the key groups by how many items they hold.
            # todo: make this things optional and customizable:
            for key, sub_keys in data.items():
                if len(sub_keys) == 1:
                    sub_slug = 'single'
                elif len(sub_keys) < 20:
                    sub_slug = 'medium'
                elif len(sub_keys) < 100:
                    sub_slug = 'frequent'
                else:
                    sub_slug = 'very_frequent'
                sorting[sub_slug][key] = sub_keys
            order = ['single', 'medium', 'frequent', 'very_frequent']
            ordered_sorting = sorted(sorting.items(),
                                     key=lambda x: order.index(x[0]))
            # NOTE(review): the loop variable below shadows the *slug*
            # parameter, so sc_key()/sc_sub_key_desc() receive the bucket
            # name ('single', ...) rather than the checker slug -- confirm
            # whether that is intentional.
            for slug, slug_data in ordered_sorting:
                body = ''
                # count here is the number of key groups, not of items.
                count = len(slug_data)
                for key, sub_keys in slug_data.items():
                    body += checker.sc_key(slug) % key
                    for item in sub_keys:
                        body += checker.sc_sub_key_desc(slug) % \
                            (item.title, item.key_desc)
                sub_titles = {
                    'single': u'Единичные',
                    'medium': u'Редкие',
                    'frequent': u'Частые',
                    'very_frequent': u'Очень частые',
                }
                self.save_report(title + u'/' + sub_titles[slug],
                                 description, body, count)
            return
        else:
            if items[0].sub_items is not None:
                # One section per item, with its unpacked sub-items.
                body = ''
                for item in items:
                    body += checker.sc_item(slug) % item.title
                    sub_items = []
                    for sub_item in item.sub_items.split('\n'):
                        args = checker.unpack(sub_item)
                        sub_items.append(checker.sc_sub_item(slug) % args)
                    if checker.join_sub_items:
                        sub_items_string = \
                            checker.s_sub_item_joiner.join(sub_items)
                        body += checker.s_sub_items % sub_items_string
                    else:
                        body += u''.join(sub_items)
            else:
                # Flat layout: just the item titles.
                body = u''.join([checker.sc_item(slug) % item.title
                                 for item in items])
    else:
        body = u"\n''Отчёт пуст''"
    self.save_report(title, description, body, count)
def process_item(self, item, i):
    """Sync one wiki page into the local DB and classify its creation language.

    Skips invalid and (most) namespaced titles, delegates to the readonly
    path when ``self.readonly`` is set, deletes the local row when the wiki
    page has no edit data, and otherwise detects the language of the
    oldest revision via a chain of heuristic regexes before handing
    everything to ``update_data``.  Returns the last-edit marker, or
    ``None`` for skipped/deleted/namespaced items.
    """
    # todo: create external mechanism of pausing work (actual for big processors)
    try:
        title = item.title()
    except InvalidTitle:
        print 'Wrong title', '#' * 120
        return
    if ':' in title:
        # Only Template: pages pass through among namespaced titles.
        # todo: we need this only for RecentProcessor
        if title.startswith(u"Шаблон:"):
        # if title.startswith(u"Категория:") or title.startswith(u"Шаблон:"):
            # print '-' * 40
            pass
        else:
            return
    if self.readonly:
        return self.process_item_readonly(item, title, i)
    # if Page.objects.filter(title=title):
    #     print dt(), title, '- exists'
    #     return
    page = self.get_page(title, i)
    if not page:
        return
    content, content_to_db, edited, redirect = self.get_data(item)
    if not edited:
        # Page vanished on the wiki: log and remove the local row.
        # print dt(), title.encode('cp1251'), '-', 'DELETED', '#' * 20
        print dt(), '& PAGE WAS DELETED - %d - & %s // pk=%d' \
            % (i, transliterate(title), page.pk)
        # print dt(), transliterate(title), '-', 'DELETED', '#' * 10
        # print dt(), '-', 'DELETED', '#' * 20
        page.delete_and_log()
        return
    # print dt(), edited, '-', i, '-', title.encode('cp1251'), '| pk =', page.pk
    if not self.output_interval or not i % self.output_interval:
        # Progress line: ':' marks namespaced titles, '#' everything else.
        print dt(),
        if ':' in title:
            print ':',
        else:
            print '#',
        print edited, '-', i, '-',
        if ':' in title:
            print ':',
        else:
            print '#',
        print transliterate(title), ' // pk=%s' % page.pk
        # print dt(), edited, '-', i, '-', '| pk =', page.pk
    log = transliterate(title)
    # Oldest revision tells us who created the page, when, and in what
    # language (detected heuristically from its wikitext below).
    oldest = next(item.revisions(reverseOrder=True, total=1, content=True))
    created_at = aware(convert_wiki_date(oldest.timestamp))
    created_author = oldest.user
    created_lang = '?'   # '?' = undetected, '??' = revision text unavailable
    if oldest.text is None:
        created_lang = '??'
    else:
        # print oldest.text
        # print repr(oldest.text)
        # print
        # Standard language-section template, e.g. {{-en-}} / {{-ru-|...}}.
        m = re.search(u'\{\{-([-\w]+|Праславянский)-(?:\|[^}]+)?\}\}', oldest.text, flags=re.MULTILINE | re.UNICODE)
        # Legacy {{заголовок|xx|add=...}} header with an explicit lang code.
        m2 = re.search(u'\{\{заголовок\|(en|tr|jbo|la|it|es|fo|da|de|pt|hr|pl|fi|lv|nl|sv|no|io|gd|az|ms|id|nv|nds|nah|hu|nrm|vls|fur|ga|hu|lb|hsb|li|tpi|gv|fr|cy|fy|sc|fo|zu|sw|mg|oc|ca|qu|ln|eo|so|cs|uz|et|vo|ku|su|sk|mi|kw|bar|br|an|sq|bs|af)\|add=\w*\}\}', oldest.text)
        # {{NEW|lang=xx|cat=...}} bot-created stub marker.
        m_new = re.search(u'\{\{NEW\|lang=([-a-z]+)\|cat=', oldest.text, flags=re.UNICODE | re.IGNORECASE)
        # Heuristic chain, most reliable patterns first.
        if m:
            created_lang = m.group(1)
        elif re.search(u'^= *Праславянский', oldest.text, flags=re.MULTILINE | re.UNICODE):
            created_lang = u'Праславянский'
        elif re.search(u'=<div style="background-color0?:#\{\{h1c\}\}">Эсперанто</div>=', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = u'eo'
        elif m2:
            created_lang = m2.group(1)
        elif re.search(u'== *\[\[?(:w)?:en:[^|]+\|английский\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'en'
        elif re.search(u'== *\[\[?(:w)?:de:[^|]+\|немецкий\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'de'
        elif re.search(u'== *\[?\[?(английский|english)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'en'
        elif re.search(u'== *\[?\[?(французский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'fr'
        elif re.search(u'== *\[?\[?(италь?янский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'it'
        elif re.search(u'== *\[?\[?(Нидерландский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'nl'
        elif re.search(u'\{\{(английский|en)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'en'
        elif re.search(u'\{\{(Нидерландский|nl)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'nl'
        elif re.search(u'\{\{(немецкий|de)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'de'
        elif re.search(u'\{\{(it)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'it'
        elif m_new:
            created_lang = m_new.group(1)
        elif re.search(u'#(redirect|перенаправление)', oldest.text, flags=re.MULTILINE | re.UNICODE | re.IGNORECASE):
            created_lang = u'-'
        else:
            # Nothing matched: dump the wikitext for later inspection.
            save_file(settings.FILES_PATH + "/errors/created_lang/%s.txt" % page.pk, oldest.text.encode('utf-8'))
    # print
    # print transliterate(title), created_at
    # print transliterate(oldest.user), transliterate(created_lang)
    # print
    self.update_data(page, content, content_to_db, edited, redirect, log, created_at, created_author, created_lang)
    if ':' in title:
        return
    return edited