def handle(self, *args, **options):
    """Entry point of the deduplication command.

    Runs either an incremental scan (--since) or a full table scan
    (--chunks), merges each set of duplicate sentences into one "main"
    sentence, verifies the result, and optionally posts a report to the
    Wall. The two options are mutually exclusive.

    Side effects: populates self.all_dups / self.all_mains /
    self.all_audio / self.ver_* and writes to the report log; mutates
    the database unless --dry is set.
    """
    # --chunks (full scan) and --since (incremental scan) select
    # different code paths below, so both at once is an error.
    if options.get('chunks') and options.get('since'):
        print 'conflicting options...'
        return

    self.time_init()
    self.logger_init(options.get('path'))
    if options.get('verbose_out'):
        self.out_log.setLevel(logging.DEBUG)

    # option parsing; falsy values fall back to defaults
    chunks = options.get('chunks') or 10
    since = options.get('since')
    dry = bool(options.get('dry'))
    refresh = bool(options.get('refresh'))
    bot_name = options.get('bot_name') or 'Horus'

    # fetch (or lazily create) the bot account that authors the merge
    # edits. NOTE(review): stored on the Dedup class, not self — later
    # code reads self.bot, presumably resolved via the class attribute;
    # confirm Dedup is this command's class.
    try:
        Dedup.bot = Users.objects.get(username=bot_name)
    except Users.DoesNotExist:
        # in --dry mode nothing is written, so the missing bot is fine
        if not dry:
            Dedup.bot = Users.objects.create(
                username=bot_name, password='', email='*****@*****.**',
                since=now(), last_time_active=now().strftime('%Y%m%d'),
                level=1, is_public=1, send_notifications=0, group_id=1
            )

    pause_for = options.get('pause_for') or 0
    post_cmnt = bool(options.get('cmnt'))
    # base url for the link to the full log posted at the end
    url = options.get('url') or 'http://downloads.tatoeba.org/'
    if url[-1] != '/':
        url += '/'

    # accumulators shared by both scan routes, consumed by the
    # verification step below
    self.all_dups = []    # ids of sentences removed as duplicates
    self.all_mains = []   # ids of the sentences kept after each merge
    self.all_audio = []   # ids of duplicates that carried audio
    self.proceeded_sets = 0
    self.prev_progress = -100

    # incremental vs full scan routes
    if since:
        self.log_report('Running incremental scan at '+self.started_on.strftime('%Y-%m-%d %I:%M %p UTC'))

        # parse date (expects YYYY-MM-DD) into an aware UTC datetime
        since = datetime(*[int(s) for s in since.split('-')]).replace(tzinfo=utc)

        # pull in rows from time range; each row is reduced to a
        # (sha1-of-text as int, lang, id) triple for duplicate tallying
        self.log_report('Running filter on sentences added since '+since.strftime('%Y-%m-%d %I:%M %p'))
        sents = list(Sentences.objects.filter(created__range=[since, now()]))
        sents = [(int(sha1(sent.text).hexdigest(), 16), sent.lang, sent.id) for sent in sents]
        self.log_report('OK filtered '+str(len(sents))+' sentences')

        # tally to eliminate premature duplicates
        sent_tally = self.tally(sents)
        del sents

        # filter out duplicates (could probably be done in 1 raw query...)
        self.log_report('Running filter on sentences to find duplicates')
        dup_set = []
        for ids in sent_tally.itervalues():
            ids = list(ids)
            sents = list(Sentences.objects.filter(id__in=ids))
            # an id group only counts as a duplicate set when more than
            # one row still exists in the db
            if len(sents) > 1:
                dup_set.append(sents)
        self.total_sets = len(dup_set)
        self.log_report('OK '+str(self.total_sets)+' duplicate sets found')
        del sent_tally

        self.log_report('Running deduplication transactions on duplicate sets')
        # deduplicate
        for sents in dup_set:
            # determine main sentence based on priority rules
            main_sent = self.prioritize(sents)
            # NOTE(review): self.has_audio is presumably populated by
            # prioritize() as a side effect — confirm in that helper
            self.all_audio.extend(list(self.has_audio))
            self.all_mains.append(main_sent.id)
            # separate duplicates from main sentence
            sents.remove(main_sent)
            # filter out ids
            ids = [sent.id for sent in sents]
            self.all_dups.extend(ids)
            # run a deduplication transaction
            self.deduplicate(main_sent, ids, post_cmnt, dry)
            # display percentage progress
            self.proceeded_sets += 1
            self.update_dedup_progress()
            # handle rate limiting
            if pause_for:
                time.sleep(pause_for)
    else:
        self.log_report('Running full scan at '+self.started_on.strftime('%Y-%m-%d %I:%M %p UTC'))

        # pull in sentences from db in chunks to bound memory use; the
        # tally dict is threaded through and accumulates across chunks
        self.log_report('Running full table scan in '+str(chunks)+' queries')
        total = Sentences.objects.order_by('-id')[0].id
        sent_tally = defaultdict(set)
        for rng in self.chunked_ranges(chunks, total):
            sents = list(Sentences.objects.filter(id__range=rng))
            sents = [(int(sha1(sent.text).hexdigest(), 16), sent.lang, sent.id) for sent in sents]
            self.log_report('Running duplicate filtering on sentence range: '+ str(rng))
            sent_tally = self.tally(sents, sent_tally)
            self.log_report('OK')
            del sents
        self.total_sets = len(sent_tally)
        self.log_report('OK full table scan and filtering done '+str(self.total_sets)+' duplicate sets found')

        self.log_report('Running deduplication step')
        # deduplicate
        for ids in sent_tally.itervalues():
            # singleton groups are skipped, but still counted so the
            # progress percentage covers every tallied group
            process = len(ids) > 1
            if process:
                # pull in needed rows
                sents = list(Sentences.objects.filter(id__in=ids))
                main_sent = self.prioritize(sents)
                self.all_audio.extend(list(self.has_audio))
                self.all_mains.append(main_sent.id)
                # separate duplicates from main sent
                sents.remove(main_sent)
                ids.remove(main_sent.id)
                self.all_dups.extend(ids)
                # run a deduplication transaction
                self.deduplicate(main_sent, ids, post_cmnt, dry)
            self.proceeded_sets += 1
            # display percentage progress
            self.update_dedup_progress()
            if process:
                # handle rate limit
                if pause_for:
                    time.sleep(pause_for)

    self.log_report('OK '+str(len(self.all_dups))+' sentences merged into '+str(len(self.all_mains))+' sentences')

    # verification step: re-query the db and compare against the
    # accumulators; results are stored on self.ver_* for inspection
    self.log_report('Running verification step')

    # all audio should exist
    self.log_report('All audio intact? ')
    self.ver_audio = Sentences.objects.filter(id__in=self.all_mains, hasaudio__in=['shtooka', 'from_users']).count() == len(self.all_audio)
    msg = 'YES' if self.ver_audio else 'NO'
    self.log_report(msg)

    # all dups should be gone
    self.log_report('All duplicates removed? ')
    self.ver_dups = Sentences.objects.filter(id__in=self.all_dups).count() == 0
    msg = 'YES' if self.ver_dups else 'NO'
    self.log_report(msg)

    # all mains should exist
    self.log_report('All merged sentences intact? ')
    self.ver_mains = Sentences.objects.filter(id__in=self.all_mains).count() == len(self.all_mains)
    msg = 'YES' if self.ver_mains else 'NO'
    self.log_report(msg)

    # no links should refer to dups (checked on both link directions)
    self.log_report('Sentences are free from links referring to deleted duplicates? ')
    self.ver_links = SentencesTranslations.objects.filter(sentence_id__in=self.all_dups).count() == 0 and SentencesTranslations.objects.filter(translation_id__in=self.all_dups).count() == 0
    msg = 'YES' if self.ver_links else 'NO'
    self.log_report(msg)

    # refresh sentence numbers for languages
    if refresh:
        self.log_report('Refreshing language statistics')
        self.refresh_lang_stats(dry)

    self.log_report('Deduplication finished running successfully at '+now().strftime('%Y-%m-%d %I:%M %p UTC')+', see full log at:')
    self.log_report(url + path.split(self.log_file_path)[-1].replace(' ', '%20'))

    # post a wall report if needed; lft/rght place the post at the end
    # of the wall's nested-set tree
    if options.get('wall') and not dry:
        lft = Wall.objects.all().order_by('-rght')[0].rght + 1
        rght = lft + 1
        w = Wall(
            owner=self.bot.id, content=self.report.getvalue(),
            date=now(), modified=now(),
            title='', hidden=0,
            lft=lft, rght=rght
        )
        w.save()
        WallThreadsLastMessage(id=w.id, last_message_date=w.modified).save()
def sents(db, request):
    """Seed the test database with a fixed corpus of sentences plus the
    related rows (comments, links, tags, lists, favorites, annotations,
    contribution logs, one wall post).

    Ids are assigned by auto-increment, so insertion order matters; the
    comments below record the id range each group ends up with. When the
    suite runs against real MySQL, a finalizer is registered that wipes
    and resets every touched table.
    """
    # ids 1-4: plain sentences — no owner, no audio, default correctness
    jan1 = datetime(2014, 1, 1)
    Sentences(text='Normal, not duplicated.', lang='eng', modified=jan1).save()
    for _ in xrange(3):
        Sentences(text='Normal, duplicated.', lang='eng', modified=jan1).save()

    # ids 5-8: owned sentences (only one of the duplicates has an owner)
    jan2 = datetime(2014, 1, 2)
    Sentences(text='Has owner, not duplicated.', lang='eng', user_id=1,
              modified=jan2).save()
    for _ in xrange(2):
        Sentences(text='Has owner, duplicated.', lang='eng', modified=jan2).save()
    Sentences(text='Has owner, duplicated.', lang='eng', user_id=1,
              modified=jan2).save()

    # ids 9-12: sentences with audio (only one duplicate carries audio)
    jan3 = datetime(2014, 1, 3)
    Sentences(text='Has audio, not duplicated.', lang='eng', hasaudio='shtooka',
              modified=jan3).save()
    for _ in xrange(2):
        Sentences(text='Has audio, duplicated.', lang='eng', modified=jan3).save()
    Sentences(text='Has audio, duplicated.', lang='eng', hasaudio='shtooka',
              modified=jan3).save()

    # ids 13-16: sentences flagged incorrect (correctness -1)
    jan4 = datetime(2014, 1, 4)
    Sentences(text='Correctness -1, not duplicated.', lang='eng', correctness=-1,
              modified=jan4).save()
    for _ in xrange(2):
        Sentences(text='Correctness -1, duplicated.', lang='eng', modified=jan4).save()
    Sentences(text='Correctness -1, duplicated.', lang='eng', correctness=-1,
              modified=jan4).save()

    # ids 17-21: owner + audio + correctness combined; the duplicates
    # each carry exactly one of the distinguishing attributes
    jan5 = datetime(2014, 1, 5)
    Sentences(text='Has owner, Has audio, Correctness -1, not duplicated.',
              lang='eng', user_id=1, hasaudio='shtooka', correctness=-1,
              modified=jan5).save()
    dup_text = 'Has owner, Has audio, Correctness -1 duplicated.'
    for extra in ({}, {'user_id': 1}, {'hasaudio': 'shtooka'}, {'correctness': -1}):
        Sentences(text=dup_text, lang='eng', modified=jan5, **extra).save()

    # one comment each on sentences 6-8
    for sid in xrange(6, 8 + 1):
        SentenceComments(sentence_id=sid, text='Comment on ' + str(sid),
                         user_id=1, created=datetime.now(), hidden=0).save()

    # reciprocal translation links 6<->9 and 7<->10
    for sid, tid in [(6, 9), (9, 6), (7, 10), (10, 7)]:
        SentencesTranslations(sentence_id=sid, translation_id=tid, distance=1).save()

    # tags, list memberships and favorites on sentences 6-8
    for tag, sid in [(1, 6), (2, 7), (3, 8)]:
        TagsSentences(tag_id=tag, sentence_id=sid, user_id=1,
                      added_time=datetime.now()).save()
    for lst, sid in [(1, 6), (2, 7), (3, 8)]:
        SentencesSentencesLists(sentences_list_id=lst, sentence_id=sid).save()
    for uid, fav in [(1, 6), (2, 7), (3, 8)]:
        FavoritesUsers(user_id=uid, favorite_id=fav).save()

    # annotations on sentences 6-8 and 13-15
    for mid, sid in [(1, 6), (2, 7), (3, 8), (10, 13), (11, 14), (12, 15)]:
        SentenceAnnotations(meaning_id=mid, text='', modified=datetime.now(),
                            user_id=1, sentence_id=sid).save()

    # contribution log entries, including one with an unknown datetime
    Contributions(text='Logs for 6', action='update', user_id=1,
                  datetime=datetime.now(), type='sentence', sentence_id=6).save()
    Contributions(text='Logs for 6', action='insert', user_id=1,
                  datetime=datetime.now(), type='link', sentence_id=6,
                  translation_id=9).save()
    Contributions(text='Logs for 7', action='insert', user_id=1,
                  datetime=datetime.now(), type='sentence', sentence_id=7).save()
    Contributions(text='', action='insert', user_id=1,
                  datetime=datetime.now(), type='sentence', sentence_id=8).save()
    Contributions(text='Unknown datetime record', action='update', user_id=1,
                  datetime=None, type='sentence', sentence_id=8).save()

    # a single wall post
    Wall(owner=1, content='test post', date=datetime.utcnow(),
         title='', hidden=0, lft=1, rght=2).save()

    # against real MySQL, tear everything down so the next test starts
    # from empty tables with fresh auto-increment counters
    if request.config.option.mysql:
        def fin():
            connection = connections['default']

            def wipe(name):
                # delete via the ORM, then truncate and reset the counter
                model_cls = get_model('tatoeba2.' + name)
                model_cls.objects.all().delete()
                connection.cursor().execute('TRUNCATE TABLE ' + model_cls._meta.db_table + ';')
                connection.cursor().execute('ALTER TABLE ' + model_cls._meta.db_table + ' AUTO_INCREMENT = 1;')

            for name in ('Sentences', 'SentencesTranslations', 'SentenceComments',
                         'TagsSentences', 'SentencesSentencesLists', 'FavoritesUsers',
                         'Contributions', 'Users', 'Wall', 'SentenceAnnotations'):
                wipe(name)

        request.addfinalizer(fin)