Exemplo n.º 1
0
    def handle(self, *args, **options):
        """Run the sentence deduplication command.

        Performs either an incremental scan over sentences added since a
        given date (``--since``) or a full chunked table scan (``--chunks``),
        merges each set of duplicate sentences into one "main" sentence via
        ``self.deduplicate``, runs verification queries on the result,
        optionally refreshes per-language statistics, and optionally posts
        a summary report to the Wall.

        NOTE(review): Python 2 code (print statement, dict.itervalues).
        """

        # --chunks and --since select mutually exclusive scan modes.
        if options.get('chunks') and options.get('since'):
            print 'conflicting options...'
            return

        self.time_init()
        self.logger_init(options.get('path'))
        if options.get('verbose_out'): self.out_log.setLevel(logging.DEBUG)

        # Defaults: full scan in 10 chunks; no time filter unless supplied.
        chunks = options.get('chunks') or 10
        since = options.get('since')

        dry = bool(options.get('dry'))
        refresh = bool(options.get('refresh'))
        bot_name = options.get('bot_name') or 'Horus'
        # Ensure the bot user that owns the merges exists. On a dry run a
        # missing bot is deliberately left unset (nothing is written).
        try:
            Dedup.bot = Users.objects.get(username=bot_name)
        except Users.DoesNotExist:
            if not dry:
                Dedup.bot = Users.objects.create(
                    username=bot_name, password='', email='*****@*****.**',
                    since=now(), last_time_active=now().strftime('%Y%m%d'),
                    level=1, is_public=1, send_notifications=0, group_id=1
                    )

        # pause_for throttles between dedup transactions (rate limiting).
        pause_for = options.get('pause_for') or 0
        post_cmnt = bool(options.get('cmnt'))
        url = options.get('url') or 'http://downloads.tatoeba.org/'
        if url[-1] != '/': url += '/'

        # Accumulators consumed by the verification step further down.
        self.all_dups = []    # ids of duplicate sentences that get merged away
        self.all_mains = []   # ids of the sentences kept after each merge
        self.all_audio = []   # ids of merged-set members that carried audio

        self.proceeded_sets = 0
        self.prev_progress = -100

        # incremental vs full scan routes
        if since:
            self.log_report('Running incremental scan at '+self.started_on.strftime('%Y-%m-%d %I:%M %p UTC'))
            # parse date ('YYYY-MM-DD' expected) into an aware UTC datetime
            since = datetime(*[int(s) for s in since.split('-')]).replace(tzinfo=utc)
            # pull in rows from time range
            self.log_report('Running filter on sentences added since '+since.strftime('%Y-%m-%d %I:%M %p'))
            sents = list(Sentences.objects.filter(created__range=[since, now()]))
            # (text-hash, lang, id) triples: duplicates share the same hash+lang key
            sents = [(int(sha1(sent.text).hexdigest(), 16), sent.lang, sent.id) for sent in sents]
            self.log_report('OK filtered '+str(len(sents))+' sentences')
            
            # tally to eliminate premature duplicates
            sent_tally = self.tally(sents)
            del sents

            # filter out duplicates (could probably be done in 1 raw query...)
            self.log_report('Running filter on sentences to find duplicates')
            dup_set = []            
            for ids in sent_tally.itervalues():
                ids = list(ids)
                sents = list(Sentences.objects.filter(id__in=ids))            
                if len(sents) > 1:
                    dup_set.append(sents)
            self.total_sets = len(dup_set)
            self.log_report('OK '+str(self.total_sets)+' duplicate sets found')
            del sent_tally

            self.log_report('Running deduplication transactions on duplicate sets')
            # deduplicate
            for sents in dup_set:
                # determine main sentence based on priority rules
                # (prioritize also sets self.has_audio as a side effect)
                main_sent = self.prioritize(sents)
                self.all_audio.extend(list(self.has_audio))
                self.all_mains.append(main_sent.id)
                # separate duplicates from main sentence
                sents.remove(main_sent)
                # filter out ids
                ids = [sent.id for sent in sents]
                self.all_dups.extend(ids)
                # run a deduplication transaction
                self.deduplicate(main_sent, ids, post_cmnt, dry)
                # display percentage progress
                self.proceeded_sets += 1
                self.update_dedup_progress()
                # handle rate limiting
                if pause_for: time.sleep(pause_for)

        else:
            self.log_report('Running full scan at '+self.started_on.strftime('%Y-%m-%d %I:%M %p UTC'))
            # pull in sentences from db in chunks
            self.log_report('Running full table scan in '+str(chunks)+' queries')
            # highest sentence id bounds the chunked id ranges
            total = Sentences.objects.order_by('-id')[0].id
            sent_tally = defaultdict(set)
            for rng in self.chunked_ranges(chunks, total):
                sents = list(Sentences.objects.filter(id__range=rng))
                sents = [(int(sha1(sent.text).hexdigest(), 16), sent.lang, sent.id) for sent in sents]
                self.log_report('Running duplicate filtering on sentence range: '+ str(rng))
                # tally accumulates across chunks by passing the dict back in
                sent_tally = self.tally(sents, sent_tally)
                self.log_report('OK')
                del sents

            self.total_sets = len(sent_tally)
            self.log_report('OK full table scan and filtering done '+str(self.total_sets)+' duplicate sets found')

            self.log_report('Running deduplication step')
            # deduplicate
            for ids in sent_tally.itervalues():
                # sets with a single id are counted for progress but skipped
                process = len(ids) > 1
                if process:
                    # pull in needed rows
                    sents = list(Sentences.objects.filter(id__in=ids))
                    
                    main_sent = self.prioritize(sents)
                    self.all_audio.extend(list(self.has_audio))
                    self.all_mains.append(main_sent.id)

                    # separate duplicates from main sent
                    sents.remove(main_sent)
                    ids.remove(main_sent.id)
                    self.all_dups.extend(ids)
                    
                    # run a deduplication transaction
                    self.deduplicate(main_sent, ids, post_cmnt, dry)

                self.proceeded_sets += 1
                # display percentage progress
                self.update_dedup_progress()

                if process:
                    # handle rate limit
                    if pause_for: time.sleep(pause_for)

        self.log_report('OK '+str(len(self.all_dups))+' sentences merged into '+str(len(self.all_mains))+' sentences')
        
        # verification step: sanity-check the merges with count queries
        self.log_report('Running verification step')

        # all audio should exist
        self.log_report('All audio intact? ')
        self.ver_audio = Sentences.objects.filter(id__in=self.all_mains, hasaudio__in=['shtooka', 'from_users']).count() == len(self.all_audio)
        msg = 'YES' if self.ver_audio else 'NO'
        self.log_report(msg)

        # all dups should be gone
        self.log_report('All duplicates removed? ')
        self.ver_dups = Sentences.objects.filter(id__in=self.all_dups).count() == 0
        msg = 'YES' if self.ver_dups else 'NO'
        self.log_report(msg)

        # all mains should exist
        self.log_report('All merged sentences intact? ')
        self.ver_mains = Sentences.objects.filter(id__in=self.all_mains).count() == len(self.all_mains)
        msg = 'YES' if self.ver_mains else 'NO'
        self.log_report(msg)        

        # no links should refer to dups (checked in both link directions)
        self.log_report('Sentences are free from links referring to deleted duplicates? ')
        self.ver_links = SentencesTranslations.objects.filter(sentence_id__in=self.all_dups).count() == 0 and SentencesTranslations.objects.filter(translation_id__in=self.all_dups).count() == 0
        msg = 'YES' if self.ver_links else 'NO'
        self.log_report(msg)

        # refresh sentence numbers for languages
        if refresh:
            self.log_report('Refreshing language statistics')
            self.refresh_lang_stats(dry)
        
        # point readers of the report at the full log file (spaces URL-escaped)
        self.log_report('Deduplication finished running successfully at '+now().strftime('%Y-%m-%d %I:%M %p UTC')+', see full log at:')
        self.log_report(url + path.split(self.log_file_path)[-1].replace(' ', '%20'))
        
        # post a wall report if needed
        if options.get('wall') and not dry:
            # append as a new root-level wall thread: lft/rght continue the
            # existing nested-set numbering (presumably MPTT-style — verify)
            lft = Wall.objects.all().order_by('-rght')[0].rght + 1
            rght = lft + 1
            w = Wall(
                owner=self.bot.id,
                content=self.report.getvalue(),
                date=now(), modified=now(),
                title='', hidden=0,
                lft=lft, rght=rght
                )
            w.save()
            WallThreadsLastMessage(id=w.id, last_message_date=w.modified).save()
Exemplo n.º 2
0
def sents(db, request):
    """Pytest fixture: seed the test database with sentence fixtures.

    Inserts groups of sentences exercising each deduplication priority rule
    (plain, owned, with audio, correctness -1, and combinations), plus
    related rows (comments, links, tags, lists, favorites, annotations,
    contribution logs, a wall post).

    WARNING: later rows reference sentences by hard-coded auto-increment
    ids (the numeric ranges in the section comments, e.g. "5-8") — the
    insertion order of the ``save()`` calls below must not change.

    When run against real MySQL (``--mysql``), registers a finalizer that
    truncates every touched table and resets its AUTO_INCREMENT so the
    next test starts from id 1 again.
    """

    # no owner, no audio, no correctness 1-4
    Sentences(text='Normal, not duplicated.',
              lang='eng',
              modified=datetime(2014, 1, 1)).save()
    for i in xrange(3):
        Sentences(text='Normal, duplicated.',
                  lang='eng',
                  modified=datetime(2014, 1, 1)).save()

    # has owner 5-8
    Sentences(text='Has owner, not duplicated.',
              lang='eng',
              user_id=1,
              modified=datetime(2014, 1, 2)).save()
    for i in xrange(2):
        Sentences(text='Has owner, duplicated.',
                  lang='eng',
                  modified=datetime(2014, 1, 2)).save()
    # only the last copy (id 8) is owned — dedup should keep this one
    Sentences(text='Has owner, duplicated.',
              lang='eng',
              user_id=1,
              modified=datetime(2014, 1, 2)).save()

    # has audio 9-12
    Sentences(text='Has audio, not duplicated.',
              lang='eng',
              hasaudio='shtooka',
              modified=datetime(2014, 1, 3)).save()
    for i in xrange(2):
        Sentences(text='Has audio, duplicated.',
                  lang='eng',
                  modified=datetime(2014, 1, 3)).save()
    # only the last copy (id 12) has audio
    Sentences(text='Has audio, duplicated.',
              lang='eng',
              hasaudio='shtooka',
              modified=datetime(2014, 1, 3)).save()

    # correctness -1  13-16
    Sentences(text='Correctness -1, not duplicated.',
              lang='eng',
              correctness=-1,
              modified=datetime(2014, 1, 4)).save()
    for i in xrange(2):
        Sentences(text='Correctness -1, duplicated.',
                  lang='eng',
                  modified=datetime(2014, 1, 4)).save()
    # only the last copy (id 16) is flagged correctness -1
    Sentences(text='Correctness -1, duplicated.',
              lang='eng',
              correctness=-1,
              modified=datetime(2014, 1, 4)).save()

    # has owner, has audio, correctness -1  17-21
    Sentences(text='Has owner, Has audio, Correctness -1, not duplicated.',
              lang='eng',
              user_id=1,
              hasaudio='shtooka',
              correctness=-1,
              modified=datetime(2014, 1, 5)).save()
    # duplicate set 18-21 mixes the attributes one per copy
    Sentences(text='Has owner, Has audio, Correctness -1 duplicated.',
              lang='eng',
              modified=datetime(2014, 1, 5)).save()
    Sentences(text='Has owner, Has audio, Correctness -1 duplicated.',
              lang='eng',
              user_id=1,
              modified=datetime(2014, 1, 5)).save()
    Sentences(text='Has owner, Has audio, Correctness -1 duplicated.',
              lang='eng',
              hasaudio='shtooka',
              modified=datetime(2014, 1, 5)).save()
    Sentences(text='Has owner, Has audio, Correctness -1 duplicated.',
              lang='eng',
              correctness=-1,
              modified=datetime(2014, 1, 5)).save()

    # one comment on each of sentences 6..8
    for i in xrange(6, 8 + 1):
        SentenceComments(sentence_id=i,
                         text='Comment on ' + str(i),
                         user_id=1,
                         created=datetime.now(),
                         hidden=0).save()

    # reciprocal translation links between 6<->9 and 7<->10
    SentencesTranslations(sentence_id=6, translation_id=9, distance=1).save()
    SentencesTranslations(sentence_id=9, translation_id=6, distance=1).save()
    SentencesTranslations(sentence_id=7, translation_id=10, distance=1).save()
    SentencesTranslations(sentence_id=10, translation_id=7, distance=1).save()

    # tags, list memberships and favorites on sentences 6..8
    TagsSentences(tag_id=1,
                  sentence_id=6,
                  user_id=1,
                  added_time=datetime.now()).save()
    TagsSentences(tag_id=2,
                  sentence_id=7,
                  user_id=1,
                  added_time=datetime.now()).save()
    TagsSentences(tag_id=3,
                  sentence_id=8,
                  user_id=1,
                  added_time=datetime.now()).save()

    SentencesSentencesLists(sentences_list_id=1, sentence_id=6).save()
    SentencesSentencesLists(sentences_list_id=2, sentence_id=7).save()
    SentencesSentencesLists(sentences_list_id=3, sentence_id=8).save()

    FavoritesUsers(user_id=1, favorite_id=6).save()
    FavoritesUsers(user_id=2, favorite_id=7).save()
    FavoritesUsers(user_id=3, favorite_id=8).save()

    # annotations on sentences 6..8 and 13..15
    SentenceAnnotations(meaning_id=1,
                        text='',
                        modified=datetime.now(),
                        user_id=1,
                        sentence_id=6).save()
    SentenceAnnotations(meaning_id=2,
                        text='',
                        modified=datetime.now(),
                        user_id=1,
                        sentence_id=7).save()
    SentenceAnnotations(meaning_id=3,
                        text='',
                        modified=datetime.now(),
                        user_id=1,
                        sentence_id=8).save()

    SentenceAnnotations(meaning_id=10,
                        text='',
                        modified=datetime.now(),
                        user_id=1,
                        sentence_id=13).save()
    SentenceAnnotations(meaning_id=11,
                        text='',
                        modified=datetime.now(),
                        user_id=1,
                        sentence_id=14).save()
    SentenceAnnotations(meaning_id=12,
                        text='',
                        modified=datetime.now(),
                        user_id=1,
                        sentence_id=15).save()

    # contribution log rows, including an empty-text and a NULL-datetime
    # record to exercise edge cases in log handling
    Contributions(text='Logs for 6',
                  action='update',
                  user_id=1,
                  datetime=datetime.now(),
                  type='sentence',
                  sentence_id=6).save()
    Contributions(text='Logs for 6',
                  action='insert',
                  user_id=1,
                  datetime=datetime.now(),
                  type='link',
                  sentence_id=6,
                  translation_id=9).save()
    Contributions(text='Logs for 7',
                  action='insert',
                  user_id=1,
                  datetime=datetime.now(),
                  type='sentence',
                  sentence_id=7).save()
    Contributions(text='',
                  action='insert',
                  user_id=1,
                  datetime=datetime.now(),
                  type='sentence',
                  sentence_id=8).save()
    Contributions(text='Unknown datetime record',
                  action='update',
                  user_id=1,
                  datetime=None,
                  type='sentence',
                  sentence_id=8).save()

    # one existing wall post so nested-set numbering starts at lft=1,rght=2
    Wall(owner=1,
         content='test post',
         date=datetime.utcnow(),
         title='',
         hidden=0,
         lft=1,
         rght=2).save()

    if request.config.option.mysql:

        def fin():
            # teardown for real-MySQL runs: wipe rows and reset id counters
            conn = connections['default']

            def clean_up(model):
                Model = get_model('tatoeba2.' + model)
                Model.objects.all().delete()
                # TRUNCATE after delete so AUTO_INCREMENT resets to 1
                conn.cursor().execute('TRUNCATE TABLE ' +
                                      Model._meta.db_table + ';')
                conn.cursor().execute('ALTER TABLE ' + Model._meta.db_table +
                                      ' AUTO_INCREMENT = 1;')

            clean_up('Sentences')
            clean_up('SentencesTranslations')
            clean_up('SentenceComments')
            clean_up('TagsSentences')
            clean_up('SentencesSentencesLists')
            clean_up('FavoritesUsers')
            clean_up('Contributions')
            clean_up('Users')
            clean_up('Wall')
            clean_up('SentenceAnnotations')

        request.addfinalizer(fin)