def handle(self, *args, **options):
    """Regenerate the global white list tables, then every document's local tables."""
    verbose = int(options['verbosity']) >= 2
    # write new global white lists
    if verbose:
        self.stderr.write('Writing new global white lists...\n')
    writeWhiteListTables(GlobalWord.objects.order_by('untranslated'))
    # update local tables
    if verbose:
        self.stderr.write('Updating local tables...\n')
    writeLocalTables(Document.objects.all())
def confirm_conflicting_duplicates(request, grade, deferred=False):
    """Let the user resolve words that have conflicting braille translations.

    GET renders one ConflictingWordForm per conflicting
    (untranslated, type, homograph_disambiguation) triple, offering all
    competing braille translations.  POST saves each chosen translation
    as a GlobalWord, deletes the now-redundant LocalWord rows and
    regenerates the local tables of every affected document, then
    redirects to the grade-specific regular confirmation view.

    NOTE(review): `deferred` is unused here; the template context is
    built from locals(), so local variable names are part of the
    template contract -- existing names are kept unchanged.
    """
    WordFormSet = formset_factory(ConflictingWordForm, extra=0)
    if request.method == 'POST':
        formset = WordFormSet(request.POST)
        if formset.is_valid():
            affected_documents = set()
            # save the correct words in the GlobalWord
            # FIXME: in Django 1.3+ formsets are iterable, so you can just say
            # for form in formset:
            for form in formset.forms:
                # FIXME: This is an open attack vector. A user can
                # change any word in the global dict with a carefully
                # crafted post. It might be better not to pass the id.
                word = GlobalWord(grade=grade, **form.cleaned_data)
                word.save()
                # note which documents are affected
                filter_args = dict((k, form.cleaned_data[k]) for k in
                                   ('untranslated', 'type', 'homograph_disambiguation'))
                words_to_delete = LocalWord.objects.filter(grade=grade, **filter_args)
                affected_documents.update([word.document for word in words_to_delete])
                # delete the conflicting words (and also plain
                # duplicate non-conflicting words) from the LocalWords
                words_to_delete.delete()
            writeLocalTables(list(affected_documents))
            # once we are done dealing with conflicts we go back to regular confirmation
            redirect = 'dictionary_confirm_g1' if grade == 1 else 'dictionary_confirm_g2'
            return HttpResponseRedirect(reverse(redirect))
    else:
        conflicting_words = get_conflicting_words(grade)
        # map each conflicting key to the set of competing braille translations
        braille_choices = defaultdict(set)
        # plain dict: a missing id has no sensible default, lookups below use .get()
        # (was a factory-less defaultdict(), which behaves like a dict anyway)
        global_ids = {}
        for untranslated, type, homograph_disambiguation, braille, global_id in conflicting_words:
            key = (untranslated, type, homograph_disambiguation)
            braille_choices[key].add(braille)
            if global_id > 0:
                global_ids[key] = global_id
        initial = [
            {'id': global_ids.get((untranslated, type, homograph_disambiguation)),
             'untranslated': untranslated,
             'type': type,
             'homograph_disambiguation': homograph_disambiguation,
             'braille': sorted(braille_choices[(untranslated, type, homograph_disambiguation)]),
             }
            for untranslated, type, homograph_disambiguation in braille_choices.keys()]
        initial = sorted(initial, key=lambda x: x['untranslated'])
        # WordFormSet was already built identically above; no need to rebuild it here
        formset = WordFormSet(initial=initial)
    # also reached when a POSTed formset fails validation: re-render with errors
    return render_to_response('dictionary/confirm_conflicting_duplicates.html', locals(),
                              context_instance=RequestContext(request))
def local(request, document_id, grade):
    """Show and edit the LocalWords of a document for the given grade.

    GET renders a paginated, filterable modelformset over the
    document's LocalWords; POST saves the edits, regenerates the
    document's local tables and redirects back to the grade-specific
    list view.

    NOTE(review): the template context is built from locals(), so the
    local variable names in this function are part of the template
    contract.
    """
    document = get_object_or_404(Document, pk=document_id)
    if request.method == 'POST':
        WordFormSet = modelformset_factory(
            LocalWord, form=RestrictedWordForm,
            # document/grade are fixed by the view; confirmation flags are not editable here
            exclude=('document', 'isConfirmed', 'isDeferred', 'grade'), can_delete=True)
        formset = WordFormSet(request.POST,
                              queryset=LocalWord.objects.filter(grade=grade, document=document))
        if formset.is_valid():
            instances = formset.save()
            # regenerate the white list tables for this document
            writeLocalTables([document])
            redirect = 'dictionary_local_g1' if grade == 1 else 'dictionary_local_g2'
            return HttpResponseRedirect(reverse(redirect, args=[document_id]))
        else:
            # invalid formset: re-render the page with the validation errors
            return render_to_response('dictionary/local.html', locals(),
                                      context_instance=RequestContext(request))
    filterform = FilterForm(request.GET)
    if filterform.is_valid():
        currentFilter = filterform.cleaned_data['filter']
        # NOTE(review): words_list is only bound when the filter form
        # validates; presumably FilterForm always validates (optional
        # filter field) -- confirm, otherwise the Paginator below
        # raises NameError.
        words_list = LocalWord.objects.filter(
            grade=grade, document=document,
            untranslated__contains=currentFilter).order_by('untranslated', 'type')
    paginator = Paginator(words_list, MAX_WORDS_PER_PAGE)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        # non-numeric page parameter: fall back to the first page
        page = 1
    try:
        words = paginator.page(page)
    except InvalidPage:
        # page number out of range: fall back to the last page
        words = paginator.page(paginator.num_pages)
    WordFormSet = modelformset_factory(
        LocalWord, form=RestrictedWordForm,
        exclude=('document', 'isConfirmed', 'isDeferred', 'grade'), can_delete=True, extra=0)
    formset = WordFormSet(queryset=words.object_list)
    return render_to_response('dictionary/local.html', locals(),
                              context_instance=RequestContext(request))
def check(request, document_id, grade):
    """Extract the words of a document and let the user classify the unknown ones.

    GET runs the document's XML through XSLT filters, collects
    homographs, names, places and plain words, drops everything already
    known globally or locally, and renders a paginated formset of the
    unknown words (plus a DocumentStatistic record).  POST saves the
    submitted words as LocalWords of this document/grade, regenerates
    the local tables and redirects to the grade-specific check view.

    NOTE(review): the template context is built from locals(), so local
    variable names here are part of the template contract.
    """
    document = get_object_or_404(Document, pk=document_id)
    if request.method == 'POST':
        WordFormSet = modelformset_factory(
            LocalWord, form=RestrictedWordForm,
            # document/grade are assigned below; confirmation flags are not editable here
            exclude=('document', 'isConfirmed', 'isDeferred', 'grade'), can_delete=True)
        formset = WordFormSet(request.POST)
        if formset.is_valid():
            # commit=False so we can attach grade and document before saving
            instances = formset.save(commit=False)
            for instance in instances:
                instance.grade = grade
                instance.document = document
                instance.save()
            writeLocalTables([document])
            redirect = 'dictionary_check_g1' if grade == 1 else 'dictionary_check_g2'
            return HttpResponseRedirect(reverse(redirect, args=[document_id]))
        else:
            # invalid formset: re-render with validation errors
            return render(request, 'dictionary/words.html', locals())
    # filter some words from the xml
    content = document.latest_version().content
    content.open()
    # strip='none': if this parameter is not set, whitespace is removed
    # automatically for documents with a DOCTYPE declaration
    tree = etree.parse(
        saxon9he(content.file,
                 os.path.join(settings.PROJECT_DIR, 'dictionary', 'xslt', 'filter.xsl'),
                 '-strip:none', contraction=grade).stdout,
        parser=HUGE_TREE_PARSER)
    content.close()
    # grab the homographs
    homographs = set(("|".join(homograph.xpath('text()')).lower()
                      for homograph in tree.xpath('//brl:homograph', namespaces=BRL_NAMESPACE)))
    duplicate_homographs = set((smart_unicode(word) for word in
        chain(GlobalWord.objects.filter(grade=grade).filter(type=5)
                  .filter(homograph_disambiguation__in=homographs)
                  .values_list('homograph_disambiguation', flat=True),
              LocalWord.objects.filter(grade=grade).filter(type=5).filter(document=document)
                  .filter(homograph_disambiguation__in=homographs)
                  .values_list('homograph_disambiguation', flat=True))))
    unknown_homographs = [{'untranslated': homograph.replace('|', ''),
                           # U+250A marks the homograph boundary for the translator
                           'braille': translate(getTables(grade), homograph.replace('|', unichr(0x250A))),
                           'type': 5,
                           'homograph_disambiguation': homograph}
                          for homograph in homographs - duplicate_homographs]
    # grab names and places
    names = set((name for names in
                 (name.text.lower().split()
                  for name in tree.xpath('//brl:name', namespaces=BRL_NAMESPACE)
                  if name.text != None)
                 for name in names))
    duplicate_names = set((smart_unicode(word) for word in
        chain(GlobalWord.objects.filter(grade=grade).filter(type__in=(1,2))
                  .filter(untranslated__in=names).values_list('untranslated', flat=True),
              LocalWord.objects.filter(grade=grade).filter(type__in=(1,2)).filter(document=document)
                  .filter(untranslated__in=names).values_list('untranslated', flat=True))))
    unknown_names = [{'untranslated': name,
                      'braille': translate(getTables(grade, name=True), name),
                      'type': 2,
                      'homograph_disambiguation': ''}
                     for name in names - duplicate_names]
    places = set((place for places in
                  (place.text.lower().split()
                   for place in tree.xpath('//brl:place', namespaces=BRL_NAMESPACE)
                   if place.text != None)
                  for place in places))
    duplicate_places = set((smart_unicode(word) for word in
        chain(GlobalWord.objects.filter(grade=grade).filter(type__in=(3,4))
                  .filter(untranslated__in=places).values_list('untranslated', flat=True),
              LocalWord.objects.filter(grade=grade).filter(type__in=(3,4)).filter(document=document)
                  .filter(untranslated__in=places).values_list('untranslated', flat=True))))
    unknown_places = [{'untranslated': place,
                       'braille': translate(getTables(grade, place=True), place),
                       'type': 4,
                       'homograph_disambiguation': ''}
                      for place in places - duplicate_places]
    # filter homographs, names and places from the xml
    xsl = etree.parse(os.path.join(settings.PROJECT_DIR, 'dictionary', 'xslt', 'filter_names.xsl'),
                      parser=HUGE_TREE_PARSER)
    transform = etree.XSLT(xsl)
    filtered_tree = transform(tree)
    # grab the rest of the content
    content = etree.tostring(filtered_tree, method="text", encoding=unicode)
    # filter all punctuation and replace dashes by space, so we can split by space below
    content = ''.join(
        # replace Punctuation Dash and Punctuation Other (except for "'") with space
        c if c == u"\u0027" or unicodedata.category(c) not in ['Pd', 'Po'] else ' '
        for c in content
        # drop all chars which are not letters, separators or select
        # punctuation which we replace with space later on
        if unicodedata.category(c) in ['Lu', 'Ll', 'Zs', 'Zl', 'Zp', 'Pd', 'Po'] or c in ['\n', '\r'])
    new_words = set((w.lower() for w in content.split() if len(w) > 1))
    # FIXME: We basically do a set difference manually here. This
    # would probably be better if done inside the db. However for that
    # we would have to be able to insert the new_words into the db in
    # an efficient manner, i.e. bulk insert. For a possibility on how
    # to do this in the context of Django ORM look at
    # http://ole-laursen.blogspot.com/2010/11/bulk-inserting-django-objects.html.
    # After that we could for example do a query along the lines of
    # cursor.execute("SELECT untranslated from new_words EXCEPT SELECT
    # untranslated FROM dict_words;). However MySQL doesn't seem to
    # support EXCEPT so it would be SELECT untranslated FROM new_words
    # w1 LEFT JOIN dict_words w2 ON w1.untranslated=w2.untranslated
    # WHERE w2.untranslated IS NULL;
    duplicate_words = set((smart_unicode(word) for word in
        # exclude type 2,4 and 5 as these probably have a different
        # translations, so we do need to show these words if they are not
        # tagged even if they have an entry in the dictionary as a name or
        # a place.
        chain(GlobalWord.objects.filter(grade=grade).exclude(type__in=(2,4,5))
                  .filter(untranslated__in=new_words).values_list('untranslated', flat=True),
              LocalWord.objects.filter(grade=grade).exclude(type__in=(2,4,5)).filter(document=document)
                  .filter(untranslated__in=new_words).values_list('untranslated', flat=True))))
    unknown_words = [{'untranslated': word,
                      'braille': translate(getTables(grade), word),
                      'type': 0,
                      'homograph_disambiguation': ''}
                     for word in new_words - duplicate_words]
    unknown_words = unknown_words + unknown_homographs + unknown_names + unknown_places
    # case-insensitive ordering; key= is equivalent to the old cmp=-based sort
    unknown_words.sort(key=lambda x: x['untranslated'].lower())
    # remove words from the local words which are no longer in the document (they might have
    # been typos that slipped in to the local words and were corrected subsequently)
    all_duplicates = duplicate_homographs | duplicate_names | duplicate_places | duplicate_words
    LocalWord.objects.filter(grade=grade, document=document).exclude(untranslated__in=all_duplicates).delete()
    paginator = Paginator(unknown_words, MAX_WORDS_PER_PAGE)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        # non-numeric page parameter: fall back to the first page
        page = 1
    try:
        words = paginator.page(page)
    except InvalidPage:
        # page number out of range: fall back to the last page
        words = paginator.page(paginator.num_pages)
    WordFormSet = modelformset_factory(
        LocalWord, form=RestrictedWordForm,
        exclude=('document', 'isConfirmed', 'isDeferred', 'grade'),
        extra=len(words.object_list), can_delete=True)
    # flags for the template: show these columns only when they carry information
    have_type = any((word['type'] != 0 for word in words.object_list))
    have_homograph_disambiguation = any((word['homograph_disambiguation'] != '' for word in words.object_list))
    formset = WordFormSet(queryset=LocalWord.objects.none(), initial=words.object_list)
    # Document statistic
    stats = DocumentStatistic(document=document, grade=grade,
                              total=len(new_words), unknown=len(unknown_words))
    # guard against ZeroDivisionError for documents that yield no words at all
    percentage = 100.0 * stats.unknown / stats.total if stats.total else 0.0
    stats.save()
    return render(request, 'dictionary/words.html', locals())