def check(request, document_id, grade):
    """List (GET) or save (POST) the words of a document that are unknown.

    On POST the submitted word formset is validated; every saved instance
    gets ``grade`` and the document assigned, the local tables are
    rewritten and the browser is redirected back to this view.  An invalid
    formset is re-rendered.

    On GET the latest version of the document is filtered through
    ``filter.xsl``; homographs, names, places and the remaining plain words
    are extracted and everything that has no matching ``GlobalWord`` or
    ``LocalWord`` (for this document) is offered in a paginated formset.
    Local words of the document that no longer match anything found are
    deleted and a ``DocumentStatistic`` row is saved.

    NOTE: the template context is ``locals()``, so the names of the local
    variables in this function are visible to the template.  Do not rename
    them without checking ``dictionary/words.html``.

    :param request: the current HTTP request
    :param document_id: primary key of the ``Document`` (404 if missing)
    :param grade: contraction grade, passed on to the XSLT filter, the
        translation tables and the word queries (presumably 1 or 2)
    :returns: a redirect after a successful POST, the rendered
        ``dictionary/words.html`` otherwise
    """
    document = get_object_or_404(Document, pk=document_id)

    if request.method == 'POST':
        WordFormSet = modelformset_factory(
            LocalWord,
            form=RestrictedWordForm,
            exclude=('document', 'isConfirmed', 'isDeferred', 'grade'),
            can_delete=True)

        formset = WordFormSet(request.POST)
        if formset.is_valid():
            instances = formset.save(commit=False)
            for instance in instances:
                instance.grade = grade
                instance.document = document
                instance.save()
            writeLocalTables([document])
            redirect = 'dictionary_check_g1' if grade == 1 else 'dictionary_check_g2'
            return HttpResponseRedirect(reverse(redirect, args=[document_id]))
        else:
            return render(request, 'dictionary/words.html', locals())

    # filter some words from the xml
    content = document.latest_version().content
    content.open()
    try:
        # strip='none': if this parameter is not set, whitespace is removed
        # automatically for documents with a DOCTYPE declaration
        tree = etree.parse(
            saxon9he(
                content.file,
                os.path.join(settings.PROJECT_DIR, 'dictionary', 'xslt', 'filter.xsl'),
                '-strip:none',
                contraction=grade).stdout,
            parser=HUGE_TREE_PARSER)
    finally:
        # make sure the file is released even if the transformation or the
        # parsing fails
        content.close()

    # grab the homographs
    homographs = set(
        ("|".join(homograph.xpath('text()')).lower()
         for homograph in tree.xpath('//brl:homograph', namespaces=BRL_NAMESPACE)))
    duplicate_homographs = set(
        (smart_unicode(word) for word in
         chain(
             GlobalWord.objects.filter(grade=grade).filter(type=5)
             .filter(homograph_disambiguation__in=homographs)
             .values_list('homograph_disambiguation', flat=True),
             LocalWord.objects.filter(grade=grade).filter(type=5)
             .filter(document=document)
             .filter(homograph_disambiguation__in=homographs)
             .values_list('homograph_disambiguation', flat=True))))
    unknown_homographs = [
        {'untranslated': homograph.replace('|', ''),
         'braille': translate(getTables(grade), homograph.replace('|', unichr(0x250A))),
         'type': 5,
         'homograph_disambiguation': homograph}
        for homograph in homographs - duplicate_homographs]

    # grab names and places
    names = set(
        (name for names in
         (name.text.lower().split()
          for name in tree.xpath('//brl:name', namespaces=BRL_NAMESPACE)
          if name.text is not None)
         for name in names))
    duplicate_names = set(
        (smart_unicode(word) for word in
         chain(
             GlobalWord.objects.filter(grade=grade).filter(type__in=(1,2))
             .filter(untranslated__in=names)
             .values_list('untranslated', flat=True),
             LocalWord.objects.filter(grade=grade).filter(type__in=(1,2))
             .filter(document=document)
             .filter(untranslated__in=names)
             .values_list('untranslated', flat=True))))
    unknown_names = [
        {'untranslated': name,
         'braille': translate(getTables(grade, name=True), name),
         'type': 2,
         'homograph_disambiguation': ''}
        for name in names - duplicate_names]

    places = set(
        (place for places in
         (place.text.lower().split()
          for place in tree.xpath('//brl:place', namespaces=BRL_NAMESPACE)
          if place.text is not None)
         for place in places))
    duplicate_places = set(
        (smart_unicode(word) for word in
         chain(
             GlobalWord.objects.filter(grade=grade).filter(type__in=(3,4))
             .filter(untranslated__in=places)
             .values_list('untranslated', flat=True),
             LocalWord.objects.filter(grade=grade).filter(type__in=(3,4))
             .filter(document=document)
             .filter(untranslated__in=places)
             .values_list('untranslated', flat=True))))
    unknown_places = [
        {'untranslated': place,
         'braille': translate(getTables(grade, place=True), place),
         'type': 4,
         'homograph_disambiguation': ''}
        for place in places - duplicate_places]

    # filter homographs, names and places from the xml
    xsl = etree.parse(
        os.path.join(settings.PROJECT_DIR, 'dictionary', 'xslt', 'filter_names.xsl'),
        parser=HUGE_TREE_PARSER)
    transform = etree.XSLT(xsl)
    filtered_tree = transform(tree)
    # grab the rest of the content
    content = etree.tostring(filtered_tree, method="text", encoding=unicode)
    # filter all punctuation and replace dashes by space, so we can split by space below
    content = ''.join(
        # replace Punctuation Dash and Punctuation other (except for "'") with space
        c if c == u"\u0027" or unicodedata.category(c) not in ['Pd', 'Po'] else ' '
        for c in content
        # drop all chars which are not letters, separators or select
        # punctuation which we replace with space later on
        if unicodedata.category(c) in ['Lu', 'Ll', 'Zs', 'Zl', 'Zp', 'Pd', 'Po']
        or c in ['\n', '\r'])

    new_words = set((w.lower() for w in content.split() if len(w) > 1))
    # FIXME: We basically do a set difference manually here. This
    # would probably be better if done inside the db. However for that
    # we would have to be able to insert the new_words into the db in
    # an efficient manner, i.e. bulk insert. For a possibility on how
    # to do this in the context of Django ORM look at
    # http://ole-laursen.blogspot.com/2010/11/bulk-inserting-django-objects.html.
    # After that we could for example do a query along the lines of
    # cursor.execute("SELECT untranslated from new_words EXCEPT SELECT
    # untranslated FROM dict_words;). However MySQL doesn't seem to
    # support EXCEPT so it would be SELECT untranslated FROM new_words
    # w1 LEFT JOIN dict_words w2 ON w1.untranslated=w2.untranslated
    # WHERE w2.untranslated IS NULL;
    duplicate_words = set(
        (smart_unicode(word) for word in
         # exclude type 2,4 and 5 as these probably have a different
         # translations, so we do need to show these words if they are not
         # tagged even if they have an entry in the dictionary as a name or
         # a place.
         chain(
             GlobalWord.objects.filter(grade=grade).exclude(type__in=(2,4,5))
             .filter(untranslated__in=new_words)
             .values_list('untranslated', flat=True),
             LocalWord.objects.filter(grade=grade).exclude(type__in=(2,4,5))
             .filter(document=document)
             .filter(untranslated__in=new_words)
             .values_list('untranslated', flat=True))))
    unknown_words = [
        {'untranslated': word,
         'braille': translate(getTables(grade), word),
         'type' : 0,
         'homograph_disambiguation': ''}
        for word in new_words - duplicate_words]

    unknown_words = unknown_words + unknown_homographs + unknown_names + unknown_places
    # case-insensitive ordering by the untranslated word
    unknown_words.sort(key=lambda entry: entry['untranslated'].lower())

    # remove words from the local words which are no longer in the document (they might have
    # been typos that slipped in to the local words and were corrected subsequently)
    # NOTE(review): duplicate_homographs holds homograph_disambiguation
    # values (containing '|') while the query below compares against
    # `untranslated` -- confirm that local homograph entries are not
    # removed unintentionally here.
    all_duplicates = duplicate_homographs | duplicate_names | duplicate_places | duplicate_words
    LocalWord.objects.filter(grade=grade, document=document).exclude(untranslated__in=all_duplicates).delete()

    paginator = Paginator(unknown_words, MAX_WORDS_PER_PAGE)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1

    try:
        words = paginator.page(page)
    except InvalidPage:
        # out of range: fall back to the last page
        words = paginator.page(paginator.num_pages)

    WordFormSet = modelformset_factory(
        LocalWord,
        form=RestrictedWordForm,
        exclude=('document', 'isConfirmed', 'isDeferred', 'grade'),
        extra=len(words.object_list),
        can_delete=True)

    have_type = any((word['type']!=0 for word in words.object_list))
    have_homograph_disambiguation = any((word['homograph_disambiguation']!='' for word in words.object_list))
    formset = WordFormSet(queryset=LocalWord.objects.none(), initial=words.object_list)

    # Document statistic
    stats = DocumentStatistic(document=document, grade=grade, total=len(new_words), unknown=len(unknown_words))
    # a document without any plain words has total == 0; avoid dividing by zero
    if stats.total:
        percentage = 100.0*stats.unknown/stats.total
    else:
        percentage = 0.0
    stats.save()

    return render(request, 'dictionary/words.html', locals())
def _missing_braille_initial(word):
    """Return the formset initial data for a global word known in one grade only.

    The braille of the existing grade is taken from the word itself, the
    braille of the other grade is proposed by translating the untranslated
    word with the tables of that grade.
    """
    untranslated = smart_unicode(word.untranslated)
    if word.grade == 1:
        grade1 = smart_unicode(word.braille)
    else:
        grade1 = translate(getTables(1), untranslated)
    if word.grade == 2:
        grade2 = smart_unicode(word.braille)
    else:
        grade2 = translate(getTables(2), untranslated)
    return {'untranslated': untranslated,
            'original_grade': word.grade,
            'grade1': grade1,
            'grade2': grade2,
            'type' : word.type,
            'homograph_disambiguation': smart_unicode(word.homograph_disambiguation)}


def edit_global_words_with_missing_braille(request):
    """Edit global words that have braille for only one of the two grades.

    POST: validate the formset and create, for every form, the
    ``GlobalWord`` of the grade that was missing (grade 2 when the original
    grade is 1, grade 1 otherwise), then redirect back to this view.  An
    invalid formset is re-rendered.

    GET: select all global words without a counterpart in another grade,
    propose a translation for the missing grade and show them in a
    paginated formset.

    NOTE: the template context is ``locals()``, so the local variable names
    of this function are visible to the template.
    """
    WordFormSet = formset_factory(GlobalWordBothGradesForm, extra=0)

    if request.method == 'POST':
        formset = WordFormSet(request.POST)
        if not formset.is_valid():
            return render(request, 'dictionary/edit_missing_globals.html', locals())

        for form in formset.forms:
            cleaned = form.cleaned_data
            # store the braille of the grade the word did not have yet
            if cleaned['original_grade'] == 1:
                missing_grade = 2
                missing_braille = cleaned['grade2']
            else:
                missing_grade = 1
                missing_braille = cleaned['grade1']
            GlobalWord.objects.create(
                untranslated=cleaned['untranslated'],
                braille=missing_braille,
                grade=missing_grade,
                type=cleaned['type'],
                homograph_disambiguation=cleaned['homograph_disambiguation'])
        return HttpResponseRedirect(reverse('dictionary_edit_global_words_with_missing_braille'))

    # words for which no row with the same untranslated/type/homograph
    # disambiguation exists in a different grade
    WORDS_WITH_MISSING_BRAILLE = """ SELECT l.* FROM dictionary_globalword AS l WHERE NOT EXISTS ( SELECT NULL FROM dictionary_globalword AS r WHERE l.untranslated = r.untranslated AND l.type = r.type AND l.homograph_disambiguation = r.homograph_disambiguation AND l.grade != r.grade ) ORDER BY l.untranslated """
    single_grade_words = GlobalWord.objects.raw(WORDS_WITH_MISSING_BRAILLE)
    missing_words = [_missing_braille_initial(word) for word in single_grade_words]

    paginator = Paginator(missing_words, MAX_WORDS_PER_PAGE)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1

    try:
        words = paginator.page(page)
    except InvalidPage:
        # out of range: fall back to the last page
        words = paginator.page(paginator.num_pages)

    formset = WordFormSet(initial=words.object_list)
    return render(request, 'dictionary/edit_missing_globals.html', locals())
def confirm(request, grade, deferred=False):
    """Confirm (or, with ``deferred``, confirm deferred) local words of a grade.

    If ``get_conflicting_words(grade)`` yields anything the user is
    redirected to the matching "conflicting duplicates" view first.

    POST: validate the formset, pass every form to ``update_word_tables``
    and redirect back to this view (there might be more words).  An invalid
    formset is re-rendered.

    GET: make sure every unconfirmed homograph has a default (type 0)
    entry, then show the unconfirmed local words of documents in the final
    state, optionally restricted by the ``filter`` GET parameter, in a
    paginated formset.

    NOTE: the template context is ``locals()``, so the local variable names
    of this function are visible to the template.

    :param request: the current HTTP request
    :param grade: contraction grade; appended to the URL names via ``str``
    :param deferred: handle deferred words instead of regular ones
    :returns: a redirect or the rendered ``dictionary/confirm.html``
    """
    if list(get_conflicting_words(grade)):
        redirect = ('dictionary_confirm_deferred_conflicting_duplicates_g' if deferred
                    else 'dictionary_confirm_conflicting_duplicates_g') + str(grade)
        return HttpResponseRedirect(reverse(redirect))

    WordFormSet = formset_factory(ConfirmDeferredWordForm if deferred else ConfirmWordForm, extra=0)
    if request.method == 'POST':
        formset = WordFormSet(request.POST)
        if formset.is_valid():
            # FIXME: in Django 1.3+ formsets are iterable, so you can just say
            # for form in formset:
            for form in formset.forms:
                update_word_tables(form, grade, deferred)
            # FIXME: in principle we need to regenerate the liblouis tables,
            # i.e. the white lists now. However we do this asynchronously
            # (using a cron job) for now. There are several reasons for this:
            # 1) It is slow as hell if done inside a transaction. To do this
            # outside the transaction we need transaction context managers
            # (https://docs.djangoproject.com/en/1.3/topics/db/transactions/#controlling-transaction-management-in-views)
            # which are only available in Django 1.3.
            # 2) We need to serialize the table writing so they do not write
            # on top of each other. This is easy if it is done periodically.
            # 3) Of course it would be nice to use some kind of message queue
            # for this (e.g. rabbitmq and celery), but for now this poor mans
            # solution seems good enough
            # redirect to self as there might be more words
            redirect = ('dictionary_confirm_deferred_g' if deferred else 'dictionary_confirm_g') + str(grade)
            return HttpResponseRedirect(reverse(redirect))
        else:
            return render(request, 'dictionary/confirm.html', locals())

    # create a default for all unconfirmed homographs which have no default, i.e.
    # no restriction word entry
    unconfirmed_homographs = set(
        (smart_unicode(word) for word in
         LocalWord.objects.filter(
             grade=grade, type=5, isConfirmed=False, isDeferred=deferred,
             document__state__sort_order=final_sort_order)
         .values_list('untranslated', flat=True)))
    if unconfirmed_homographs:
        covered_entries = set(
            (smart_unicode(word) for word in
             chain(
                 LocalWord.objects.filter(
                     grade=grade, type=0, untranslated__in=unconfirmed_homographs)
                 .values_list('untranslated', flat=True),
                 GlobalWord.objects.filter(
                     grade=grade, type=0, untranslated__in=unconfirmed_homographs)
                 .values_list('untranslated', flat=True))))

        for word in unconfirmed_homographs - covered_entries:
            # attach the default entry to the first document that contains
            # the unconfirmed homograph
            document = Document.objects.filter(
                localword__grade=grade, localword__type=5,
                localword__isConfirmed=False, localword__untranslated=word)[0]
            w = LocalWord(untranslated=word, braille=translate(getTables(grade), word),
                          grade=grade, type=0, document=document)
            w.save()

    filterform = FilterForm(request.GET)
    if filterform.is_valid():
        currentFilter = filterform.cleaned_data['filter']
    else:
        # an invalid filter form used to leave currentFilter undefined and
        # crash the query below; fall back to "no filter"
        currentFilter = ''

    words_to_confirm = LocalWord.objects.filter(
        grade=grade, isConfirmed=False, isDeferred=deferred,
        untranslated__contains=currentFilter,
        document__state__sort_order=final_sort_order
    ).order_by('untranslated', 'type').values(
        'untranslated', 'braille', 'type', 'homograph_disambiguation', 'isLocal'
    ).distinct()

    paginator = Paginator(words_to_confirm, MAX_WORDS_PER_PAGE)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1

    try:
        words = paginator.page(page)
    except InvalidPage:
        # out of range: fall back to the last page
        words = paginator.page(paginator.num_pages)

    have_type = any((word['type']!=0 for word in words.object_list))
    have_homograph_disambiguation = any((word['homograph_disambiguation']!='' for word in words.object_list))
    formset = WordFormSet(initial=words.object_list)
    return render(request, 'dictionary/confirm.html', locals())