def process(self):
    document_import.delay(
        import_pk=self.instance.pk,
        user_pk=self.user.pk,
        report_label=_('Import in %(document_name)s') % {'document_name': self.document.name})
    send_event('document', self.document.pk, "import:queued", {"id": self.document.pk})
def update_client_state(part_id, task, status, task_id=None, data=None):
    DocumentPart = apps.get_model('core', 'DocumentPart')
    part = DocumentPart.objects.get(pk=part_id)
    task_name = task.split('.')[-1]  # e.g. 'core.tasks.segment' -> 'segment'
    send_event('document', part.document.pk, "part:workflow", {
        "id": part.pk,
        "process": task_name,
        "status": status,
        "task_id": task_id,
        "data": data or {}
    })
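# A minimal wiring sketch (not part of the original module) showing how
# update_client_state could be hooked to Celery's task_prerun signal.
# The 'instance_pk' kwarg name and the 'ongoing' status string are assumptions.
from celery.signals import task_prerun

@task_prerun.connect
def on_task_prerun(sender=None, task_id=None, task=None, args=None, kwargs=None, **kw):
    part_pk = (kwargs or {}).get('instance_pk')
    if part_pk:
        update_client_state(part_pk, task.name, 'ongoing', task_id=task_id)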
def _print_eval(epoch=0, accuracy=0, chars=0, error=0, val_metric=0):
    # Epoch-end callback; closes over model, model_dir and document
    # from the enclosing training function.
    model.refresh_from_db()
    model.training_epoch = epoch
    model.training_accuracy = accuracy
    model.training_total = int(chars)
    model.training_errors = error
    relpath = os.path.relpath(model_dir, settings.MEDIA_ROOT)
    model.new_version(file=f'{relpath}/version_{epoch}.mlmodel')
    model.save()
    send_event('document', document.pk, "training:eval", {
        "id": model.pk,
        'versions': model.versions,
        'epoch': epoch,
        'accuracy': accuracy,
        'chars': int(chars),
        'error': error})
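# Hedged usage note: this callback is presumably handed to the kraken
# recognition trainer the same way the segmentation variant in segtrain
# below is, e.g. (trainer construction elided in this excerpt):
#
#     trainer.run(_print_eval)
#
# so each epoch persists a new model version and pushes a "training:eval"
# websocket event with the character and error counts.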
def recalculate_masks(instance_pk=None, user_pk=None, only=None, **kwargs):
    if user_pk:
        try:
            user = User.objects.get(pk=user_pk)
            # If quotas are enforced, assert that the user still has free CPU minutes
            if not settings.DISABLE_QUOTAS and user.cpu_minutes_limit() is not None:
                assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"
        except User.DoesNotExist:
            user = None

    DocumentPart = apps.get_model('core', 'DocumentPart')
    try:
        part = DocumentPart.objects.get(pk=instance_pk)
    except DocumentPart.DoesNotExist:
        logger.error('Trying to recalculate masks of nonexistent DocumentPart: %d', instance_pk)
        return

    result = part.make_masks(only=only)
    send_event('document', part.document.pk, "part:mask", {
        "id": part.pk,
        "lines": [{'pk': line.pk, 'mask': line.mask} for line in result]
    })
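# Hedged usage sketch: task decorators are omitted in this excerpt, but
# assuming recalculate_masks is registered with Celery like document_import
# above, it would be queued asynchronously:
def queue_mask_recalculation(part, user, line_pks=None):
    # Hypothetical helper, mirroring the .delay() pattern used in process().
    recalculate_masks.delay(instance_pk=part.pk, user_pk=user.pk, only=line_pks)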
def train(task, transcription_pk, model_pk=None, part_pks=None, user_pk=None, **kwargs):
    if user_pk:
        try:
            user = User.objects.get(pk=user_pk)
            # If quotas are enforced, assert that the user still has free
            # CPU minutes, GPU minutes and disk storage
            if not settings.DISABLE_QUOTAS:
                if user.cpu_minutes_limit() is not None:
                    assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"
                if user.gpu_minutes_limit() is not None:
                    assert user.has_free_gpu_minutes(), f"User {user.id} doesn't have any GPU minutes left"
                if user.disk_storage_limit() is not None:
                    assert user.has_free_disk_storage(), f"User {user.id} doesn't have any disk storage left"
        except User.DoesNotExist:
            user = None
    else:
        user = None

    redis_.set('training-%d' % model_pk, json.dumps({'task_id': task.request.id}))

    Transcription = apps.get_model('core', 'Transcription')
    LineTranscription = apps.get_model('core', 'LineTranscription')
    OcrModel = apps.get_model('core', 'OcrModel')

    # Fetch these before the try block so the error and cleanup handlers
    # below can reference them safely.
    model = OcrModel.objects.get(pk=model_pk)
    transcription = Transcription.objects.get(pk=transcription_pk)
    document = transcription.document

    try:
        model.training = True
        model.save()
        send_event('document', document.pk, "training:start", {"id": model.pk})

        qs = (LineTranscription.objects
              .filter(transcription=transcription,
                      line__document_part__pk__in=part_pks)
              .exclude(Q(content='') | Q(content=None)))
        train_(qs, document, transcription, model=model, user=user)
    except Exception as e:
        send_event('document', document.pk, "training:error", {"id": model.pk})
        if user:
            user.notify(_("Something went wrong during the training process!"),
                        id="training-error", level='danger')
        logger.exception(e)
    else:
        if user:
            user.notify(_("Training finished!"),
                        id="training-success", level='success')
    finally:
        model.training = False
        model.file_size = model.file.size
        model.save()
        send_event('document', document.pk, "training:done", {"id": model.pk})
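# The quota assertions above are repeated in train, segtrain, document_export
# and document_import. A hypothetical helper (not in the original module)
# could centralize the pattern:
def assert_free_quotas(user, cpu=False, gpu=False, disk=False):
    if settings.DISABLE_QUOTAS or user is None:
        return
    if cpu and user.cpu_minutes_limit() is not None:
        assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"
    if gpu and user.gpu_minutes_limit() is not None:
        assert user.has_free_gpu_minutes(), f"User {user.id} doesn't have any GPU minutes left"
    if disk and user.disk_storage_limit() is not None:
        assert user.has_free_disk_storage(), f"User {user.id} doesn't have any disk storage left"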
def document_export(task, file_format, part_pks, transcription_pk, region_types,
                    document_pk=None, include_images=False, user_pk=None, report_label=None):
    ALTO_FORMAT = "alto"
    PAGEXML_FORMAT = "pagexml"
    TEXT_FORMAT = "text"

    User = apps.get_model('users', 'User')
    Document = apps.get_model('core', 'Document')
    DocumentPart = apps.get_model('core', 'DocumentPart')
    Transcription = apps.get_model('core', 'Transcription')
    LineTranscription = apps.get_model('core', 'LineTranscription')
    TaskReport = apps.get_model('reporting', 'TaskReport')

    user = User.objects.get(pk=user_pk)

    # If quotas are enforced, assert that the user still has free CPU minutes
    if not settings.DISABLE_QUOTAS and user.cpu_minutes_limit() is not None:
        assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"

    document = Document.objects.get(pk=document_pk)
    report = TaskReport.objects.get(task_id=task.request.id)

    try:
        send_event('document', document.pk, "export:start", {"id": document.pk})

        # Check if we have to include orphan lines (lines without a region)
        include_orphans = False
        if 'Orphan' in region_types:
            include_orphans = True
            region_types.remove('Orphan')

        # Check if we have to include lines whose region has no type set
        include_undefined = False
        if 'Undefined' in region_types:
            include_undefined = True
            region_types.remove('Undefined')

        transcription = Transcription.objects.get(document=document, pk=transcription_pk)

        base_filename = "export_doc%d_%s_%s_%s" % (
            document.pk,
            slugify(document.name).replace('-', '_')[:32],
            file_format,
            datetime.now().strftime('%Y%m%d%H%M'))

        if file_format == TEXT_FORMAT:
            filename = "%s.txt" % base_filename
            filepath = os.path.join(user.get_document_store_path(), filename)

            region_filters = Q(line__block__typology_id__in=region_types)
            if include_orphans:
                region_filters |= Q(line__block__isnull=True)
            if include_undefined:
                region_filters |= Q(line__block__isnull=False,
                                    line__block__typology_id__isnull=True)

            lines = (LineTranscription.objects
                     .filter(transcription=transcription,
                             line__document_part__pk__in=part_pks)
                     .filter(region_filters)
                     .exclude(content="")
                     .order_by('line__document_part',
                               'line__document_part__order',
                               'line__order'))

            with open(filepath, 'w') as fh:
                fh.writelines(['%s\n' % line.content for line in lines])

        elif file_format in (ALTO_FORMAT, PAGEXML_FORMAT):
            filename = "%s.zip" % base_filename
            filepath = os.path.join(user.get_document_store_path(), filename)

            if file_format == ALTO_FORMAT:
                tplt = loader.get_template('export/alto.xml')
            elif file_format == PAGEXML_FORMAT:
                tplt = loader.get_template('export/pagexml.xml')

            parts = DocumentPart.objects.filter(document=document, pk__in=part_pks)

            region_filters = Q(typology_id__in=region_types)
            if include_undefined:
                region_filters |= Q(typology_id__isnull=True)

            with ZipFile(filepath, 'w') as zip_:
                for part in parts:
                    render_orphans = {} if not include_orphans else {
                        'orphan_lines': (part.lines
                                         .prefetch_transcription(transcription)
                                         .filter(block=None))
                    }
                    if include_images:
                        # Note: adds the image to the archive before the xml file
                        zip_.write(part.image.path, part.filename)
                    try:
                        page = tplt.render({
                            'valid_block_types': document.valid_block_types.all(),
                            'valid_line_types': document.valid_line_types.all(),
                            'part': part,
                            'blocks': (part.blocks.filter(region_filters)
                                       .annotate(avglo=Avg('lines__order'))
                                       .order_by('avglo')
                                       .prefetch_related(
                                           Prefetch('lines',
                                                    queryset=Line.objects.prefetch_transcription(transcription)))),
                            **render_orphans
                        })
                    except Exception as e:
                        report.append("Skipped {element}({image}) because '{reason}'.".format(
                            element=part.name, image=part.filename, reason=str(e)))
                    else:
                        zip_.writestr('%s.xml' % os.path.splitext(part.filename)[0], page)

    except Exception as e:
        if user:
            user.notify(_("Something went wrong during the export!"),
                        links=[{'text': 'Report', 'src': report.uri}],
                        id="export-error", level='danger')
        send_event('document', document.pk, "export:fail", {
            "id": document.pk,
            "reason": str(e)
        })
        logger.exception(e)
        report.error(str(e))
    else:
        rel_path = os.path.relpath(filepath, settings.MEDIA_ROOT)
        report.end()
        user.notify(_('Export done!'), level='success',
                    links=[{'text': _('Download'),
                            'src': settings.MEDIA_URL + rel_path}])
        # send websocket msg
        send_event('document', document.pk, "export:done", {"id": document.pk})
        # send email
        from django.contrib.sites.models import Site
        send_email('export/email/ready_subject.txt',
                   'export/email/ready_message.txt',
                   'export/email/ready_html.html',
                   (user.email,),
                   context={'domain': Site.objects.get_current().domain,
                            'export_uri': rel_path})
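# The 'Orphan'/'Undefined' sentinel handling at the top of document_export
# could be extracted into a hypothetical helper (not in the original module),
# which would also make it easy to unit test:
def split_region_types(region_types):
    # Callers pass region typology pks mixed with optional sentinel strings,
    # e.g. ['Orphan', 'Undefined', 12, 13].
    include_orphans = 'Orphan' in region_types
    include_undefined = 'Undefined' in region_types
    pks = [t for t in region_types if t not in ('Orphan', 'Undefined')]
    return pks, include_orphans, include_undefined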
def document_import(task, import_pk=None, resume=True, task_id=None, user_pk=None, report_label=None):
    DocumentImport = apps.get_model('imports', 'DocumentImport')
    TaskReport = apps.get_model('reporting', 'TaskReport')
    User = apps.get_model('users', 'User')

    user = User.objects.get(pk=user_pk)

    # If quotas are enforced, assert that the user still has free CPU minutes and disk storage
    if not settings.DISABLE_QUOTAS:
        if user.cpu_minutes_limit() is not None:
            assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"
        if user.disk_storage_limit() is not None:
            assert user.has_free_disk_storage(), f"User {user.id} doesn't have any disk storage left"

    imp = DocumentImport.objects.get(
        Q(workflow_state=DocumentImport.WORKFLOW_STATE_CREATED)
        | Q(workflow_state=DocumentImport.WORKFLOW_STATE_ERROR),
        pk=import_pk)
    imp.report = TaskReport.objects.get(task_id=task.request.id)
    imp.save()

    try:
        send_event('document', imp.document.pk, "import:start", {"id": imp.document.pk})

        # process() is a generator, so progress events can be streamed
        # as each element is imported
        for _ in imp.process(resume=resume):
            send_event('document', imp.document.pk, "import:progress", {
                "id": imp.document.pk,
                "progress": imp.processed,
                "total": imp.total
            })
    except Exception as e:
        if user:
            user.notify(_("Something went wrong during the import!"),
                        links=[{'text': 'Report', 'src': imp.report.uri}],
                        id="import-error", level='danger')
        send_event('document', imp.document.pk, "import:fail", {
            "id": imp.document.pk,
            "reason": str(e)
        })
        logger.exception(e)
        imp.report.error(str(e))
    else:
        if user:
            if imp.report.messages:
                user.notify(_("Import finished with warnings!"),
                            links=[{'text': _('Details'), 'src': imp.report.uri}],
                            level='warning')
            else:
                user.notify(_("Import done!"), level='success')
        send_event('document', imp.document.pk, "import:done", {"id": imp.document.pk})
        imp.report.end()
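# Hedged sketch: the queryset above also accepts imports in the ERROR state,
# so a failed import can presumably be re-queued with resume=True to pick up
# where it left off (assuming document_import is registered with Celery;
# decorators are omitted in this excerpt):
def retry_failed_import(imp, user):
    # Hypothetical helper, mirroring the .delay() call in process() above.
    document_import.delay(import_pk=imp.pk, resume=True, user_pk=user.pk)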
def segtrain(task, model_pk, part_pks, document_pk=None, user_pk=None, **kwargs):
    # Note: hack to circumvent "AssertionError: daemonic processes
    # are not allowed to have children"
    from multiprocessing import current_process
    current_process().daemon = False

    if user_pk:
        try:
            user = User.objects.get(pk=user_pk)
            # If quotas are enforced, assert that the user still has free
            # CPU minutes, GPU minutes and disk storage
            if not settings.DISABLE_QUOTAS:
                if user.cpu_minutes_limit() is not None:
                    assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"
                if user.gpu_minutes_limit() is not None:
                    assert user.has_free_gpu_minutes(), f"User {user.id} doesn't have any GPU minutes left"
                if user.disk_storage_limit() is not None:
                    assert user.has_free_disk_storage(), f"User {user.id} doesn't have any disk storage left"
        except User.DoesNotExist:
            user = None
    else:
        user = None

    def msg(txt, fg=None, nl=False):
        # Funnel kraken's progress messages into the application log.
        logger.info(txt)

    redis_.set('segtrain-%d' % model_pk, json.dumps({'task_id': task.request.id}))

    Document = apps.get_model('core', 'Document')
    DocumentPart = apps.get_model('core', 'DocumentPart')
    OcrModel = apps.get_model('core', 'OcrModel')

    model = OcrModel.objects.get(pk=model_pk)

    try:
        load = model.file.path
    except ValueError:  # model is empty
        load = settings.KRAKEN_DEFAULT_SEGMENTATION_MODEL
        model.file = model.file.field.upload_to(model, slugify(model.name) + '.mlmodel')

    model_dir = os.path.join(settings.MEDIA_ROOT, os.path.split(model.file.path)[0])
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    try:
        model.training = True
        model.save()
        send_event('document', document_pk, "training:start", {"id": model.pk})

        qs = DocumentPart.objects.filter(pk__in=part_pks).prefetch_related('lines')
        ground_truth = list(qs)
        if ground_truth[0].document.line_offset == Document.LINE_OFFSET_TOPLINE:
            topline = True
        elif ground_truth[0].document.line_offset == Document.LINE_OFFSET_CENTERLINE:
            topline = None
        else:
            topline = False

        # Fixed seed so the train/eval split is reproducible across runs.
        np.random.default_rng(241960353267317949653744176059648850006).shuffle(ground_truth)

        # Hold out ~10% of the parts for evaluation; split on the shuffled
        # list so the shuffle actually takes effect.
        partition = max(1, int(len(ground_truth) / 10))
        training_data = []
        evaluation_data = []
        for part in ground_truth[partition:]:
            training_data.append(make_segmentation_training_data(part))
        for part in ground_truth[:partition]:
            evaluation_data.append(make_segmentation_training_data(part))

        DEVICE = getattr(settings, 'KRAKEN_TRAINING_DEVICE', 'cpu')
        LOAD_THREADS = getattr(settings, 'KRAKEN_TRAINING_LOAD_THREADS', 0)
        trainer = kraken_train.KrakenTrainer.segmentation_train_gen(
            message=msg,
            output=os.path.join(model_dir, 'version'),
            format_type=None,
            device=DEVICE,
            load=load,
            training_data=training_data,
            evaluation_data=evaluation_data,
            threads=LOAD_THREADS,
            augment=True,
            resize='both',
            hyper_params={'epochs': 30},
            load_hyper_parameters=True,
            topline=topline)

        def _print_eval(epoch=0, accuracy=0, mean_acc=0, mean_iu=0, freq_iu=0, val_metric=0):
            model.refresh_from_db()
            model.training_epoch = epoch
            model.training_accuracy = float(val_metric)
            # model.training_total = chars
            # model.training_errors = error
            relpath = os.path.relpath(model_dir, settings.MEDIA_ROOT)
            model.new_version(file=f'{relpath}/version_{epoch}.mlmodel')
            model.save()
            send_event('document', document_pk, "training:eval", {
                "id": model.pk,
                'versions': model.versions,
                'epoch': epoch,
                'accuracy': float(val_metric)
                # 'chars': chars,
                # 'error': error
            })

        trainer.run(_print_eval)

        best_version = os.path.join(model_dir, f'version_{trainer.stopper.best_epoch}.mlmodel')
        try:
            shutil.copy(best_version, model.file.path)
        except FileNotFoundError:
            if user:
                user.notify(_("Training didn't get better results than base model!"),
                            id="seg-no-gain-error", level='warning')
            shutil.copy(load, model.file.path)
    except Exception as e:
        send_event('document', document_pk, "training:error", {"id": model.pk})
        if user:
            user.notify(_("Something went wrong during the segmenter training process!"),
                        id="training-error", level='danger')
        logger.exception(e)
        raise
    else:
        if user:
            user.notify(_("Training finished!"),
                        id="training-success", level='success')
    finally:
        model.training = False
        model.file_size = model.file.size
        model.save()
        send_event('document', document_pk, "training:done", {"id": model.pk})
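# For reference, the line_offset -> topline mapping inside segtrain above,
# restated as a lookup (kraken's convention: True = topline, None = centerline,
# False = baseline); Document is the model fetched via apps.get_model there:
#
#     topline = {
#         Document.LINE_OFFSET_TOPLINE: True,
#         Document.LINE_OFFSET_CENTERLINE: None,
#     }.get(ground_truth[0].document.line_offset, False)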