Example #1
def process(self):
    document_import.delay(import_pk=self.instance.pk,
                          user_pk=self.user.pk,
                          report_label=_('Import in %(document_name)s') %
                          {'document_name': self.document.name})
    send_event('document', self.document.pk, "import:queued",
               {"id": self.document.pk})
Example #2
def update_client_state(part_id, task, status, task_id=None, data=None):
    DocumentPart = apps.get_model('core', 'DocumentPart')
    part = DocumentPart.objects.get(pk=part_id)
    task_name = task.split('.')[-1]
    send_event('document', part.document.pk, "part:workflow", {
        "id": part.pk,
        "process": task_name,
        "status": status,
        "task_id": task_id,
        "data": data or {}
    })
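
update_client_state is a natural fit for Celery's lifecycle signals. The wiring below is a hypothetical sketch, not part of the example: it assumes tasks that touch a DocumentPart receive its pk as an instance_pk keyword, as the other examples here suggest.

# Hypothetical wiring sketch: broadcast an "ongoing" state whenever such a
# task starts. task_prerun is a real Celery signal; the instance_pk
# convention is an assumption drawn from the task signatures in this listing.
from celery.signals import task_prerun

@task_prerun.connect
def notify_part_workflow(sender=None, task_id=None, task=None,
                         args=None, kwargs=None, **extra):
    part_id = (kwargs or {}).get('instance_pk')
    if part_id is not None:
        update_client_state(part_id, task.name, 'ongoing', task_id=task_id)
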
Example #3
    def _print_eval(epoch=0, accuracy=0, chars=0, error=0, val_metric=0):
        model.refresh_from_db()
        model.training_epoch = epoch
        model.training_accuracy = accuracy
        model.training_total = int(chars)
        model.training_errors = error
        relpath = os.path.relpath(model_dir, settings.MEDIA_ROOT)
        model.new_version(file=f'{relpath}/version_{epoch}.mlmodel')
        model.save()

        send_event('document', document.pk, "training:eval", {
            "id": model.pk,
            'versions': model.versions,
            'epoch': epoch,
            'accuracy': accuracy,
            'chars': int(chars),
            'error': error})
Example #4
        def _print_eval(epoch=0, accuracy=0, mean_acc=0, mean_iu=0, freq_iu=0,
                        val_metric=0):
            model.refresh_from_db()
            model.training_epoch = epoch
            model.training_accuracy = float(val_metric)
            # model.training_total = chars
            # model.training_errors = error
            relpath = os.path.relpath(model_dir, settings.MEDIA_ROOT)
            model.new_version(file=f'{relpath}/version_{epoch}.mlmodel')
            model.save()

            send_event('document', document_pk, "training:eval", {
                "id": model.pk,
                'versions': model.versions,
                'epoch': epoch,
                'accuracy': float(val_metric)
                # 'chars': chars,
                # 'error': error
            })
Example #5
def recalculate_masks(instance_pk=None, user_pk=None, only=None, **kwargs):
    if user_pk:
        try:
            user = User.objects.get(pk=user_pk)
            # If quotas are enforced, assert that the user still has free CPU minutes
            if not settings.DISABLE_QUOTAS and user.cpu_minutes_limit() is not None:
                assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"
        except User.DoesNotExist:
            user = None

    # Resolve the model class outside the try block, otherwise a failure in
    # apps.get_model would leave DocumentPart unbound in the except clause.
    DocumentPart = apps.get_model('core', 'DocumentPart')
    try:
        part = DocumentPart.objects.get(pk=instance_pk)
    except DocumentPart.DoesNotExist:
        logger.error('Trying to recalculate masks of nonexistent DocumentPart: %d', instance_pk)
        return

    result = part.make_masks(only=only)
    send_event('document', part.document.pk, "part:mask", {
        "id": part.pk,
        "lines": [{'pk': line.pk, 'mask': line.mask} for line in result]
    })
Example #6
def train(task, transcription_pk, model_pk=None, part_pks=None, user_pk=None, **kwargs):
    if user_pk:
        try:
            user = User.objects.get(pk=user_pk)
            # If quotas are enforced, assert that the user still has free CPU minutes, GPU minutes and disk storage
            if not settings.DISABLE_QUOTAS:
                if user.cpu_minutes_limit() is not None:
                    assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"
                if user.gpu_minutes_limit() is not None:
                    assert user.has_free_gpu_minutes(), f"User {user.id} doesn't have any GPU minutes left"
                if user.disk_storage_limit() is not None:
                    assert user.has_free_disk_storage(), f"User {user.id} doesn't have any disk storage left"
        except User.DoesNotExist:
            user = None
    else:
        user = None

    redis_.set('training-%d' % model_pk, json.dumps({'task_id': task.request.id}))

    Transcription = apps.get_model('core', 'Transcription')
    LineTranscription = apps.get_model('core', 'LineTranscription')
    OcrModel = apps.get_model('core', 'OcrModel')

    # Fetch everything the except/finally blocks reference up front, so a
    # failed lookup can't leave `model` or `document` unbound below.
    model = OcrModel.objects.get(pk=model_pk)
    transcription = Transcription.objects.get(pk=transcription_pk)
    document = transcription.document

    try:
        model.training = True
        model.save()
        send_event('document', document.pk, "training:start", {
            "id": model.pk,
        })
        qs = (LineTranscription.objects
              .filter(transcription=transcription,
                      line__document_part__pk__in=part_pks)
              .exclude(Q(content='') | Q(content=None)))
        train_(qs, document, transcription, model=model, user=user)
    except Exception as e:
        send_event('document', document.pk, "training:error", {
            "id": model.pk,
        })
        if user:
            user.notify(_("Something went wrong during the training process!"),
                        id="training-error", level='danger')
        logger.exception(e)
    else:
        if user:
            user.notify(_("Training finished!"),
                        id="training-success",
                        level='success')
    finally:
        model.training = False
        model.file_size = model.file.size
        model.save()

        send_event('document', document.pk, "training:done", {
            "id": model.pk,
        })
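
The redis_.set('training-%d' % model_pk, ...) call above stores the Celery task id so the running job can be found again later. The matching cancellation path is not shown in the listing; here is a hedged sketch (cancel_training is a hypothetical helper):

# Hypothetical companion to the redis_.set above: read the task id back under
# the same key format and revoke the running training task.
import json
from celery import current_app

def cancel_training(model_pk):
    raw = redis_.get('training-%d' % model_pk)
    if raw:
        task_id = json.loads(raw)['task_id']
        current_app.control.revoke(task_id, terminate=True)
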
Example #7
def document_export(task,
                    file_format,
                    part_pks,
                    transcription_pk,
                    region_types,
                    document_pk=None,
                    include_images=False,
                    user_pk=None,
                    report_label=None):
    ALTO_FORMAT = "alto"
    PAGEXML_FORMAT = "pagexml"
    TEXT_FORMAT = "text"

    User = apps.get_model('users', 'User')
    Document = apps.get_model('core', 'Document')
    DocumentPart = apps.get_model('core', 'DocumentPart')
    Transcription = apps.get_model('core', 'Transcription')
    LineTranscription = apps.get_model('core', 'LineTranscription')
    TaskReport = apps.get_model('reporting', 'TaskReport')

    user = User.objects.get(pk=user_pk)

    # If quotas are enforced, assert that the user still has free CPU minutes
    if not settings.DISABLE_QUOTAS and user.cpu_minutes_limit() is not None:
        assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"

    document = Document.objects.get(pk=document_pk)
    report = TaskReport.objects.get(task_id=task.request.id)

    try:
        send_event('document', document.pk, "export:start",
                   {"id": document.pk})

        # Check if we have to include orphan lines
        include_orphans = False
        if 'Orphan' in region_types:
            include_orphans = True
            region_types.remove('Orphan')

        # Check if we have to include lines with an undefined region type
        include_undefined = False
        if 'Undefined' in region_types:
            include_undefined = True
            region_types.remove('Undefined')

        transcription = Transcription.objects.get(document=document,
                                                  pk=transcription_pk)

        base_filename = "export_doc%d_%s_%s_%s" % (
            document.pk, slugify(document.name).replace('-', '_')[:32],
            file_format, datetime.now().strftime('%Y%m%d%H%M'))

        if file_format == TEXT_FORMAT:
            filename = "%s.txt" % base_filename
            filepath = os.path.join(user.get_document_store_path(), filename)
            # content_type = 'text/plain'

            region_filters = Q(line__block__typology_id__in=region_types)
            if include_orphans:
                region_filters |= Q(line__block__isnull=True)
            if include_undefined:
                region_filters |= Q(line__block__isnull=False,
                                    line__block__typology_id__isnull=True)

            lines = (LineTranscription.objects
                     .filter(transcription=transcription,
                             line__document_part__pk__in=part_pks)
                     .filter(region_filters)
                     .exclude(content="")
                     .order_by('line__document_part',
                               'line__document_part__order',
                               'line__order'))
            # return StreamingHttpResponse(['%s\n' % line.content for line in lines],
            #                              content_type=content_type)
            with open(filepath, 'w') as fh:
                fh.writelines(['%s\n' % line.content for line in lines])

        elif file_format in (ALTO_FORMAT, PAGEXML_FORMAT):
            filename = "%s.zip" % base_filename
            filepath = os.path.join(user.get_document_store_path(), filename)
            # buff = io.BytesIO()
            if file_format == ALTO_FORMAT:
                tplt = loader.get_template('export/alto.xml')
            elif file_format == PAGEXML_FORMAT:
                tplt = loader.get_template('export/pagexml.xml')
            parts = DocumentPart.objects.filter(document=document,
                                                pk__in=part_pks)

            region_filters = Q(typology_id__in=region_types)
            if include_undefined:
                region_filters |= Q(typology_id__isnull=True)

            with ZipFile(filepath, 'w') as zip_:
                for part in parts:
                    render_orphans = {}
                    if include_orphans:
                        render_orphans = {
                            'orphan_lines': (part.lines
                                             .prefetch_transcription(transcription)
                                             .filter(block=None))
                        }

                    if include_images:
                        # Note: write the image into the archive before the xml file
                        zip_.write(part.image.path, part.filename)
                    try:
                        page = tplt.render({
                            'valid_block_types': document.valid_block_types.all(),
                            'valid_line_types': document.valid_line_types.all(),
                            'part': part,
                            'blocks': (part.blocks
                                       .filter(region_filters)
                                       .annotate(avglo=Avg('lines__order'))
                                       .order_by('avglo')
                                       .prefetch_related(
                                           Prefetch('lines',
                                                    queryset=Line.objects.prefetch_transcription(transcription)))),
                            **render_orphans
                        })
                    except Exception as e:
                        report.append(
                            "Skipped {element}({image}) because '{reason}'.".format(
                                element=part.name,
                                image=part.filename,
                                reason=str(e)))
                    else:
                        zip_.writestr(
                            '%s.xml' % os.path.splitext(part.filename)[0],
                            page)

    except Exception as e:
        if user:
            user.notify(_("Something went wrong during the export!"),
                        links=[{
                            'text': 'Report',
                            'src': report.uri
                        }],
                        id="export-error",
                        level='danger')

        send_event('document', document.pk, "import:fail", {
            "id": document.pk,
            "reason": str(e)
        })
        logger.exception(e)
        report.error(str(e))

    else:
        rel_path = os.path.relpath(filepath, settings.MEDIA_ROOT)
        report.end()

        user.notify(_('Export done!'),
                    level='success',
                    links=[{
                        'text': _('Download'),
                        'src': settings.MEDIA_URL + rel_path
                    }])

        # send websocket msg
        send_event('document', document.pk, "export:done", {"id": document.pk})

        # send email
        from django.contrib.sites.models import Site
        send_email('export/email/ready_subject.txt',
                   'export/email/ready_message.txt',
                   'export/email/ready_html.html', (user.email, ),
                   context={
                       'domain': Site.objects.get_current().domain,
                       'export_uri': rel_path
                   })
Example #8
def document_import(task,
                    import_pk=None,
                    resume=True,
                    task_id=None,
                    user_pk=None,
                    report_label=None):
    DocumentImport = apps.get_model('imports', 'DocumentImport')
    TaskReport = apps.get_model('reporting', 'TaskReport')
    User = apps.get_model('users', 'User')

    user = User.objects.get(pk=user_pk)
    # If quotas are enforced, assert that the user still has free CPU minutes and disk storage
    if not settings.DISABLE_QUOTAS:
        if user.cpu_minutes_limit() is not None:
            assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"
        if user.disk_storage_limit() is not None:
            assert user.has_free_disk_storage(), f"User {user.id} doesn't have any disk storage left"

    imp = DocumentImport.objects.get(
        Q(workflow_state=DocumentImport.WORKFLOW_STATE_CREATED)
        | Q(workflow_state=DocumentImport.WORKFLOW_STATE_ERROR),
        pk=import_pk)

    imp.report = TaskReport.objects.get(task_id=task.request.id)
    imp.save()

    try:
        send_event('document', imp.document.pk, "import:start",
                   {"id": imp.document.pk})

        for _ in imp.process(resume=resume):
            send_event('document', imp.document.pk, "import:progress", {
                "id": imp.document.pk,
                "progress": imp.processed,
                "total": imp.total
            })
    except Exception as e:
        if user:
            user.notify(_("Something went wrong during the import!"),
                        links=[{
                            'text': 'Report',
                            'src': imp.report.uri
                        }],
                        id="import-error",
                        level='danger')

        send_event('document', imp.document.pk, "import:fail", {
            "id": imp.document.pk,
            "reason": str(e)
        })
        logger.exception(e)
        imp.report.error(str(e))
    else:
        if user:
            if imp.report.messages:
                user.notify(_("Import finished with warnings!"),
                            links=[{
                                'text': _('Details'),
                                'src': imp.report.uri
                            }],
                            level='warning')
            else:
                user.notify(_("Import done!"), level='success')
        send_event('document', imp.document.pk, "import:done",
                   {"id": imp.document.pk})
        imp.report.end()
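
Both document_import and document_export fetch their TaskReport with task_id=task.request.id, so a report row has to exist before the task body runs. One way to guarantee that, sketched purely as an assumption (before_task_publish is a real Celery signal; the report_label plumbing is guessed from the .delay() call in Example #1):

# Hypothetical sketch: create the TaskReport when the task message is
# published, so the worker can look it up by id. Under Celery's v2 message
# protocol the body is an (args, kwargs, embed) tuple and headers['id'] is
# the task id; the `label` field on TaskReport is an assumption.
from celery.signals import before_task_publish

@before_task_publish.connect
def create_task_report(sender=None, headers=None, body=None, **extra):
    args, kwargs, _ = body
    label = kwargs.get('report_label')
    if label:
        TaskReport = apps.get_model('reporting', 'TaskReport')
        TaskReport.objects.create(task_id=headers['id'], label=label)
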
Example #9
def segtrain(task, model_pk, part_pks, document_pk=None, user_pk=None, **kwargs):
    # Note: hack to circumvent "AssertionError: daemonic processes are not allowed to have children"
    from multiprocessing import current_process
    current_process().daemon = False

    if user_pk:
        try:
            user = User.objects.get(pk=user_pk)
            # If quotas are enforced, assert that the user still has free CPU minutes, GPU minutes and disk storage
            if not settings.DISABLE_QUOTAS:
                if user.cpu_minutes_limit() is not None:
                    assert user.has_free_cpu_minutes(), f"User {user.id} doesn't have any CPU minutes left"
                if user.gpu_minutes_limit() is not None:
                    assert user.has_free_gpu_minutes(), f"User {user.id} doesn't have any GPU minutes left"
                if user.disk_storage_limit() is not None:
                    assert user.has_free_disk_storage(), f"User {user.id} doesn't have any disk storage left"
        except User.DoesNotExist:
            user = None
    else:
        user = None

    def msg(txt, fg=None, nl=False):
        # click-style echo callback handed to kraken's trainer below; fg and
        # nl are accepted for compatibility and ignored, output goes to the log.
        logger.info(txt)

    redis_.set('segtrain-%d' % model_pk, json.dumps({'task_id': task.request.id}))

    Document = apps.get_model('core', 'Document')
    DocumentPart = apps.get_model('core', 'DocumentPart')
    OcrModel = apps.get_model('core', 'OcrModel')

    model = OcrModel.objects.get(pk=model_pk)

    try:
        load = model.file.path
    except ValueError:  # model is empty
        load = settings.KRAKEN_DEFAULT_SEGMENTATION_MODEL
        model.file = model.file.field.upload_to(model, slugify(model.name) + '.mlmodel')

    model_dir = os.path.join(settings.MEDIA_ROOT, os.path.split(model.file.path)[0])

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    try:
        model.training = True
        model.save()
        send_event('document', document_pk, "training:start", {
            "id": model.pk,
        })
        qs = DocumentPart.objects.filter(pk__in=part_pks).prefetch_related('lines')

        ground_truth = list(qs)
        if ground_truth[0].document.line_offset == Document.LINE_OFFSET_TOPLINE:
            topline = True
        elif ground_truth[0].document.line_offset == Document.LINE_OFFSET_CENTERLINE:
            topline = None
        else:
            topline = False

        np.random.default_rng(241960353267317949653744176059648850006).shuffle(ground_truth)
        partition = max(1, int(len(ground_truth) / 10))

        training_data = []
        evaluation_data = []
        # Split on the shuffled list, not the queryset: slicing `qs` would hit
        # the database again and silently ignore the shuffle above.
        for part in ground_truth[partition:]:
            training_data.append(make_segmentation_training_data(part))
        for part in ground_truth[:partition]:
            evaluation_data.append(make_segmentation_training_data(part))

        DEVICE = getattr(settings, 'KRAKEN_TRAINING_DEVICE', 'cpu')
        LOAD_THREADS = getattr(settings, 'KRAKEN_TRAINING_LOAD_THREADS', 0)
        trainer = kraken_train.KrakenTrainer.segmentation_train_gen(
            message=msg,
            output=os.path.join(model_dir, 'version'),
            format_type=None,
            device=DEVICE,
            load=load,
            training_data=training_data,
            evaluation_data=evaluation_data,
            threads=LOAD_THREADS,
            augment=True,
            resize='both',
            hyper_params={'epochs': 30},
            load_hyper_parameters=True,
            topline=topline
        )

        def _print_eval(epoch=0, accuracy=0, mean_acc=0, mean_iu=0, freq_iu=0,
                        val_metric=0):
            model.refresh_from_db()
            model.training_epoch = epoch
            model.training_accuracy = float(val_metric)
            # model.training_total = chars
            # model.training_errors = error
            relpath = os.path.relpath(model_dir, settings.MEDIA_ROOT)
            model.new_version(file=f'{relpath}/version_{epoch}.mlmodel')
            model.save()

            send_event('document', document_pk, "training:eval", {
                "id": model.pk,
                'versions': model.versions,
                'epoch': epoch,
                'accuracy': float(val_metric)
                # 'chars': chars,
                # 'error': error
            })

        trainer.run(_print_eval)

        best_version = os.path.join(model_dir,
                                    f'version_{trainer.stopper.best_epoch}.mlmodel')

        try:
            shutil.copy(best_version, model.file.path)
        except FileNotFoundError:
            # No new version beat the base model; keep the model we loaded.
            if user:
                user.notify(_("Training didn't get better results than base model!"),
                            id="seg-no-gain-error", level='warning')
            shutil.copy(load, model.file.path)

    except Exception as e:
        send_event('document', document_pk, "training:error", {
            "id": model.pk,
        })
        if user:
            user.notify(_("Something went wrong during the segmenter training process!"),
                        id="training-error", level='danger')
        logger.exception(e)
        raise
    else:
        if user:
            user.notify(_("Training finished!"),
                        id="training-success",
                        level='success')
    finally:
        model.training = False
        model.file_size = model.file.size
        model.save()

        send_event('document', document_pk, "training:done", {
            "id": model.pk,
        })
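
Each of these functions takes the bound task instance as its first argument, which suggests they are registered with Celery's bind=True. A minimal registration-and-enqueue sketch, with the decorator options and call site as assumptions rather than the project's actual configuration:

# Hypothetical registration sketch: with bind=True the task instance is
# passed first, matching the `task` parameter used throughout the examples.
from celery import shared_task

@shared_task(bind=True)
def segtrain(task, model_pk, part_pks, document_pk=None, user_pk=None, **kwargs):
    ...  # body as in Example #9

# Hypothetical call site:
# segtrain.delay(model.pk, [part.pk for part in parts],
#                document_pk=document.pk, user_pk=request.user.pk)
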