Exemplo n.º 1
0
    def test_delete_pages(self):
        """Deleting a page updates the page count and bumps the version."""
        pdf_path = os.path.join(BASE_DIR, "data", "berlin.pdf")

        with open(pdf_path, 'rb') as src:
            payload = src.read()

        result = go_through_pipelines(
            init_kwargs={'payload': payload, 'processor': 'TEST'},
            apply_kwargs={'skip_ocr': True, 'name': "berlin.pdf"}
        )
        self.assertIsNotNone(result)

        document = Document.objects.get(title="berlin.pdf")
        # The berlin.pdf fixture is expected to contain exactly two pages.
        self.assertEqual(document.page_count, 2)
        # A freshly created document always starts at version 0.
        self.assertEqual(document.version, 0)

        document.delete_pages(page_numbers=[1], skip_migration=True)

        self.assertEqual(document.page_count, 1)
        self.assertEqual(document.pages.count(), 1)
        # Deleting pages must increment the document version.
        self.assertEqual(document.version, 1)
Exemplo n.º 2
0
 def test_simple_pipeline_txt(self):
     """A text payload is rejected (returns None) by every processor."""
     random_text = ''.join(
         random.choices(string.ascii_uppercase + string.digits, k=100)
     ).encode()
     # Prefix with magic bytes so type detection classifies the payload;
     # presumably MAGIC_BYTES[1][1] is the txt signature — TODO confirm.
     payload = b''.join([MAGIC_BYTES[1][1], random_text])
     for proc in PROCESSORS:
         doc = go_through_pipelines(
             self.make_init_kwargs(payload=payload, processor=proc),
             self.make_apply_kwargs()
         )
         # Plain text is not a supported input type.
         self.assertIsNone(doc)
Exemplo n.º 3
0
 def test_default_pipeline_jpg(self):
     """A JPEG payload is accepted by every processor."""
     jpg_path = os.path.join(BASE_DIR, "data", "page-1.jpg")
     with open(jpg_path, 'rb') as src:
         payload = src.read()

     for proc in PROCESSORS:
         doc = go_through_pipelines(
             self.make_init_kwargs(payload=payload, processor=proc),
             self.make_apply_kwargs()
         )
         # JPEG is a supported input type, so a document is produced.
         self.assertIsNotNone(doc)
Exemplo n.º 4
0
 def test_simple_pipeline_pdf(self):
     """A PDF payload is accepted and the document gets the expected name."""
     pdf_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
     with open(pdf_path, 'rb') as src:
         payload = src.read()

     for proc in PROCESSORS:
         doc = go_through_pipelines(
             self.make_init_kwargs(payload=payload, processor=proc),
             self.make_apply_kwargs()
         )
         self.assertIsNotNone(doc)
         # Presumably make_apply_kwargs() sets this name — TODO confirm.
         self.assertEqual(doc.name, 'test_change_name')
Exemplo n.º 5
0
    def test_import_file(self):
        """Importing a PDF through the pipelines creates one document."""
        pdf_path = os.path.join(BASE_DIR, "data", "berlin.pdf")

        with open(pdf_path, 'rb') as src:
            payload = src.read()

        result = go_through_pipelines(
            init_kwargs={'payload': payload, 'processor': 'TEST'},
            apply_kwargs={'skip_ocr': True, 'name': "berlin.pdf"}
        )
        self.assertIsNotNone(result)

        created = Document.objects.filter(title="berlin.pdf").count()
        self.assertEqual(created, 1, "Document berlin.pdf was not created.")
Exemplo n.º 6
0
    def put(self, request, filename):
        """Ingest an uploaded file asynchronously through the WEB pipeline."""
        uploaded = request.data['file']

        doc = go_through_pipelines(
            init_kwargs={'payload': uploaded, 'processor': WEB},
            apply_kwargs={
                'user': request.user.username,
                'name': filename,
                'apply_async': True
            }
        )

        # If processing completed synchronously a Document comes back;
        # serialize it so the client receives its metadata immediately.
        if isinstance(doc, Document):
            return Response(DocumentSerializer(doc).data)

        # Otherwise ingestion was queued — acknowledge with a plain 200.
        return Response(status=200)
Exemplo n.º 7
0
def import_documents(directory, skip_ocr=False):
    """Import every file found in ``directory`` through the pipelines.

    Each file is hashed, then re-hashed after a settling delay; only
    files whose content did not change in between are consumed (this
    avoids ingesting files still being written).  Files that import
    successfully are removed from disk.

    Args:
        directory: path scanned (non-recursively) for files to import.
        skip_ocr: when True, OCR is skipped for imported documents.

    Raises:
        ValueError: if ``directory`` is falsy.
    """
    if not directory:
        raise ValueError("Import directory value is None")

    files = []
    for entry in os.scandir(directory):
        if entry.is_file():
            files.append((entry.path, _blake2b_digest(entry.path)))
        else:
            logger.warning(
                "Skipping %s as it is not a file",
                entry.path
            )

    if not files:
        return

    # Give writers time to finish; a file is only consumed below if its
    # hash is unchanged after this pause.
    time.sleep(int(settings.PAPERMERGE_FILES_MIN_UNMODIFIED_DURATION))

    for file_path, file_hash in files:
        with open(file_path, 'rb') as file_handler:
            file_bytes = file_handler.read()

        # Fixed `not a == b` idiom: use `!=` directly.
        if file_hash != blake2b(file_bytes).digest():
            # Modified during the settling period — leave for a later run.
            continue

        # File has not been modified and can be consumed.
        basename = os.path.basename(file_path)
        init_kwargs = {'payload': file_bytes, 'processor': LOCAL}
        apply_kwargs = {'user': None,
                        'name': basename,
                        'skip_ocr': skip_ocr
                        }
        doc = go_through_pipelines(init_kwargs, apply_kwargs)
        if doc is not None:
            os.remove(file_path)


def _blake2b_digest(path):
    """Return the blake2b digest of the file at ``path``."""
    digest = blake2b()
    with open(path, 'rb') as fh:
        digest.update(fh.read())
    return digest.digest()
Exemplo n.º 8
0
def read_email_message(message, user=None, skip_ocr=False):
    """Feed every attachment of an email message into the pipelines.

    ``message`` is an instance of Python's ``email.message`` module.
    Returns True if at least one attachment produced a document.
    """
    ingested = False

    for attachment in message.iter_attachments():
        try:
            content = attachment.get_content()
        except KeyError:
            # Attachment without a retrievable body — skip it.
            continue

        document = go_through_pipelines(
            {'payload': content, 'processor': IMAP},
            {
                'user': user,
                'name': attachment.get_filename(),
                'skip_ocr': skip_ocr
            }
        )
        if document is not None:
            ingested = True

    return ingested
Exemplo n.º 9
0
def upload(request):
    """
    Handle an ajax file upload.

    To understand the returned value, have a look at the
    papermerge.core.views.decorators.json_reponse decorator.
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return "Missing input file", 400

    if len(files) > 1:
        msg = "More then one files per ajax? how come?"
        logger.warning(msg)
        return msg, 400

    upload_file = files[0]
    logger.debug("upload for f=%s user=%s", upload_file, request.user)

    # "-1" means "no parent"; filter_node_id normalizes the raw value.
    parent_id = filter_node_id(request.POST.get('parent', "-1"))

    init_kwargs = {'payload': upload_file, 'processor': WEB}
    apply_kwargs = {
        'user': request.user,
        'name': upload_file.name,
        'parent': parent_id,
        'lang': request.POST.get('language'),
        'notes': request.POST.get('notes'),
        'apply_async': True
    }

    doc = go_through_pipelines(init_kwargs, apply_kwargs)

    if not doc:
        msg = _("File type not supported."
                " Only pdf, tiff, png, jpeg files are supported")
        return msg, 400

    # The client builds a thumbnail from this payload:
    # - title / doc_id identify the new document
    # - preview_url renders the first page at width 200
    # - action_url is needed for renaming/deleting the selected item
    #   (intentionally blank here)
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    return {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': "",
        'preview_url': preview_url
    }