def import_documents(directory):
    """Import every unmodified regular file found directly in *directory*.

    The directory is scanned once and each regular file is recorded with
    its modification time. After waiting
    ``settings.PAPERMERGE_FILES_MIN_UNMODIFIED_DURATION`` seconds, each file
    whose modification time has not changed is moved into a fresh temporary
    directory and handed to ``DocumentImporter``. Files are processed from
    oldest to newest modification time.

    Entries that are not regular files are skipped with a warning. Files
    whose modification time changed during the wait, or which are no longer
    accessible after it, are skipped for this call.

    Args:
        directory: path of the directory to scan (not recursive).

    Returns:
        None.
    """
    files = []
    # The context manager closes the scandir iterator deterministically,
    # even if is_file()/stat() raises part-way through the scan.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file():
                files.append((entry.path, entry.stat().st_mtime))
            else:
                logger.warning(
                    "Skipping %s as it is not a file", entry.path
                )

    if not files:
        return

    files_old_to_new = sorted(files, key=itemgetter(1))

    # Give writers a chance to finish; a changed mtime after the pause
    # means the file is still being written.
    time.sleep(int(settings.PAPERMERGE_FILES_MIN_UNMODIFIED_DURATION))

    for file, mtime in files_old_to_new:
        try:
            current_mtime = os.path.getmtime(file)
        except OSError:
            # The file vanished (or became unreadable) while we slept; do
            # not let one missing file abort the import of the others.
            logger.warning(
                "Skipping %s as it is no longer accessible", file
            )
            continue

        if mtime != current_mtime:
            continue

        # File has not been modified and can be consumed
        logger.info(f"Importing file {file}...")
        basename = os.path.basename(file)
        with tempfile.TemporaryDirectory() as tempdirname:
            shutil.move(file, tempdirname)
            temp_file_name = os.path.join(tempdirname, basename)
            logger.info(f"Same as temp_file_name={temp_file_name}...")
            imp = DocumentImporter(temp_file_name)
            imp.import_file()
def read_email_message(message):
    """
    Walk all parts of an email message and import each supported payload.

    message is an instance of python's module email.message

    Every part is logged with its index and content type. Parts accepted by
    ``is_payload_supported`` are written to a named temporary file and
    passed to ``DocumentImporter``; all other parts are ignored.
    """
    for position, payload in enumerate(message.walk()):
        main_type = payload.get_content_maintype()
        sub_type = payload.get_content_subtype()

        logger.debug(
            f"IMAP import: payload {position} maintype={main_type}"
            f" subtype={sub_type}."
        )

        if not is_payload_supported(maintype=main_type, subtype=sub_type):
            logger.debug(
                f"IMAP import: ignoring payload."
            )
            continue

        logger.debug(
            f"IMAP import: importing..."
        )
        with tempfile.NamedTemporaryFile() as attachment:
            attachment.write(payload.get_payload(decode=True))
            # Flush before handing the path over, so the importer (which
            # receives only the file name) sees the written bytes.
            attachment.flush()
            importer = DocumentImporter(attachment.name)
            importer.import_file(
                delete_after_import=False
            )
def test_import_file_with_title_arg(self):
    """Importing with ``file_title`` creates one document with that title."""
    src_file_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
    imp = DocumentImporter(src_file_path)

    imported = imp.import_file(
        file_title="X1.pdf",
        delete_after_import=False,
        skip_ocr=True,
    )
    # Assert on the import result directly instead of the
    # ``if not ...: assertTrue(False, ...)`` construct.
    self.assertTrue(imported, "Error while importing file")

    self.assertEqual(
        Document.objects.filter(title="X1.pdf").count(),
        1,
        "Document X1.pdf was not created.",
    )
def put(self, request, filename):
    """Import the uploaded file as a document titled *filename*.

    The upload is read from ``request.data['file']`` and imported on
    behalf of ``request.user``. If the importer returns a ``Document``,
    the response carries its serialized form; otherwise an empty
    response with status 200 is returned.
    """
    uploaded = request.data['file']
    importer = DocumentImporter(
        file=uploaded.temporary_file_path(),
        username=request.user.username,
    )
    result = importer.import_file(
        file_title=filename,
        apply_async=True,
        delete_after_import=False,
    )

    if not isinstance(result, Document):
        return Response(status=200)

    return Response(DocumentSerializer(result).data)
def read_email_message(message):
    """
    Import every PDF attachment found in an email message.

    message is an instance of python's module email.message

    Only parts whose content type is application/pdf are imported; each
    one is written to a named temporary file and passed to
    ``DocumentImporter``.
    """
    for payload in message.walk():
        content_type = (
            payload.get_content_maintype(),
            payload.get_content_subtype(),
        )
        if content_type != ('application', 'pdf'):
            continue

        with tempfile.NamedTemporaryFile() as pdf_file:
            pdf_file.write(payload.get_payload(decode=True))
            # Flush before handing the path over, so the importer (which
            # receives only the file name) sees the written bytes.
            pdf_file.flush()
            importer = DocumentImporter(pdf_file.name)
            importer.import_file(delete_after_import=False)
def test_delete_pages(self):
    """Deleting a page lowers the page count and increments the version."""
    # Create a document with two pages
    src_file_path = os.path.join(
        BASE_DIR, "data", "berlin.pdf"
    )
    imp = DocumentImporter(src_file_path)
    imported = imp.import_file(
        file_title="berlin.pdf",
        delete_after_import=False,
        skip_ocr=True,
    )
    # Assert on the import result directly instead of the
    # ``if not ...: assertTrue(False, ...)`` construct.
    self.assertTrue(imported, "Error while importing file")

    doc = Document.objects.get(title="berlin.pdf")
    self.assertEqual(
        doc.page_count,
        2
    )
    # initial version of any document is 0
    self.assertEqual(
        doc.version,
        0
    )

    doc.delete_pages(
        page_numbers=[1],
        skip_migration=True
    )

    self.assertEqual(
        doc.page_count,
        1
    )
    self.assertEqual(
        doc.pages.count(),
        1
    )
    # version should have been incremented
    self.assertEqual(
        doc.version,
        1
    )