def createBookPDF(bookPath):
    '''
    Build the pdf of an entire book and ingest it into fedora as the 'PDF'
    datastream.

    Every page pdf listed in pagesDict is appended, in page number order, to
    <bookPath>/<basename of bookPath>.pdf. That file is then set as the
    content of the 'PDF' datastream of the object identified by bookPid.

    NOTE(review): pagesDict and bookPid are not parameters of this function;
    they are read from the enclosing (presumably module level) scope - confirm
    they are defined before this is called. pagesDict must be indexable by the
    page numbers 1..len(pagesDict), each value being the path of a page pdf.

    @param bookPath: the book directory; the combined pdf is written inside it
    @return bool: True if the datastream was added, False if a
                  FedoraConnectionException was raised while adding it
    '''
    # the combined pdf lives inside the book directory and is named after it
    pdfPath = os.path.join(bookPath, os.path.basename(bookPath) + '.pdf')

    # append the pages in page number order (page numbers start at 1)
    for pageNum in range(1, len(pagesDict) + 1):
        fileManipulator.appendPDFwithPDF(pdfPath, pagesDict[pageNum])

    # create and add pdf datastream
    obj = fedora.getObject(bookPid)
    # throwaway content used only to create the datastream; the real pdf is
    # uploaded right after through setContent
    placeholder = 'smelly'
    # 'with' guarantees the file handle is closed on success and on failure
    with open(pdfPath, 'rb') as bookFile:
        try:
            obj.addDataStream(u'PDF', placeholder, label=u'PDF',
                              mimeType=u'application/pdf', controlGroup=u'M',
                              logMessage=u'Added pdf with OCR.')
            logging.info('Added PDF datastream to:' + bookPid)
            ds = obj['PDF']
            ds.setContent(bookFile)
        except FedoraConnectionException:
            logging.exception('Error in adding PDF datastream to:' + bookPid + '\n')
            return False
    return True
def createBookPDF(bookPath):
    '''
    Build the pdf of an entire book and ingest it into fedora as the 'PDF'
    datastream.

    Every page pdf listed in pagesDict is appended, in page number order, to
    <bookPath>/<basename of bookPath>.pdf. That file is then set as the
    content of the 'PDF' datastream of the object identified by bookPid.

    NOTE(review): pagesDict and bookPid are not parameters of this function;
    they are read from the enclosing (presumably module level) scope - confirm
    they are defined before this is called. pagesDict must be indexable by the
    page numbers 1..len(pagesDict), each value being the path of a page pdf.

    @param bookPath: the book directory; the combined pdf is written inside it
    @return bool: True if the datastream was added, False if a
                  FedoraConnectionException was raised while adding it
    '''
    # the combined pdf lives inside the book directory and is named after it
    pdfPath = os.path.join(bookPath, os.path.basename(bookPath) + '.pdf')

    # append the pages in page number order (page numbers start at 1)
    for pageNum in range(1, len(pagesDict) + 1):
        fileManipulator.appendPDFwithPDF(pdfPath, pagesDict[pageNum])

    # create and add pdf datastream
    obj = fedora.getObject(bookPid)
    # throwaway content used only to create the datastream; the real pdf is
    # uploaded right after through setContent
    placeholder = 'smelly'
    # 'with' guarantees the file handle is closed on success and on failure
    with open(pdfPath, 'rb') as bookFile:
        try:
            obj.addDataStream(u'PDF', placeholder, label=u'PDF',
                              mimeType=u'application/pdf', controlGroup=u'M',
                              logMessage=u'Added pdf with OCR.')
            logging.info('Added PDF datastream to:' + bookPid)
            ds = obj['PDF']
            ds.setContent(bookFile)
        except FedoraConnectionException:
            logging.exception('Error in adding PDF datastream to:' + bookPid + '\n')
            return False
    return True
os.remove(jp2File) # finished with that # create DC, MODS, VRA datastreams for dsid in ['DC', 'MODS', 'VRA']: dsfile = os.path.join(bookFolder, "%s.%s.xml" % (os.path.splitext(page)[0], dsid.lower())) dspage = os.path.basename(dsfile) fedoraLib.update_datastream(obj, unicode(dsid), dsfile, label=unicode(dspage), mimeType=misc.getMimeType("xml"), controlGroup='X') pdfFile = os.path.join(config.tempDir, "%s.pdf" % basePage) converter.tif_to_pdf(tifFile, pdfFile, 'default') #fedoraLib.update_datastream(obj, u'PDF', pdfFile, label=unicode("%s.pdf" % basePage), mimeType=misc.getMimeType("pdf")) # for the first page, move it to the full when finished with it if idx == 0: os.rename(pdfFile, fullPDF) # for every other page (>1), append it to fullPDF and delete the original else: manipulator.appendPDFwithPDF(fullPDF, pdfFile) os.remove(pdfFile) sys.stdout.flush() sys.stderr.flush() # ingest the full PDF on the master book object # and delete it if not config.dryrun: print("Ingesting full PDF document") fedoraLib.update_datastream(bookObj, u"PDF", fullPDF, label=os.path.basename(fullPDF), mimeType=misc.getMimeType("pdf")) os.remove(fullPDF) return True