# collection policy fedoraLib.update_datastream(collection_object, u"COLLECTION_POLICY", "data/collection_policy.xml", label=u'COLLECTION_POLICY', mimeType=u'text/xml', controlGroup=u'X') # thumbnail, if one is supplied if tnUrl: tnExt = os.path.splitext(tnUrl)[1] fedoraLib.update_datastream(collection_object, u'TN', tnUrl, label=u"%s_TN%s" % (myLabel, tnExt), mimeType=misc.getMimeType(tnExt)) # rels-ext relations collection_relsext = createRelsExt(collection_object, parentPid, contentModel) return collection_object def addObjectToFedora(fedora, myLabel, myPid, parentPid, contentModel, tnUrl=None, extraNamespaces={},
return collection_object except FedoraConnectionException, fcx: if not fcx.httpcode in [404]: raise fcx # if it is a 404, then we're ok - just make the object and continue collection_object = fedora.createObject(myPid, label=myLabel) # this is the biggest difference between objects and collections - a collection policy # collection policy fedoraLib.update_datastream(collection_object, u"COLLECTION_POLICY", "data/collection_policy.xml", label=u'COLLECTION_POLICY', mimeType=u'text/xml', controlGroup=u'X') # thumbnail, if one is supplied if tnUrl: tnExt = os.path.splitext(tnUrl)[1] fedoraLib.update_datastream(collection_object, u'TN', tnUrl, label=u"%s_TN%s" % (myLabel, tnExt), mimeType=misc.getMimeType(tnExt)) # rels-ext relations collection_relsext = createRelsExt(collection_object, parentPid, contentModel) return collection_object def addObjectToFedora(fedora, myLabel, myPid, parentPid, contentModel, tnUrl=None, state=u'A', extraNamespaces={}, extraRelationships={}): """ Add an object (not a collection) to fedora @param fedora The fedora instance to add the object to @param myLabel The label to apply to the object object @param myPid The pid of the object to try and create, if the pid is already a valid object/collection, then return that object instead @parentPid The parent object to nest this one under @contentModel The content model to attach to this object @tnUrl [optional] The url of an image to use as the thumbnail
def addBookPageToFedora(inputTiff, tmpDir):
    '''
    Create a Fedora book-page object for one archival tiff and attach its
    datastreams (TIFF, JP2, PDF, OCR, EXIF, plus any extra sibling files that
    share the tiff's base name).

    Relies on module-level names defined elsewhere in this file: fedora (the
    repository client), bookPid (the book the page is made a member of),
    pagesDict (page number -> per-page pdf path, filled in here), converter,
    misc and fedora_relationships.

    @param inputTiff: path of the archival tiff; the file name must start with
        an integer followed by '_' and end in '.tif' or '.tiff'
    @param tmpDir: directory where the derived (non-archival) files are written
    @return bool: True on success, False when the file name is not usable
    @raise ValueError: if the file name has no '_' or a non-integer prefix
    @raise IOError: if the tiff or one of the derived files cannot be opened
    '''
    # only item assignment is done on pagesDict; the declaration is kept to
    # make the module-level side effect explicit
    global pagesDict

    fullTiffDir = os.path.dirname(inputTiff)
    tifDir = os.path.basename(fullTiffDir)
    tiffName = os.path.basename(inputTiff)

    # Reject unsupported extensions before any conversion or repository work.
    # Previously this case only surfaced as a NameError after the Fedora
    # object had already been created.
    if tiffName.endswith('.tiff'):
        tifExt = '.tiff'
    elif tiffName.endswith('.tif'):
        tifExt = '.tif'
    else:
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' (expected a .tif or .tiff extension)\n')
        return False
    tiffNameNoExt = tiffName[:-len(tifExt)]

    #run conversions
    converter.tif_to_jp2(inputTiff, tmpDir, 'default', 'default')
    converter.tif_OCR(inputTiff, tmpDir, {'PDF': 'default', 'Text': 'default'})

    #determine page number: used for naming
    pageNumber = int(tiffName[0:tiffName.index('_')])
    if tiffName.count('front_cover') == 1:
        pageNumber = 1
    elif tiffName.count('inner_cover') == 1:
        pageNumber = 2
    elif tiffName.count('inner_leaf') == 1:
        pageNumber = 3
    elif tiffName.count('back_cover') == 1:
        #the back cover is numbered by the count of tiffs in the directory
        pageNumber = sum(1 for entry in os.listdir(fullTiffDir)
                         if entry.endswith(('.tif', '.tiff')))
    elif tiffName.count('a') == 1:
        #standard a [left side]
        pageNumber = 4 if pageNumber == 1 else pageNumber * 2 + 2
    elif tiffName.count('b') == 1:
        #standard b [right side]
        pageNumber = 5 if pageNumber == 1 else pageNumber * 2 + 3
    else:
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' giving fileNumber: ' + str(pageNumber) + '\n')
        return False
    logging.info('Working on ingest of page: ' + str(pageNumber) +
                 ' with source file: ' + inputTiff)

    #create the fedora book page object
    pagePid = fedora.getNextPID(u'uofm')
    myLabel = unicode(tifDir + '_Page' + str(pageNumber))
    obj = fedora.createObject(pagePid, label=myLabel)

    baseInUrl = os.path.join(fullTiffDir, tiffNameNoExt)
    baseOutUrl = os.path.join(tmpDir, tiffNameNoExt)
    exifPath = baseOutUrl + '.xml'

    # placeholder content: each datastream is created with it and then
    # overwritten from the real file via setContent
    garbage = u'smelly'

    # All five sources are opened before any datastream is added, so a missing
    # derivative still fails before the object receives partial content.
    # Binary mode keeps the image/pdf bytes intact on every platform.
    openHandles = []
    try:
        for sourcePath in (baseInUrl + tifExt, baseOutUrl + '.jp2',
                           baseOutUrl + '.pdf', baseOutUrl + '.txt'):
            openHandles.append(open(sourcePath, 'rb'))
        #this gets the metadata for the page from the tif
        converter.exif_to_xml(inputTiff, exifPath)
        openHandles.append(open(exifPath, 'rb'))
        tiffUrl, jp2Url, pdfUrl, ocrUrl, exifUrl = openHandles

        #this is used for creating the book pdf later
        pagesDict[pageNumber] = baseOutUrl + '.pdf'

        # (datastream id, content handle, mime type, log message, error prefix)
        fixedDatastreams = (
            (u'TIFF', tiffUrl, u'image/tiff',
             u'Added the archival tiff file.',
             'Error in adding TIFF datastream to:'),
            (u'JP2', jp2Url, u'image/jp2',
             u'Added jp2 image file.',
             'Error in adding JP2 datastream to:'),
            (u'PDF', pdfUrl, u'application/pdf',
             u'Added pdf with OCR.',
             'Error in adding PDF datastream to:'),
            (u'OCR', ocrUrl, u'text/plain',
             u'Added basic text of OCR.',
             'Error in adding OCR Datastream to:'),
            (u'EXIF', exifUrl, u'text/xml',
             u'Added the archival EXIF file.',
             'Error in adding EXIF datastream to:'),
        )
        for dsId, contentHandle, dsMimeType, dsLogMessage, errorPrefix in fixedDatastreams:
            # a repository failure on one datastream is logged and the
            # remaining datastreams are still attempted
            try:
                obj.addDataStream(dsId, garbage, label=dsId,
                                  mimeType=dsMimeType, controlGroup=u'M',
                                  logMessage=dsLogMessage)
                logging.info('Added ' + dsId + ' datastream to:' + pagePid)
                obj[dsId].setContent(contentHandle)
            except FedoraConnectionException:
                logging.exception(errorPrefix + pagePid + '\n')
    finally:
        # NOTE(review): assumes setContent has consumed the handle by the time
        # it returns - confirm against the fedora client library
        for openHandle in openHandles:
            openHandle.close()

    #rels-ext: book membership, page number and content model
    objRelsExt = fedora_relationships.rels_ext(obj, [
        fedora_relationships.rels_namespace(
            'pageNS', 'info:islandora/islandora-system:def/pageinfo#'),
        fedora_relationships.rels_namespace(
            'fedora-model', 'info:fedora/fedora-system:def/model#')
    ])
    objRelsExt.addRelationship('isMemberOf', bookPid)
    objRelsExt.addRelationship(
        fedora_relationships.rels_predicate('pageNS', 'isPageNumber'),
        fedora_relationships.rels_object(
            str(pageNumber), fedora_relationships.rels_object.LITERAL))
    objRelsExt.addRelationship(
        fedora_relationships.rels_predicate('fedora-model', 'hasModel'),
        'archiveorg:pageCModel')
    objRelsExt.update()

    #Dynamic Datastreams
    #every sibling file whose name up to the first '.' equals the tiff's base
    #name and whose extension is not one already handled above
    reservedExts = ('tif', 'tiff', 'pdf', 'jp2', 'txt', 'xml')
    for dynamicDSFile in os.listdir(fullTiffDir):
        fileStem, _dot, dynamicDSFileEXT = dynamicDSFile.partition('.')
        # names without an extension are skipped: they have nothing usable as
        # a datastream id
        if (not dynamicDSFileEXT or fileStem != tiffNameNoExt
                or dynamicDSFileEXT in reservedExts):
            continue
        dynamicDSPath = os.path.join(fullTiffDir, dynamicDSFile)
        if not os.path.isfile(dynamicDSPath):
            continue
        dynamicDSFileMimeType = misc.getMimeType(dynamicDSFileEXT)
        dynamicDSFileHandle = open(dynamicDSPath, 'rb')
        try:
            obj.addDataStream(unicode(dynamicDSFileEXT), garbage,
                              label=unicode(dynamicDSFileEXT),
                              mimeType=unicode(dynamicDSFileMimeType),
                              controlGroup=u'M',
                              logMessage=unicode('Added the datastream:' +
                                                 dynamicDSFileEXT))
            logging.info('Added the datastream: ' + dynamicDSFileEXT +
                         ' to: ' + pagePid)
            obj[dynamicDSFileEXT].setContent(dynamicDSFileHandle)
        except FedoraConnectionException:
            logging.exception('Error in adding' + dynamicDSFileEXT +
                              'datastream to:' + pagePid + '\n')
        finally:
            dynamicDSFileHandle.close()
    return True
def addBookPageToFedora(inputTiff, tmpDir):
    '''
    Create a Fedora book-page object for one archival tiff and attach its
    datastreams (TIFF, JP2, PDF, OCR, EXIF, plus any extra sibling files that
    share the tiff's base name).

    Relies on module-level names defined elsewhere in this file: fedora (the
    repository client), bookPid (the book the page is made a member of),
    pagesDict (page number -> per-page pdf path, filled in here), converter,
    misc and fedora_relationships.

    @param inputTiff: path of the archival tiff; the file name must start with
        an integer followed by '_' and end in '.tif' or '.tiff'
    @param tmpDir: directory where the derived (non-archival) files are written
    @return bool: True on success, False when the file name is not usable
    @raise ValueError: if the file name has no '_' or a non-integer prefix
    @raise IOError: if the tiff or one of the derived files cannot be opened
    '''
    # only item assignment is done on pagesDict; the declaration is kept to
    # make the module-level side effect explicit
    global pagesDict

    fullTiffDir = os.path.dirname(inputTiff)
    tifDir = os.path.basename(fullTiffDir)
    tiffName = os.path.basename(inputTiff)

    # Reject unsupported extensions before any conversion or repository work.
    # Previously this case only surfaced as a NameError after the Fedora
    # object had already been created.
    if tiffName.endswith('.tiff'):
        tifExt = '.tiff'
    elif tiffName.endswith('.tif'):
        tifExt = '.tif'
    else:
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' (expected a .tif or .tiff extension)\n')
        return False
    tiffNameNoExt = tiffName[:-len(tifExt)]

    #run conversions
    converter.tif_to_jp2(inputTiff, tmpDir, 'default', 'default')
    converter.tif_OCR(inputTiff, tmpDir, {'PDF': 'default', 'Text': 'default'})

    #determine page number: used for naming
    pageNumber = int(tiffName[0:tiffName.index('_')])
    if tiffName.count('front_cover') == 1:
        pageNumber = 1
    elif tiffName.count('inner_cover') == 1:
        pageNumber = 2
    elif tiffName.count('inner_leaf') == 1:
        pageNumber = 3
    elif tiffName.count('back_cover') == 1:
        #the back cover is numbered by the count of tiffs in the directory
        pageNumber = sum(1 for entry in os.listdir(fullTiffDir)
                         if entry.endswith(('.tif', '.tiff')))
    elif tiffName.count('a') == 1:
        #standard a [left side]
        pageNumber = 4 if pageNumber == 1 else pageNumber * 2 + 2
    elif tiffName.count('b') == 1:
        #standard b [right side]
        pageNumber = 5 if pageNumber == 1 else pageNumber * 2 + 3
    else:
        logging.error('Bad tiff file name: ' + inputTiff +
                      ' giving fileNumber: ' + str(pageNumber) + '\n')
        return False
    logging.info('Working on ingest of page: ' + str(pageNumber) +
                 ' with source file: ' + inputTiff)

    #create the fedora book page object
    pagePid = fedora.getNextPID(u'uofm')
    myLabel = unicode(tifDir + '_Page' + str(pageNumber))
    obj = fedora.createObject(pagePid, label=myLabel)

    baseInUrl = os.path.join(fullTiffDir, tiffNameNoExt)
    baseOutUrl = os.path.join(tmpDir, tiffNameNoExt)
    exifPath = baseOutUrl + '.xml'

    # placeholder content: each datastream is created with it and then
    # overwritten from the real file via setContent
    garbage = u'smelly'

    # All five sources are opened before any datastream is added, so a missing
    # derivative still fails before the object receives partial content.
    # Binary mode keeps the image/pdf bytes intact on every platform.
    openHandles = []
    try:
        for sourcePath in (baseInUrl + tifExt, baseOutUrl + '.jp2',
                           baseOutUrl + '.pdf', baseOutUrl + '.txt'):
            openHandles.append(open(sourcePath, 'rb'))
        #this gets the metadata for the page from the tif
        converter.exif_to_xml(inputTiff, exifPath)
        openHandles.append(open(exifPath, 'rb'))
        tiffUrl, jp2Url, pdfUrl, ocrUrl, exifUrl = openHandles

        #this is used for creating the book pdf later
        pagesDict[pageNumber] = baseOutUrl + '.pdf'

        # (datastream id, content handle, mime type, log message, error prefix)
        fixedDatastreams = (
            (u'TIFF', tiffUrl, u'image/tiff',
             u'Added the archival tiff file.',
             'Error in adding TIFF datastream to:'),
            (u'JP2', jp2Url, u'image/jp2',
             u'Added jp2 image file.',
             'Error in adding JP2 datastream to:'),
            (u'PDF', pdfUrl, u'application/pdf',
             u'Added pdf with OCR.',
             'Error in adding PDF datastream to:'),
            (u'OCR', ocrUrl, u'text/plain',
             u'Added basic text of OCR.',
             'Error in adding OCR Datastream to:'),
            (u'EXIF', exifUrl, u'text/xml',
             u'Added the archival EXIF file.',
             'Error in adding EXIF datastream to:'),
        )
        for dsId, contentHandle, dsMimeType, dsLogMessage, errorPrefix in fixedDatastreams:
            # a repository failure on one datastream is logged and the
            # remaining datastreams are still attempted
            try:
                obj.addDataStream(dsId, garbage, label=dsId,
                                  mimeType=dsMimeType, controlGroup=u'M',
                                  logMessage=dsLogMessage)
                logging.info('Added ' + dsId + ' datastream to:' + pagePid)
                obj[dsId].setContent(contentHandle)
            except FedoraConnectionException:
                logging.exception(errorPrefix + pagePid + '\n')
    finally:
        # NOTE(review): assumes setContent has consumed the handle by the time
        # it returns - confirm against the fedora client library
        for openHandle in openHandles:
            openHandle.close()

    #rels-ext: book membership, page number and content model
    objRelsExt = fedora_relationships.rels_ext(obj, [
        fedora_relationships.rels_namespace(
            'pageNS', 'info:islandora/islandora-system:def/pageinfo#'),
        fedora_relationships.rels_namespace(
            'fedora-model', 'info:fedora/fedora-system:def/model#')
    ])
    objRelsExt.addRelationship('isMemberOf', bookPid)
    objRelsExt.addRelationship(
        fedora_relationships.rels_predicate('pageNS', 'isPageNumber'),
        fedora_relationships.rels_object(
            str(pageNumber), fedora_relationships.rels_object.LITERAL))
    objRelsExt.addRelationship(
        fedora_relationships.rels_predicate('fedora-model', 'hasModel'),
        'archiveorg:pageCModel')
    objRelsExt.update()

    #Dynamic Datastreams
    #every sibling file whose name up to the first '.' equals the tiff's base
    #name and whose extension is not one already handled above
    reservedExts = ('tif', 'tiff', 'pdf', 'jp2', 'txt', 'xml')
    for dynamicDSFile in os.listdir(fullTiffDir):
        fileStem, _dot, dynamicDSFileEXT = dynamicDSFile.partition('.')
        # names without an extension are skipped: they have nothing usable as
        # a datastream id
        if (not dynamicDSFileEXT or fileStem != tiffNameNoExt
                or dynamicDSFileEXT in reservedExts):
            continue
        dynamicDSPath = os.path.join(fullTiffDir, dynamicDSFile)
        if not os.path.isfile(dynamicDSPath):
            continue
        dynamicDSFileMimeType = misc.getMimeType(dynamicDSFileEXT)
        dynamicDSFileHandle = open(dynamicDSPath, 'rb')
        try:
            obj.addDataStream(unicode(dynamicDSFileEXT), garbage,
                              label=unicode(dynamicDSFileEXT),
                              mimeType=unicode(dynamicDSFileMimeType),
                              controlGroup=u'M',
                              logMessage=unicode('Added the datastream:' +
                                                 dynamicDSFileEXT))
            logging.info('Added the datastream: ' + dynamicDSFileEXT +
                         ' to: ' + pagePid)
            obj[dynamicDSFileEXT].setContent(dynamicDSFileHandle)
        except FedoraConnectionException:
            logging.exception('Error in adding' + dynamicDSFileEXT +
                              'datastream to:' + pagePid + '\n')
        finally:
            dynamicDSFileHandle.close()
    return True