Exemplo n.º 1
0
    def __init__(self, solr_server, collection):
        """
        Set up the client: store the collection URL and create a format identifier.

        The collection URL is the server base URL, extended by a slash when it does
        not already end with one, followed by the collection identifier.

        @type       solr_server: SolrServer
        @param      solr_server: Solr server; its get_base_url() result is the URL prefix

        @type       collection: string
        @param      collection: Collection identifier, e.g. "samplecollection"
        """
        server_url = solr_server.get_base_url()
        # Only add a separator when the base URL does not already end with one.
        separator = '' if server_url[-1] == '/' else '/'
        self.url = server_url + separator + collection
        self.ffid = FormatIdentification()
Exemplo n.º 2
0
class SIPGenerator(object):
    fid = FormatIdentification()
    mime = MimeTypes()
    root_path = ""

    def __init__(self, root_path):
        print "Working in rootdir %s" % root_path
        self.root_path = root_path

    def sha256(self, fname):
        """
        Compute the SHA-256 checksum of a file.

        The file is read in 4096-byte chunks, so it is never loaded into memory as a whole.

        @type       fname: string
        @param      fname: Path of the file to hash

        @rtype:     string
        @return:    Hexadecimal SHA-256 digest of the file content
        """
        digest = hashlib.sha256()
        # Binary mode is required for a checksum: text mode may translate line
        # endings (platform dependent) and yields text instead of bytes on Python 3.
        with open(fname, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def createAgent(self, role, type, other_type, name, note):
        """
        Build an agent element with a name and a note child via the M element builder.

        @param      role: Value of the ROLE attribute
        @param      type: Value of the TYPE attribute
        @param      other_type: Value of the OTHERTYPE attribute; the attribute is left out when this is falsy
        @param      name: Content passed to M.name
        @param      note: Content passed to M.note

        @return:    The element returned by M.agent
        """
        attributes = {"ROLE": role, "TYPE": type}
        # OTHERTYPE is optional: only emit it when a value was supplied.
        if other_type:
            attributes["OTHERTYPE"] = other_type
        return M.agent(attributes, M.name(name), M.note(note))

    def runCommand(self, program, stdin=PIPE, stdout=PIPE, stderr=PIPE):
        """
        Run an external program, wait for it to terminate and return its output.

        @type       program: list
        @param      program: Executable followed by its arguments; handed to Popen with shell=False

        @param      stdin: stdin argument forwarded to Popen
        @param      stdout: stdout argument forwarded to Popen
        @param      stderr: stderr argument forwarded to Popen

        @rtype:     tuple
        @return:    (return code, stdout data, stderr data); the return code is always 0
                    because every other outcome raises

        @raise      Exception: if launching the program fails or it exits with a non-zero return code
        """
        result, res_stdout, res_stderr = None, None, None
        try:
            # The argument list is passed to Popen as is and no shell is involved
            # (shell=False), so arguments containing spaces need no quoting.
            print('Launching: ' + ' '.join(program))
            process = Popen(program,
                            stdin=stdin,
                            stdout=stdout,
                            stderr=stderr,
                            shell=False)

            res_stdout, res_stderr = process.communicate()
            result = process.returncode
            print('Finished: ' + ' '.join(program))

        except Exception as ex:
            # Launch problems (e.g. missing executable) are reported like a failed command.
            res_stderr = str(ex.args)
            result = 1

        if result != 0:
            # communicate() returns None for stderr when it was not captured and
            # bytes on Python 3; neither can be joined onto the message directly.
            if res_stderr is None:
                details = ''
            elif isinstance(res_stderr, str):
                details = res_stderr
            else:
                details = res_stderr.decode('utf-8', 'replace')
            message = 'Command failed:' + details
            print(message)
            raise Exception(message)

        return result, res_stdout, res_stderr

    def addFile(self, file_name, mets_filegroup):
        """
        Describe one file as a file element and append it to the given file group.

        The element carries the guessed MIME type, the SHA-256 checksum, the creation
        date string, the size and a freshly generated ID; an FLocat child holds the
        file URL, which is the path of the file relative to self.root_path.

        @type       file_name: string
        @param      file_name: Path of the file to register

        @param      mets_filegroup: Element the new file element is appended to

        @rtype:     string
        @return:    The generated file ID ("ID" followed by a random UUID)
        """
        href = "file://./%s" % os.path.relpath(file_name, self.root_path)
        mimetype = self.mime.guess_type(href)[0]
        checksum = self.sha256(file_name)
        size = os.path.getsize(file_name)
        created = get_file_ctime_iso_date_str(file_name, DT_ISO_FMT_SEC_PREC)
        file_id = "ID%s" % uuid.uuid4()

        file_element = M.file({
            "MIMETYPE": mimetype,
            "CHECKSUMTYPE": "SHA-256",
            "CREATED": created,
            "CHECKSUM": checksum,
            "USE": "Datafile",
            "ID": file_id,
            "SIZE": size
        })
        mets_filegroup.append(file_element)

        # The locator is attached after the file element has joined the group.
        locator = M.FLocat({
            q(XLINK_NS, 'href'): href,
            "LOCTYPE": "URL",
            q(XLINK_NS, 'type'): 'simple'
        })
        file_element.append(locator)
        return file_id

    def addFiles(self, folder, mets_filegroup):
        """
        Register every file found below a folder via addFile.

        @type       folder: string
        @param      folder: Directory that is walked recursively with os.walk

        @param      mets_filegroup: Element passed on to addFile for each file

        @rtype:     list
        @return:    The file IDs returned by addFile, in the order the files were visited
        """
        return [
            self.addFile(os.path.join(current_dir, entry), mets_filegroup)
            for current_dir, _subdirs, entries in os.walk(folder)
            for entry in entries
        ]

    # def createPremis(self, enable_jhove = False):
    #     jhove_parser = None
    #     if enable_jhove == True:
    #         jhove_parser = etree.XMLParser(remove_blank_text=True)
    #
    #     PREMIS_ATTRIBUTES = {"version" : "2.0"}
    #     premis = P.premis(PREMIS_ATTRIBUTES)
    #     premis.attrib['{%s}schemaLocation' % XSI_NS] = "info:lc/xmlns/premis-v2 ../../schemas/premis-v2-2.xsd"
    #
    #     premis_ids = []
    #     for top, dirs, files in os.walk(os.path.join(self.root_path, 'data')):
    #         for nm in files:
    #             file_name = os.path.join(top,nm)
    #             hash = self.sha256(file_name)
    #             file_url = "file://./%s" % os.path.relpath(file_name, self.root_path)
    #             fmt = self.fid.identify_file(file_name)#os.path.abspath(remove_protocol(file_url)))
    #             jhove = None
    #             if enable_jhove == True:
    #                 try:
    #                     result = self.runCommand(["/usr/bin/jhove", "-h", "xml", file_name] )
    #                     if result[0] == 0:
    #                         jhove = etree.XML(result[1], parser=jhove_parser)
    #                 except Exception:
    #                     #TODO: handle exception
    #                     pass
    #
    #             size = os.path.getsize(file_name)
    #             premis_id = uuid.uuid4()
    #             premis_ids.append(premis_id)
    #             premis.append(
    #                 P.object(
    #                     {q(XSI_NS, 'type'): 'file', "xmlID":premis_id},
    #                     P.objectIdentifier(
    #                         P.objectIdentifierType('LOCAL'),
    #                         P.objectIdentifierValue(premis_id)
    #                     ),
    #                     P.objectIdentifier(
    #                         P.objectIdentifierType('FILEPATH'),
    #                         P.objectIdentifierValue(file_url)
    #                     ),
    #                     P.objectCharacteristics(
    #                         P.compositionLevel(0),
    #                         P.size(size),
    #                         P.fixity(
    #                             P.messageDigestAlgorithm("SHA-256"),
    #                             P.messageDigest(hash),
    #                             P.messageDigestOriginator("hashlib")
    #                         ),
    #                         P.format(
    #                             P.formatRegistry(
    #                                 P.formatRegistryName("PRONOM"),
    #                                 P.formatRegistryKey(fmt),
    #                                 P.formatRegistryRole("identification")
    #                             )
    #                         ),
    #                         #P.objectCharacteristicsExtension(
    #                             #TODO:// generate id or reference from somewhere
    #                         #    P.mdSec({"ID":"ID426087e8-0f79-11e3-847a-34e6d700c47b"},
    #                         #        P.mdWrap({"MDTYPE":"OTHER", "OTHERMDTYPE":"JHOVE"},
    #                         #            P.xmlData(
    #                         #                jhove
    #                         #                 )
    #                         #                 )
    #                         #)
    #                     ),
    #                 )
    #             )
    #
    #     identifier_value = 'earkweb'
    #     premis.append(P.agent(
    #             P.agentIdentifier(
    #                 P.agentIdentifierType('LOCAL'),
    #                 P.agentIdentifierValue(identifier_value)
    #             ),
    #             P.agentName('E-ARK AIP to DIP Converter'),
    #             P.agentType('Software')))
    #
    #     identifier_value = 'AIP Creation'
    #     linking_agent = 'earkweb'
    #     linking_object=None
    #     premis.append(P.event(
    #             P.eventIdentifier(
    #                 P.eventIdentifierType('LOCAL'),
    #                 P.eventIdentifierValue(identifier_value)
    #             ),
    #             P.eventType,
    #             P.eventDateTime(current_timestamp()),
    #             P.linkingAgentIdentifier(
    #                 P.linkingAgentIdentifierType('LOCAL'),
    #                 P.linkingAgentIdentifierValue(linking_agent)
    #             ),
    #
    #             P.linkingAgentIdentifier(
    #                 P.linkingAgentIdentifierType('LOCAL'),
    #                 P.linkingAgentIdentifierValue(linking_object)
    #             )
    #             if linking_object is not None else None
    #         ))
    #
    #     str = etree.tostring(premis, encoding='UTF-8', pretty_print=True, xml_declaration=True)
    #     preservation_dir = os.path.join(self.root_path, './metadata/preservation')
    #     if not os.path.exists(preservation_dir):
    #         os.mkdir(preservation_dir)
    #     path_premis = os.path.join(self.root_path, './metadata/preservation/premis.xml')
    #     with open(path_premis, 'w') as output_file:
    #         output_file.write(str)
    #
    #     return premis_ids

    # def createSIPParentMets(self, mets_files):
    #     #create METS skeleton
    #     METS_ATTRIBUTES = {"OBJID" : "", "TYPE" : "", "LABEL" : "", "PROFILE" : "http://www.ra.ee/METS/v01/IP.xml", "ID" : "" }
    #     root = M.mets(METS_ATTRIBUTES)
    #     root.attrib['{%s}schemaLocation' % XSI_NS] = "http://www.loc.gov/METS/ schemas/IP.xsd ExtensionMETS schemas/ExtensionMETS.xsd http://www.w3.org/1999/xlink schemas/xlink.xsd"
    #
    #     mets_hdr = M.metsHdr({"CREATEDATE": current_timestamp(), q(METSEXT_NS,"OAISSTATUS") :"", "RECORDSTATUS" :""})
    #     root.append(mets_hdr)
    #
    #
    #     mets_hdr.append(self.createAgent("ARCHIVIST", "ORGANIZATION", "" ,"Institution", "Note"))
    #     mets_hdr.append(self.createAgent("ARCHIVIST", "OTHER", "" ,"Institution", "Note"))
    #     mets_hdr.append(self.createAgent("CREATOR", "ORGANIZATION", "", "Institution", "Note"))
    #     mets_hdr.append(self.createAgent("CREATOR", "OTHER", "SOFTWARE", "E-ARK SIP Creator", "VERSION=0.0.1"))
    #     mets_hdr.append(self.createAgent("PRESERVATION", "ORGANIZATION", "", "Institution", "Note"))
    #     mets_hdr.append(M.metsDocumentID("METS.xml"))
    #
    #     mets_dmd = M.dmdSec({"ID":""})
    #     root.append(mets_dmd)
    #
    #     mets_amdSec = M.amdSec({"ID":"ID" + uuid.uuid1().__str__()})
    #     root.append(mets_amdSec)
    #
    #     mets_techmd = M.techMD({"ID":"ID" + uuid.uuid1().__str__()})
    #     mets_amdSec.append(mets_techmd)
    #     #for id in premis_ids:
    #     #    mets_mdref = M.mdRef({"LOCTYPE":"URL", "MDTYPE":"PREMIS:OBJECT", q(XLINK_NS,"href"):"file://./metadata/preservation/PREMIS.xml#"+id.__str__()})
    #     #    mets_techmd.append(mets_mdref)
    #
    #     mets_fileSec = M.fileSec()
    #     root.append(mets_fileSec)
    #
    #     mets_filegroup = M.fileGrp({"ID": "ID" + uuid.uuid1().__str__()})
    #     mets_fileSec.append(mets_filegroup)
    #
    #     #content_ids = self.addFiles(os.path.join(self.root_path, 'data'), mets_filegroup)
    #     #metadata_ids = self.addFiles(os.path.join(self.root_path, 'metadata'), mets_filegroup)
    #
    #     #mets_ids = []
    #     #for file in mets_files:
    #     #    print file
    #     #    mets_id = self.addFile(file, mets_filegroup)
    #     #    mets_ids.append(mets_id)
    #
    #
    #     # checking for cross-representation metadata:
    #     #package_root = self.root_path.rsplit('/', 2)
    #     #package_metadata = os.path.join(package_root[0], 'metadata')
    #     #ext_metadata_ids = self.addFiles(package_metadata, mets_filegroup)
    #
    #     mets_structmap = M.structMap({"ID": "", "TYPE":"", "LABEL":"Simple grouping"})
    #     root.append(mets_structmap)
    #
    #     mets_structmap_div = M.div({"ADMID":"", "LABEL":"Package", "DMDID" : ""})
    #     mets_structmap.append(mets_structmap_div)
    #
    #     mets_structmap_mets_div = M.div({"LABEL":"Representation Metadata"})
    #     mets_structmap_div.append(mets_structmap_mets_div)
    #
    #     for file in mets_files:
    #         file_url = "file://./%s" % os.path.relpath(file, self.root_path)
    #         mptr = M.mptr({"LOCTYPE": "URL", q(XLINK_NS, 'href'): file_url })
    #         mets_structmap_mets_div.append(mptr)
    #
    #     #for id in mets_ids:
    #     #    mptr = M.mptr({"FILEID": id})
    #     #    mets_structmap_mets_div.append(mptr)
    #     #    #fptr = M.fptr({"FILEID": id})
    #     #    #mets_structmap_mets_div.append(fptr)
    #
    #     #mets_structmap_content_div = M.div({"LABEL":"Content"})
    #     #mets_structmap_div.append(mets_structmap_content_div)
    #     #for id in content_ids:
    #     #    fptr = M.fptr({"FILEID": id})
    #     #    mets_structmap_content_div.append(fptr)
    #
    #     #mets_structmap_metadata_div = M.div({"LABEL":"Metadata"})
    #     #mets_structmap_div.append(mets_structmap_metadata_div)
    #     #for id in metadata_ids:
    #     #    fptr = M.fptr({"FILEID": id})
    #     #    mets_structmap_metadata_div.append(fptr)
    #     #for id in ext_metadata_ids:
    #     #    fptr = M.fptr({"FILEID": id})
    #     #    mets_structmap_metadata_div.append(fptr)
    #
    #     #my_mets.fileSec.append(M.fileGrp({'USE': 'submission'}))
    #     #my_mets.fileSec(M.fileGrp({'USE': 'submission'}))
    #     #mets_schema_file = os.path.join(root_dir, "sandbox/sipgenerator/resources/ENA_RK_TartuLV_141127/schemas/IP.xsd")
    #     #mets_schema = etree.parse(mets_schema_file)
    #     #mets_xsd = etree.XMLSchema(mets_schema)
    #
    #     str = etree.tostring(root, encoding='UTF-8', pretty_print=True, xml_declaration=True)
    #
    #     path_mets = os.path.join(self.root_path,'METS.xml')
    #     with open(path_mets, 'w') as output_file:
    #         output_file.write(str)

    # def createSIPMets(self, premis_ids = None):
    #     if premis_ids == None:
    #         premis_ids = self.createPremis()
    #
    #     #create METS skeleton
    #     # METS_ATTRIBUTES = {"OBJID" : "", "TYPE" : "", "LABEL" : "", "PROFILE" : "http://www.ra.ee/METS/v01/IP.xml", "ID" : "" }
    #     METS_ATTRIBUTES = {"OBJID": "placeholder123",
    #                        "LABEL": "METS file describing the SIP matching the OBJID.",
    #                        "PROFILE": "http://www.ra.ee/METS/v01/IP.xml",
    #                        "TYPE": "SIP"}
    #     root = M.mets(METS_ATTRIBUTES)
    #     root.attrib['{%s}schemaLocation' % XSI_NS] = "http://www.loc.gov/METS/ schemas/IP.xsd ExtensionMETS schemas/ExtensionMETS.xsd http://www.w3.org/1999/xlink schemas/xlink.xsd"
    #
    #     # mets_hdr = M.metsHdr({"CREATEDATE": current_timestamp(), q(METSEXT_NS,"OAISSTATUS") :"", "RECORDSTATUS" :""})
    #     mets_hdr = M.metsHdr({"CREATEDATE": current_timestamp(), "RECORDSTATUS" :"NEW"})
    #     root.append(mets_hdr)
    #
    #     mets_hdr.append(self.createAgent("ARCHIVIST", "ORGANIZATION", "" ,"Institution", "Note"))
    #     mets_hdr.append(self.createAgent("ARCHIVIST", "OTHER", "" ,"Institution", "Note"))
    #     mets_hdr.append(self.createAgent("CREATOR", "ORGANIZATION", "", "Institution", "Note"))
    #     mets_hdr.append(self.createAgent("CREATOR", "OTHER", "SOFTWARE", "E-ARK SIP Creator", "VERSION=0.0.1"))
    #     mets_hdr.append(self.createAgent("PRESERVATION", "ORGANIZATION", "", "Institution", "Note"))
    #     mets_hdr.append(M.metsDocumentID("METS.xml"))
    #
    #     mets_dmd = M.dmdSec({"ID": "ID" + uuid.uuid4().__str__()})
    #     root.append(mets_dmd)
    #     # this is how to add descriptive metadata entry
    #     #file_name = "../schemas/ead.xml"
    #     #file_url = unicode(os.path.join("file://",file_name), "utf-8")
    #     #checksum = self.sha256(file_name)
    #     #file_size = os.path.getsize(file_name)
    #     #mets_mdref= M.mdRef({"LOCTYPE":"URL", "MDTYPE":"EAD", "MIMETYPE":"text/xml", "CREATED":current_timestamp(), q(XLINK_NS,"type"):"simple", q(XLINK_NS,"href"):file_url, "CHECKSUMTYPE":"SHA-256", "CHECKSUM":file_checksum, "SIZE":file_size})
    #     #mets_dmd.append(mets_mdref)
    #
    #     mets_amdSec = M.amdSec({"ID":"ID" + uuid.uuid4().__str__()})
    #     root.append(mets_amdSec)
    #
    #     mets_techmd = M.techMD({"ID":"ID" + uuid.uuid4().__str__()})
    #     mets_amdSec.append(mets_techmd)
    #     #for id in premis_ids:
    #     #    mets_mdref = M.mdRef({"LOCTYPE":"URL", "MDTYPE":"PREMIS:OBJECT", q(XLINK_NS,"href"):"file://./metadata/preservation/PREMIS.xml#"+id.__str__()})
    #     #    mets_techmd.append(mets_mdref)
    #     #techmd_ids = self.addFiles(os.path.join(self.root_path, 'metadata/preservation'))
    #
    #
    #     mets_fileSec = M.fileSec()
    #     root.append(mets_fileSec)
    #
    #     # general filegroup
    #     mets_filegroup = M.fileGrp({"ID": "ID" + uuid.uuid4().__str__(), "USE": "general filegroup"})
    #     mets_fileSec.append(mets_filegroup)
    #
    #     # data, metadata and schemas - package level
    #     content_ids = self.addFiles(os.path.join(self.root_path, 'data'), mets_filegroup)
    #     metadata_ids = self.addFiles(os.path.join(self.root_path, 'metadata'), mets_filegroup)
    #     schema_ids = self.addFiles(os.path.join(self.root_path, 'schemas'), mets_filegroup)
    #
    #     # checking for cross-representation metadata and schemas:
    #     package_root = self.root_path.rsplit('/', 2)
    #     package_metadata = os.path.join(package_root[0], 'metadata')
    #     cross_metadata_ids = self.addFiles(package_metadata, mets_filegroup)
    #     package_schemas = os.path.join(package_root[0], 'schemas')
    #     cross_schemas_ids = self.addFiles(package_schemas, mets_filegroup)
    #
    #     # mets_structmap = M.structMap({"ID": "", "TYPE":"", "LABEL":"Simple grouping"})
    #     mets_structmap = M.structMap({"LABEL": "Simple SIP structuring", "TYPE": "logical"})
    #     root.append(mets_structmap)
    #
    #     mets_structmap_div = M.div({"ADMID":"", "LABEL":"Package", "DMDID" : ""})
    #     mets_structmap.append(mets_structmap_div)
    #
    #     # files in /data
    #     mets_structmap_content_div = M.div({"LABEL":"Content"})
    #     mets_structmap_div.append(mets_structmap_content_div)
    #     for id in content_ids:
    #         fptr = M.fptr({"FILEID": id})
    #         mets_structmap_content_div.append(fptr)
    #
    #     # files in /metadata and ./../../metadata
    #     mets_structmap_metadata_div = M.div({"LABEL":"Metadata"})
    #     mets_structmap_div.append(mets_structmap_metadata_div)
    #     for id in metadata_ids:
    #         fptr = M.fptr({"FILEID": id})
    #         mets_structmap_metadata_div.append(fptr)
    #     for id in cross_metadata_ids:
    #         fptr = M.fptr({"FILEID": id})
    #         mets_structmap_metadata_div.append(fptr)
    #
    #     # files in /schemas and ./../../schemas
    #     mets_structmap_schemas_div = M.div({"LABEL": "Schemas"})
    #     mets_structmap_div.append(mets_structmap_schemas_div)
    #     for id in schema_ids:
    #         fptr = M.fptr({"FILEID": id})
    #         mets_structmap_schemas_div.append(fptr)
    #     for id in cross_schemas_ids:
    #         fptr = M.fptr({"FILEID": id})
    #         mets_structmap_schemas_div.append(fptr)
    #
    #     #my_mets.fileSec.append(M.fileGrp({'USE': 'submission'}))
    #     #my_mets.fileSec(M.fileGrp({'USE': 'submission'}))
    #     #mets_schema_file = os.path.join(root_dir, "sandbox/sipgenerator/resources/ENA_RK_TartuLV_141127/schemas/IP.xsd")
    #     #mets_schema = etree.parse(mets_schema_file)
    #     #mets_xsd = etree.XMLSchema(mets_schema)
    #
    #     str = etree.tostring(root, encoding='UTF-8', pretty_print=True, xml_declaration=True)
    #
    #     path_mets = os.path.join(self.root_path,'METS.xml')
    #     with open(path_mets, 'w') as output_file:
    #         output_file.write(str)

    #def createAIPMets(self, premis_ids=None):
    # def createAIPMets(self, packageid):
    #     '''
    #     Create the AIP METS file.
    #
    #     @param premis_ids:
    #     @return:
    #     '''
    #     #premis_ids = None
    #
    #     #if premis_ids == None:
    #     #    premis_ids = self.createPremis()
    #
    #     # TODO: add metadata files to correct metadata sections!
    #
    #     # create METS skeleton
    #     METS_ATTRIBUTES = {"OBJID": packageid,
    #                        "LABEL": "METS file describing the AIP matching the OBJID.",
    #                        "PROFILE": "http://www.ra.ee/METS/v01/IP.xml",
    #                        "TYPE": "AIP"}
    #     root = M.mets(METS_ATTRIBUTES)
    #     root.attrib['{%s}schemaLocation' % XSI_NS] = "http://www.loc.gov/METS/ schemas/IP.xsd ExtensionMETS schemas/ExtensionMETS.xsd http://www.w3.org/1999/xlink schemas/xlink.xsd"
    #
    #     # mets_hdr = M.metsHdr({"CREATEDATE": current_timestamp(), q(METSEXT_NS,"OAISSTATUS") :"", "RECORDSTATUS" :"AIP"})
    #     mets_hdr = M.metsHdr({"CREATEDATE": current_timestamp(), "RECORDSTATUS" :"NEW"})
    #     root.append(mets_hdr)
    #
    #     mets_hdr.append(self.createAgent("ARCHIVIST", "ORGANIZATION", "" ,"Institution", "Note"))
    #     mets_hdr.append(self.createAgent("ARCHIVIST", "OTHER", "" ,"Institution", "Note"))
    #     mets_hdr.append(self.createAgent("CREATOR", "ORGANIZATION", "", "Institution", "Note"))
    #     mets_hdr.append(self.createAgent("CREATOR", "OTHER", "SOFTWARE", "E-ARK SIP Creator", "VERSION=0.0.1"))
    #     mets_hdr.append(self.createAgent("PRESERVATION", "ORGANIZATION", "", "Institution", "Note"))
    #     mets_hdr.append(M.metsDocumentID("METS.xml"))
    #
    #     mets_dmd = M.dmdSec({"ID": "ID" + uuid.uuid4().__str__()})
    #     root.append(mets_dmd)
    #     # this is how to add descriptive metadata entry
    #     #file_name = "../schemas/ead.xml"
    #     #file_url = unicode(os.path.join("file://",file_name), "utf-8")
    #     #checksum = self.sha256(file_name)
    #     #file_size = os.path.getsize(file_name)
    #     #mets_mdref= M.mdRef({"LOCTYPE":"URL", "MDTYPE":"EAD", "MIMETYPE":"text/xml", "CREATED":current_timestamp(), q(XLINK_NS,"type"):"simple", q(XLINK_NS,"href"):file_url, "CHECKSUMTYPE":"SHA-256", "CHECKSUM":file_checksum, "SIZE":file_size})
    #     #mets_dmd.append(mets_mdref)
    #
    #     # create amdSec
    #     mets_amdSec = M.amdSec({"ID": "ID" + uuid.uuid4().__str__()})
    #     root.append(mets_amdSec)
    #
    #     # create techMD
    #     #mets_techmd = M.techMD({"ID": "ID" + uuid.uuid4().__str__()})
    #     #mets_amdSec.append(mets_techmd)
    #     #for id in premis_ids:
    #     #    mets_mdref = M.mdRef({"LOCTYPE":"URL", "MDTYPE":"PREMIS:OBJECT", q(XLINK_NS,"href"):"file://./metadata/preservation/PREMIS.xml#"+id.__str__()})
    #     #    mets_techmd.append(mets_mdref)
    #
    #     mets_fileSec = M.fileSec()
    #     root.append(mets_fileSec)
    #
    #     # TODO: metadata entries into correct mets section + structmap: separate between AIP and rep level metadata!
    #     # check if there is another METS file referencing the metadata
    #
    #     # general filegroup
    #     mets_filegroup = M.fileGrp({"ID": "ID" + uuid.uuid4().__str__(), "USE": "general filegroup"})
    #     mets_fileSec.append(mets_filegroup)
    #
    #     #metadata_ids = self.addFiles(os.path.join(self.root_path, 'metadata'), mets_filegroup)
    #     #submission_meta_ids = self.addFiles(os.path.join(self.root_path, 'submission/metadata'), mets_filegroup)
    #
    #     # mets_structmap = M.structMap({"ID": "", "TYPE":"", "LABEL":"Simple grouping"})
    #     mets_structmap = M.structMap({"LABEL": "Simple AIP structuring", "TYPE": "logical"})
    #     root.append(mets_structmap)
    #
    #     mets_structmap_div = M.div({"ADMID":"", "LABEL":"Package", "DMDID" : ""})
    #     mets_structmap.append(mets_structmap_div)
    #
    #     # metadata structmap - IP root level!
    #     mets_structmap_metadata_div = M.div({"LABEL": "Metadata"})
    #     mets_structmap_div.append(mets_structmap_metadata_div)
    #     #for id in metadata_ids:
    #     #    fptr = M.fptr({"FILEID": id})
    #     #    mets_structmap_metadata_div.append(fptr)
    #
    #     # metadata structmap - submission level!
    #     #mets_structmap_metadata_sub_div = M.div({"LABEL": "Metadata Submission"})
    #     #mets_structmap_div.append(mets_structmap_metadata_sub_div)
    #     #for id in submission_meta_ids:
    #     #    fptr = M.fptr({"FILEID": id})
    #     #    mets_structmap_metadata_sub_div.append(fptr)
    #
    #     # content structmap - all representations! (is only filled if no separate METS exists for the rep)
    #     mets_structmap_content_div = M.div({"LABEL": "various files"})
    #     mets_structmap_div.append(mets_structmap_content_div)
    #
    #     # structmap for schema files
    #     mets_structmap_schema_div = M.div({"LABEL": "schema files"})
    #     mets_structmap_div.append(mets_structmap_schema_div)
    #
    #     # create structmap for representations
    #     # mets_structmap_reps = M.structMap({"ID": "", "TYPE":"", "LABEL":"representations"})
    #     mets_structmap_reps = M.structMap({"TYPE":"logical", "LABEL":"representations"})
    #     root.append(mets_structmap_reps)
    #     mets_div_reps = M.div({'LABEL': 'representations', 'TYPE': 'type'})
    #     mets_structmap_reps.append(mets_div_reps)
    #
    #     # package
    #     workdir_length = len(self.root_path)
    #     #for directory, subdirectories, filenames in os.walk(os.path.join(self.root_path, 'submission/representations')):
    #     for directory, subdirectories, filenames in os.walk(self.root_path):
    #         if directory == os.path.join(self.root_path, 'metadata/earkweb'):
    #             del directory
    #             del filenames[:]
    #             del subdirectories[:]
    #         if len(filenames) > 0:
    #             for filename in filenames:
    #                 # ignore all files on AIP root level (since they wont be packed with the AIP):
    #                 if directory == self.root_path:
    #                     del filename
    #                 else:
    #                     # TODO: list rep metadata only in the rep Mets?
    #                     rel_path_file = ('file://.' + directory[workdir_length:] + '/' + filename).decode('utf-8')
    #                     if filename.lower() == 'mets.xml':
    #                         # delete the subdirectories list to stop os.walk from traversing further;
    #                         # mets file should be added as <mets:mptr> to <structMap> for corresponding rep
    #                         del subdirectories[:]
    #                         rep_name = directory.rsplit('/', 2)
    #                         rep_name = os.path.join(rep_name[1], rep_name[2])
    #                         # create structMap div and append to representations structMap
    #                         mets_structmap_rep_div = M.div({"LABEL": rep_name, "TYPE": "representation mets", "ID": "ID" + uuid.uuid4().__str__()})
    #                         mets_div_reps.append(mets_structmap_rep_div)
    #                         # add mets file as <mets:mptr>
    #                         metspointer = M.mptr({"LOCTYPE": "URL",
    #                                               q(XLINK_NS,"title"): "mets file describing representation: " + rep_name + " of AIP: " + packageid,
    #                                               q(XLINK_NS,"href"): rel_path_file})
    #                                              #"ID": uuid.uuid4().__str__()})
    #                         mets_structmap_rep_div.append(metspointer)
    #                         # also add the rep mets to the filegroup, so we can have a fptr
    #                         id = self.addFile(os.path.join(directory, filename), mets_filegroup)
    #                         mets_fptr = M.fptr({"FILEID": id})
    #                         mets_structmap_rep_div.append(mets_fptr)
    #                     elif filename and directory.endswith('schemas'):
    #                         # schema files
    #                         id = self.addFile(os.path.join(directory, filename), mets_filegroup)
    #                         fptr = M.fptr({'FILEID': id})
    #                         mets_structmap_schema_div.append(fptr)
    #                     elif filename == 'earkweb.log':
    #                         # earkweb log file - currently treated as digiprovMD
    #                         mets_digiprovmd = M.digiprovMD({"ID": "ID" + uuid.uuid4().__str__()})
    #                         mets_amdSec.append(mets_digiprovmd)
    #                         checksum = self.sha256(os.path.join(directory,filename))
    #                         id = "ID" + uuid.uuid4().__str__()
    #                         mets_mdref = M.mdRef({"LOCTYPE":"URL",
    #                                              "MIMETYPE":"text/xml",
    #                                              "CREATED":current_timestamp(),
    #                                              q(XLINK_NS,"type"):"simple",
    #                                              q(XLINK_NS,"href"):rel_path_file,
    #                                              "CHECKSUMTYPE":"SHA-256",
    #                                              "CHECKSUM":checksum,
    #                                              "ID": id,
    #                                               "MDTYPE": "OTHER"})
    #                         mets_digiprovmd.append(mets_mdref)
    #                         fptr = M.fptr({"FILEID": id})
    #                         mets_structmap_metadata_div.append(fptr)
    #                     elif directory.endswith('descriptive'):
    #                         # descriptive metadata
    #                         #mets_dmd = M.dmdSec({"ID": "ID" + uuid.uuid4().__str__()})
    #                         #root.append(mets_dmd)
    #                         checksum = self.sha256(os.path.join(directory,filename))
    #                         id = "ID" + uuid.uuid4().__str__()
    #                         mets_mdref = M.mdRef({"LOCTYPE":"URL",
    #                                              "MIMETYPE":"text/xml",
    #                                              "CREATED":current_timestamp(),
    #                                              q(XLINK_NS,"type"):"simple",
    #                                              q(XLINK_NS,"href"):rel_path_file,
    #                                              "CHECKSUMTYPE":"SHA-256",
    #                                              "CHECKSUM":checksum,
    #                                              "ID": id,
    #                                               "MDTYPE": "OTHER"})
    #                         mets_dmd.append(mets_mdref)
    #                         fptr = M.fptr({"FILEID": id})
    #                         mets_structmap_metadata_div.append(fptr)
    #                     elif directory.endswith('preservation'):
    #                         # preservation metadata (premis, techMD?)
    #                         #mets_techmd = M.techMD({"ID": id})
    #                         #mets_amdSec.append(mets_techmd)
    #                         mets_digiprovmd = M.digiprovMD({"ID": "ID" + uuid.uuid4().__str__()})
    #                         mets_amdSec.append(mets_digiprovmd)
    #                         checksum = self.sha256(os.path.join(directory,filename))
    #                         id = "ID" + uuid.uuid4().__str__()
    #                         mets_mdref = M.mdRef({"LOCTYPE":"URL",
    #                                              "MIMETYPE":"text/xml",
    #                                              "CREATED":current_timestamp(),
    #                                              q(XLINK_NS,"type"):"simple",
    #                                              q(XLINK_NS,"href"):rel_path_file,
    #                                              "CHECKSUMTYPE":"SHA-256",
    #                                              "CHECKSUM":checksum,
    #                                              "ID": id,
    #                                               "MDTYPE": "OTHER"})
    #                         #mets_techmd.append(mets_mdref)
    #                         mets_digiprovmd.append(mets_mdref)
    #                         fptr = M.fptr({"FILEID": id})
    #                         mets_structmap_metadata_div.append(fptr)
    #                     elif filename and not (directory.endswith('descriptive') or
    #                                            directory.endswith('metadata') or
    #                                            directory.endswith('preservation')):
    #                         # print 'found a file: ' + os.path.join(directory, filename)
    #                         id = self.addFile(os.path.join(directory, filename), mets_filegroup)
    #                         fptr = M.fptr({"FILEID": id})
    #                         mets_structmap_content_div.append(fptr)
    #
    #
    #     str = etree.tostring(root, encoding='UTF-8', pretty_print=True, xml_declaration=True)
    #
    #     path_mets = os.path.join(self.root_path,'METS.xml')
    #     with open(path_mets, 'w') as output_file:
    #         output_file.write(str)

    def createDeliveryMets(self, input_archive, output_mets):
        """
        Write a delivery METS document that describes a single package archive.

        The document consists of a header with four agents and the document ID
        (file name of output_mets), a file section with one "PACKAGES" file group
        holding the archive, and a physical structMap whose "Content" division
        points to that archive.

        @type       input_archive: string
        @param      input_archive: Path of the archive file to describe

        @type       output_mets: string
        @param      output_mets: Path of the METS file to write
        """
        # create delivery METS skeleton
        METS_ATTRIBUTES = {
            "OBJID": "UUID:%s" % uuid.uuid4(),
            "TYPE": "SIP",
            "LABEL": "Delivery METS",
            "PROFILE": "http://webb.eark/package/METS/IP_CS.xml",
            "ID": "ID%s" % uuid.uuid4()
        }
        root = M.mets(METS_ATTRIBUTES)
        root.attrib['{%s}schemaLocation' %
                    XSI_NS] = "http://www.loc.gov/METS/ schemas/IP.xsd"

        mets_hdr = M.metsHdr({"CREATEDATE": current_timestamp()})
        root.append(mets_hdr)

        # (role, type, other_type, name, note) of the agents listed in the header
        agent_specs = (
            ("ARCHIVIST", "ORGANIZATION", "", "Institution", "Note"),
            ("CREATOR", "ORGANIZATION", "", "Institution", "Note"),
            ("CREATOR", "OTHER", "SOFTWARE", "E-ARK SIP Creator",
             "VERSION=0.0.1"),
            ("PRESERVATION", "ORGANIZATION", "", "Institution", "Note"),
        )
        for agent_spec in agent_specs:
            mets_hdr.append(self.createAgent(*agent_spec))
        mets_hdr.append(M.metsDocumentID(os.path.basename(output_mets)))

        mets_fileSec = M.fileSec()
        root.append(mets_fileSec)

        mets_filegroup = M.fileGrp({
            "USE": "PACKAGES",
            "ID": "ID%s" % uuid.uuid4()
        })
        mets_fileSec.append(mets_filegroup)

        content_id = self.addFile(input_archive, mets_filegroup)

        mets_structmap = M.structMap({
            "ID": "ID%s" % uuid.uuid4(),
            "TYPE": "physical",
            "LABEL": "Profilestructmap"
        })
        root.append(mets_structmap)
        mets_structmap_div = M.div({"LABEL": "Package"})
        mets_structmap.append(mets_structmap_div)
        mets_structmap_content_div = M.div({"LABEL": "Content"})
        mets_structmap_div.append(mets_structmap_content_div)
        # The file pointer must reference the ID of the file element created by
        # addFile; a freshly generated ID would point at nothing in the fileSec.
        fptr = M.fptr({"FILEID": content_id})
        mets_structmap_content_div.append(fptr)

        xml_bytes = etree.tostring(root,
                                   encoding='UTF-8',
                                   pretty_print=True,
                                   xml_declaration=True)
        # tostring() with an explicit encoding returns encoded bytes, so the
        # file is written in binary mode.
        with open(output_mets, 'wb') as output_file:
            output_file.write(xml_bytes)
Exemplo n.º 3
0
class MetsGenerator(object):
    '''
    Generates the root METS file (METS.xml) of an information package (SIP/AIP).

    Instantiate the class with the root path of the package, e.g. ``generator = MetsGenerator(path)``.
    The METS file is written into that directory and all subfolders are treated as part of the package.
    Afterwards call ``generator.createMets(data)`` with a dictionary that must contain 'packageid',
    'schemas' (location of the schema folder) and 'type', where 'type' must comply with the METS standard
    for the TYPE attribute of the METS root element. createMets additionally reads 'parent' from that
    dictionary; an empty string means that no link to a parent package is created.
    '''

    # FormatIdentification instance created once at class definition time and shared by all
    # instances; it is not referenced by any method of this class.
    fid = FormatIdentification()
    # MimeTypes instance shared by all instances; used to guess the MIMETYPE attribute values.
    mime = MimeTypes()
    # Root directory of the package; replaced per instance in __init__.
    root_path = ""
    # Dictionary handed to the most recent createMets call (None until createMets has been called).
    mets_data = None

    def __init__(self, root_path):
        print "Working in rootdir %s" % root_path
        self.root_path = root_path

    def runCommand(self, program, stdin=PIPE, stdout=PIPE, stderr=PIPE):
        '''
        Run an external program without a shell and wait until it has finished.

        @param program: sequence of strings: the executable followed by its arguments
        @param stdin:   passed through to Popen (default: PIPE)
        @param stdout:  passed through to Popen (default: PIPE)
        @param stderr:  passed through to Popen (default: PIPE)
        @return:        tuple (return code, stdout data, stderr data); the data items are None for
                        streams that were not captured
        @raise Exception:   if the program cannot be started or exits with a non-zero return code;
                            the message starts with 'Command failed:'
        '''
        result, res_stdout, res_stderr = None, None, None
        try:
            # shell=False: the argument list is handed to the program as-is, so no
            # quoting of the executable or of its arguments is required.
            print('Launching: ' + ' '.join(program))
            process = Popen(program,
                            stdin=stdin,
                            stdout=stdout,
                            stderr=stderr,
                            shell=False)

            res_stdout, res_stderr = process.communicate()
            result = process.returncode
            print('Finished: ' + ' '.join(program))

        except Exception as ex:
            # Start-up failures (e.g. executable not found) are reported the same
            # way as a failing program.
            res_stderr = str(ex.args)
            result = 1

        if result != 0:
            # stderr is None when the caller did not capture it and a bytes object
            # on Python 3; neither can be concatenated to the message directly.
            if res_stderr is None:
                details = ''
            elif isinstance(res_stderr, str):
                details = res_stderr
            else:
                details = res_stderr.decode('utf-8', 'replace')
            message = 'Command failed:' + details
            print(message)
            raise Exception(message)

        return result, res_stdout, res_stderr

    def createAgent(self, role, type, other_type, name, note):
        '''
        Build an agent element (via the M element factory) with a name and a note child.

        @param role:        value of the ROLE attribute
        @param type:        value of the TYPE attribute
        @param other_type:  value of the OTHERTYPE attribute; the attribute is omitted when this is
                            empty
        @param name:        content of the name child element
        @param note:        content of the note child element
        @return:            the new agent element
        '''
        attributes = {"ROLE": role, "TYPE": type}
        if other_type:
            # OTHERTYPE is only emitted when a value was supplied
            attributes["OTHERTYPE"] = other_type
        return M.agent(attributes, M.name(name), M.note(note))

    def addFile(self, file_name, mets_filegroup):
        '''
        Describe one file as a file element with an FLocat child and append it to a file group.

        @param file_name:       path of the file to describe
        @param mets_filegroup:  fileGrp element the new file element is appended to
        @return:                the generated ID of the new file element
        '''
        # The file is referenced relative to the package root.
        href = "file://./%s" % os.path.relpath(file_name, self.root_path)
        mimetype, _encoding = self.mime.guess_type(href)
        checksum = get_sha256_hash(file_name)
        size = os.path.getsize(file_name)
        created = get_file_ctime_iso_date_str(file_name, DT_ISO_FMT_SEC_PREC)
        file_id = "ID%s" % uuid.uuid4()

        # NOTE(review): SIZE is passed as an int and MIMETYPE is None when the type
        # cannot be guessed - confirm that the M factory accepts such attribute values.
        file_attributes = {
            "MIMETYPE": mimetype,
            "CHECKSUMTYPE": "SHA-256",
            "CREATED": created,
            "CHECKSUM": checksum,
            "USE": "Datafile",
            "ID": file_id,
            "SIZE": size
        }
        file_element = M.file(file_attributes)
        mets_filegroup.append(file_element)

        locator_attributes = {
            q(XLINK_NS, 'href'): href,
            "LOCTYPE": "URL",
            q(XLINK_NS, 'type'): 'simple'
        }
        file_element.append(M.FLocat(locator_attributes))
        return file_id

    def addFiles(self, folder, mets_filegroup):
        ids = []
        for top, dirs, files in os.walk(folder):
            for fn in files:
                file_name = os.path.join(top, fn)
                file_id = self.addFile(file_name, mets_filegroup)
                ids.append(file_id)
        return ids

    def make_mdref(self, path, file, id, mdtype):
        '''
        Assemble the attribute dictionary for an mdRef element that points to a metadata file.

        @param path:    directory that contains the metadata file
        @param file:    name of the metadata file
        @param id:      value for the ID attribute
        @param mdtype:  value for the MDTYPE attribute
        @return:        dictionary of attribute names and values (the element itself is not created)
        '''
        full_path = os.path.join(path, file)
        mimetype, _encoding = self.mime.guess_type(full_path)
        # The href is expressed relative to the package root.
        href = "file://./%s" % os.path.relpath(full_path, self.root_path)
        return {
            "LOCTYPE": "URL",
            "MIMETYPE": mimetype,
            "CREATED": current_timestamp(),
            q(XLINK_NS, "type"): "simple",
            q(XLINK_NS, "href"): href,
            "CHECKSUMTYPE": "SHA-256",
            "CHECKSUM": get_sha256_hash(full_path),
            "ID": id,
            "MDTYPE": mdtype
        }

    def setParentRelation(self, identifier):
        '''
        Register a parent package in the METS.xml found in self.root_path.

        A div labelled "parent <type>" holding an mptr to "urn:uuid:<identifier>" is added to the
        structMap labelled "parent <type>" (created when missing), where <type> is
        self.mets_data['type']. The file is rewritten in place. If METS.xml does not exist, a
        message is printed and nothing is changed.

        @param identifier:  UUID string of the parent package (without the "urn:uuid:" prefix)
        '''
        self._addRelation('parent', identifier)

    def addChildRelation(self, identifier):
        '''
        Register a child package in the METS.xml found in self.root_path.

        Works like setParentRelation, but uses the labels "child <type>" and
        "child <type> identifiers".

        @param identifier:  UUID string of the child package (without the "urn:uuid:" prefix)
        '''
        self._addRelation('child', identifier)

    def _addRelation(self, relation, identifier):
        '''
        Shared implementation of setParentRelation and addChildRelation.

        @param relation:    'parent' or 'child'; used as prefix of all labels
        @param identifier:  UUID string of the related package
        '''
        mets_path = os.path.join(self.root_path, 'METS.xml')
        packagetype = self.mets_data['type']
        if not os.path.exists(mets_path):
            print('Couldn\'t find the parent %ss Mets file.' % packagetype)
            return

        parser = etree.XMLParser(resolve_entities=False,
                                 remove_blank_text=True,
                                 strip_cdata=False)
        mets_root = etree.parse(mets_path, parser).getroot()

        map_label = '%s %s' % (relation, packagetype)
        div_label = '%s identifiers' % map_label

        # the new entry: a div that carries a pointer to the related package
        entry = M.div({'LABEL': map_label})
        entry.append(
            M.mptr({
                "LOCTYPE": "OTHER",
                "OTHERLOCTYPE": "UUID",
                q(XLINK_NS, "title"):
                "Referencing a %s %s." % (relation, packagetype),
                q(XLINK_NS, "href"): "urn:uuid:" + identifier,
                "ID": "ID%s" % uuid.uuid4()
            }))

        relation_map = mets_root.find(
            "%s[@LABEL='%s']" % (q(METS_NS, 'structMap'), map_label))
        if relation_map is None:
            # first relation of this kind: create structMap and identifiers div
            relation_map = M.structMap({'LABEL': map_label, 'TYPE': 'logical'})
            relation_div = M.div({'LABEL': div_label})
            relation_map.append(relation_div)
            relation_div.append(entry)
            mets_root.append(relation_map)
        else:
            relation_div = relation_map.find(
                "%s[@LABEL='%s']" % (q(METS_NS, 'div'), div_label))
            if relation_div is None:
                # structMap exists but lacks the identifiers div: create it
                # instead of failing on None
                relation_div = M.div({'LABEL': div_label})
                relation_map.append(relation_div)
            relation_div.append(entry)

        xml_bytes = etree.tostring(mets_root,
                                   encoding='UTF-8',
                                   pretty_print=True,
                                   xml_declaration=True)
        # tostring() with an encoding yields encoded bytes, hence binary mode
        with open(mets_path, 'wb') as output_file:
            output_file.write(xml_bytes)

    def createMets(self, mets_data):
        '''
        Build the METS document for the package below self.root_path and write it to
        <root_path>/METS.xml.

        The document consists of a header, an amdSec, a fileSec with one general file group, a
        physical structMap ("E-ARK structural map") with one div per directory, and a logical
        structMap with divs for metadata, schema and content files. Files directly in the package
        root are not listed. Directories ending in 'submission/metadata' or 'submission/schemas'
        are not traversed.

        @param mets_data:   dictionary with the keys 'packageid' (UUID string), 'type' (value of the
                            TYPE attribute of the METS root) and 'schemas' (path of the schema
                            folder); the optional key 'parent' holds the UUID string of the parent
                            package, a missing or empty value means that there is no parent
        '''
        self.mets_data = mets_data
        packageid = mets_data['packageid']
        packagetype = mets_data['type']
        schemafolder = mets_data['schemas']
        # 'parent' is optional: missing, empty or None all mean "no parent package"
        parent = mets_data.get('parent', '')

        print('creating Mets')
        ###########################
        # create METS skeleton
        ###########################

        # create Mets root
        root = M.mets({
            "OBJID": "urn:uuid:" + packageid,
            "LABEL":
            "METS file describing the %s matching the OBJID." % packagetype,
            "PROFILE": "http://www.ra.ee/METS/v01/IP.xml",
            "TYPE": packagetype
        })
        root.attrib['{%s}schemaLocation' % XSI_NS] = (
            "http://www.loc.gov/METS/ %s http://www.w3.org/1999/xlink %s" %
            (self._schemaLocation(schemafolder, 'mets_1_11.xsd'),
             self._schemaLocation(schemafolder, 'xlink.xsd')))

        # create Mets header with the creating agent and the document ID
        mets_hdr = M.metsHdr({
            "CREATEDATE": current_timestamp(),
            "RECORDSTATUS": "NEW"
        })
        root.append(mets_hdr)
        mets_hdr.append(
            self.createAgent("CREATOR", "OTHER", "SOFTWARE", "E-ARK earkweb",
                             "VERSION=0.0.1"))
        mets_hdr.append(M.metsDocumentID("METS.xml"))

        # create amdSec
        mets_amdSec = M.amdSec({"ID": "ID%s" % uuid.uuid4()})
        root.append(mets_amdSec)

        # create fileSec with the general filegroup
        mets_fileSec = M.fileSec()
        root.append(mets_fileSec)
        mets_filegroup = M.fileGrp({
            "ID": "ID%s" % uuid.uuid4(),
            "USE": "general filegroup"
        })
        mets_fileSec.append(mets_filegroup)

        # structMap 'E-ARK structural map' - default, physical structure
        mets_earkstructmap = M.structMap({
            "LABEL": "E-ARK structural map",
            "TYPE": "physical"
        })
        root.append(mets_earkstructmap)
        package_div = M.div({"LABEL": packageid})
        mets_earkstructmap.append(package_div)

        # structMap and div for the whole package (metadata, schema and /data)
        mets_structmap = M.structMap({
            "LABEL": "Simple %s structuring" % packagetype,
            "TYPE": "logical"
        })
        root.append(mets_structmap)
        mets_structmap_div = M.div({"LABEL": "Package structure"})
        mets_structmap.append(mets_structmap_div)

        # metadata structmap - IP root level!
        mets_structmap_metadata_div = M.div({"LABEL": "metadata files"})
        mets_structmap_div.append(mets_structmap_metadata_div)

        # structmap for schema files
        mets_structmap_schema_div = M.div({"LABEL": "schema files"})
        mets_structmap_div.append(mets_structmap_schema_div)

        # content structmap - all representations! (is only filled if no separate METS exists for the rep)
        mets_structmap_content_div = M.div({"LABEL": "content files"})
        mets_structmap_div.append(mets_structmap_content_div)

        # create structmap for parent/child relation, if applicable
        if parent:
            print('creating link to parent %s' % packagetype)
            mets_structmap_relation = M.structMap({
                'TYPE': 'logical',
                'LABEL': 'parent'
            })
            root.append(mets_structmap_relation)
            mets_div_rel = M.div(
                {'LABEL': '%s parent identifier' % packagetype})
            mets_structmap_relation.append(mets_div_rel)
            mets_div_rel.append(
                M.mptr({
                    "LOCTYPE": "OTHER",
                    "OTHERLOCTYPE": "UUID",
                    q(XLINK_NS, "title"):
                    ("Referencing the parent %s of this (urn:uuid:%s) %s." %
                     (packagetype, packageid, packagetype)),
                    q(XLINK_NS, "href"): "urn:uuid:" + parent,
                    "ID": "ID%s" % uuid.uuid4()
                }))

        ###########################
        # add to Mets skeleton
        ###########################

        metadata_dir = os.path.join(self.root_path, 'metadata')
        for directory, subdirectories, filenames in os.walk(self.root_path):
            # build the earkstructmap: one div per directory below the root
            path = os.path.relpath(directory, self.root_path)
            physical_div = None
            if path != '.':
                physical_div = M.div({"LABEL": path})
                package_div.append(physical_div)

            if directory.endswith(
                    ('submission/metadata', 'submission/schemas')):
                # clearing the lists in place stops os.walk from descending and
                # leaves no files to list for this directory
                del filenames[:]
                del subdirectories[:]

            if directory == metadata_dir:
                # Metadata on IP root level; the subfolders are walked by the
                # helper, so os.walk must not descend into them again.
                del subdirectories[:]
                self._addPackageMetadata(
                    root, mets_amdSec, directory, filenames,
                    (mets_structmap_metadata_div, physical_div))
                continue

            if directory == self.root_path:
                # ignore files on IP root level
                continue

            # Any other folder outside of /<root>/metadata
            for filename in filenames:
                file_path = os.path.join(directory, filename)
                if filename.lower() == 'mets.xml':
                    # A representation with its own Mets file: stop os.walk from
                    # traversing further and reference the file as <mets:mptr>.
                    del subdirectories[:]
                    rep_name = os.path.basename(directory)
                    # IMPORTANT: The <mptr> element needs to be the first entry in a <div>, or the Mets will be invalid!
                    # NOTE(review): that only holds if the Mets file is the first
                    # file os.walk reports for this directory - confirm.
                    physical_div.append(
                        M.mptr({
                            "LOCTYPE": "URL",
                            q(XLINK_NS, "title"):
                            ("Mets file describing representation: %s of %s: urn:uuid:%s."
                             % (rep_name, packagetype, packageid)),
                            q(XLINK_NS, "href"):
                            "file://./%s" % os.path.relpath(
                                file_path, self.root_path),
                            "ID": "ID%s" % uuid.uuid4()
                        }))
                    # also create a <fptr> for the Mets file
                    file_id = self.addFile(file_path, mets_filegroup)
                    physical_div.append(M.fptr({"FILEID": file_id}))
                else:
                    file_id = self.addFile(file_path, mets_filegroup)
                    if directory.endswith('schemas'):
                        logical_div = mets_structmap_schema_div
                    else:
                        logical_div = mets_structmap_content_div
                    logical_div.append(M.fptr({'FILEID': file_id}))
                    physical_div.append(M.fptr({'FILEID': file_id}))

        xml_bytes = etree.tostring(root,
                                   encoding='UTF-8',
                                   pretty_print=True,
                                   xml_declaration=True)

        path_mets = os.path.join(self.root_path, 'METS.xml')
        # tostring() with an encoding yields encoded bytes, hence binary mode
        with open(path_mets, 'wb') as output_file:
            output_file.write(xml_bytes)

    def _schemaLocation(self, schemafolder, schema_file):
        '''Return the path of a schema file relative to the package root, or 'empty' if it is missing.'''
        schema_path = os.path.join(schemafolder, schema_file)
        if os.path.isfile(schema_path):
            return os.path.relpath(schema_path, self.root_path)
        return 'empty'

    def _addMdRef(self, md_section, directory, filename, mdtype, fptr_targets):
        '''Append an mdRef for one metadata file to md_section and an fptr to each div in fptr_targets.'''
        file_id = "ID%s" % uuid.uuid4()
        md_section.append(
            M.mdRef(self.make_mdref(directory, filename, file_id, mdtype)))
        for target_div in fptr_targets:
            target_div.append(M.fptr({"FILEID": file_id}))

    def _addDmdSec(self, root, directory, filename, fptr_targets):
        '''Insert a new dmdSec (at child position 1 of the root) that references one descriptive metadata file.'''
        mets_dmd = M.dmdSec({"ID": "ID%s" % uuid.uuid4()})
        root.insert(1, mets_dmd)
        # TODO: change MDTYPE
        self._addMdRef(mets_dmd, directory, filename, 'OTHER', fptr_targets)

    def _addDigiprovMD(self, mets_amdSec, directory, filename, mdtype,
                       fptr_targets):
        '''Append a new digiprovMD to the amdSec that references one metadata file.'''
        mets_digiprovmd = M.digiprovMD({"ID": "ID%s" % uuid.uuid4()})
        mets_amdSec.append(mets_digiprovmd)
        self._addMdRef(mets_digiprovmd, directory, filename, mdtype,
                       fptr_targets)

    def _addPackageMetadata(self, root, mets_amdSec, metadata_dir, filenames,
                            fptr_targets):
        '''List earkweb.log and the files in the subfolders of the package-level metadata directory.'''
        for filename in filenames:
            if filename == 'earkweb.log':
                self._addDigiprovMD(mets_amdSec, metadata_dir, filename,
                                    'OTHER', fptr_targets)

        for dirname in os.listdir(metadata_dir):
            # '*_mig-*' folders contain metadata for a representation/migration
            is_migration = fnmatch.fnmatch(dirname, '*_mig-*')
            # TODO: maybe list it all the time?
            if is_migration and os.path.isfile(
                    os.path.join(self.root_path, 'representations', dirname,
                                 'METS.xml')):
                # currently only listed if no representation Mets file exists
                continue

            for md_dir, _subdirs, files in os.walk(
                    os.path.join(metadata_dir, dirname)):
                if is_migration:
                    # classified by the folder the file is located in
                    descriptive = md_dir.endswith('descriptive')
                    preservation = md_dir.endswith('preservation')
                else:
                    # classified by the top-level metadata folder
                    descriptive = dirname == 'descriptive'
                    preservation = dirname in ('preservation', 'earkweb')

                for filename in files:
                    if descriptive:
                        self._addDmdSec(root, md_dir, filename, fptr_targets)
                    elif preservation:
                        if filename.startswith(
                                'premis') or filename.endswith('premis.xml'):
                            mdtype = 'PREMIS'
                        else:
                            mdtype = 'OTHER'
                        self._addDigiprovMD(mets_amdSec, md_dir, filename,
                                            mdtype, fptr_targets)
                    else:
                        print('Unclassified metadata file %s in %s.' %
                              (filename, md_dir))
Exemplo n.º 4
0
class PremisGenerator(object):
    fid = FormatIdentification()
    mime = MimeTypes()
    root_path = ""

    def __init__(self, root_path):
        print "Working in rootdir %s" % root_path
        self.root_path = root_path

    def sha256(self, fname):
        '''
        Compute the SHA-256 checksum of a file.

        @param fname:   path of the file to hash
        @return:        hexadecimal digest string
        '''
        digest = hashlib.sha256()
        # Binary mode: the checksum must cover the raw bytes. Text mode may
        # translate line endings and yields text instead of bytes on Python 3.
        with open(fname, 'rb') as f:
            # Read in 4 KiB chunks until read() returns the empty byte string
            # (b"" equals "" on Python 2, so the sentinel is unchanged there).
            for chunk in iter(lambda: f.read(4096), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def runCommand(self, program, stdin=PIPE, stdout=PIPE, stderr=PIPE):
        '''
        Run an external program without a shell and wait until it has finished.

        @param program: sequence of strings: the executable followed by its arguments
        @param stdin:   passed through to Popen (default: PIPE)
        @param stdout:  passed through to Popen (default: PIPE)
        @param stderr:  passed through to Popen (default: PIPE)
        @return:        tuple (return code, stdout data, stderr data); the data items are None for
                        streams that were not captured
        @raise Exception:   if the program cannot be started or exits with a non-zero return code;
                            the message starts with 'Command failed:'
        '''
        result, res_stdout, res_stderr = None, None, None
        try:
            # shell=False: the argument list is handed to the program as-is, so no
            # quoting of the executable or of its arguments is required.
            print('Launching: ' + ' '.join(program))
            process = Popen(program,
                            stdin=stdin,
                            stdout=stdout,
                            stderr=stderr,
                            shell=False)

            res_stdout, res_stderr = process.communicate()
            result = process.returncode
            print('Finished: ' + ' '.join(program))

        except Exception as ex:
            # Start-up failures (e.g. executable not found) are reported the same
            # way as a failing program.
            res_stderr = str(ex.args)
            result = 1

        if result != 0:
            # stderr is None when the caller did not capture it and a bytes object
            # on Python 3; neither can be concatenated to the message directly.
            if res_stderr is None:
                details = ''
            elif isinstance(res_stderr, str):
                details = res_stderr
            else:
                details = res_stderr.decode('utf-8', 'replace')
            message = 'Command failed:' + details
            print(message)
            raise Exception(message)

        return result, res_stdout, res_stderr

    def addObject(self, abs_path):
        '''
        Describe a single file as a Premis object of type 'file'.

        The object is identified by the file path relative to self.root_path and carries the
        SHA-256 digest, the file size and the result of self.fid.identify_file as format
        registry key (registry name "PRONOM").

        @param abs_path:    absolute file path
        @return:            Premis object element
        '''
        digest = self.sha256(abs_path)
        location = "file://./%s" % os.path.relpath(abs_path, self.root_path)
        registry_key = self.fid.identify_file(abs_path)
        byte_count = os.path.getsize(abs_path)
        xml_id = 'ID%s' % uuid.uuid4()

        # assemble the object from its parts
        identifier = P.objectIdentifier(P.objectIdentifierType('filepath'),
                                        P.objectIdentifierValue(location))
        fixity = P.fixity(P.messageDigestAlgorithm("SHA-256"),
                          P.messageDigest(digest),
                          P.messageDigestOriginator("hashlib"))
        file_format = P.format(
            P.formatRegistry(P.formatRegistryName("PRONOM"),
                             P.formatRegistryKey(registry_key),
                             P.formatRegistryRole("identification")))
        characteristics = P.objectCharacteristics(P.compositionLevel(0),
                                                  fixity,
                                                  P.size(byte_count),
                                                  file_format)
        attributes = {q(XSI_NS, 'type'): 'file', "xmlID": xml_id}
        return P.object(attributes, identifier, characteristics)

    def addEvent(self, premispath, info):
        '''
        Add an event to an existing Premis file (DefaultTask finalize method) and rewrite the file.

        The event is inserted directly before the last child of the Premis root element.

        @param premispath:  path of the Premis file, joined onto self.root_path
        @param info:        dictionary with the keys 'outcome', 'task_name' (recorded as linking
                            agent), 'event_type' and 'linked_object'
        @return:            None
        '''
        outcome = info['outcome']
        agent = info['task_name']
        event_type = info['event_type']
        linked_object = info['linked_object']

        premis_path = os.path.join(self.root_path, premispath)
        premis_root = etree.parse(premis_path).getroot()

        event_id = 'ID%s' % uuid.uuid4()
        event = P.event(
            P.eventIdentifier(P.eventIdentifierType('local'),
                              P.eventIdentifierValue(event_id)),
            P.eventType(event_type), P.eventDateTime(current_timestamp()),
            P.eventOutcomeInformation(P.eventOutcome(outcome)),
            P.linkingAgentIdentifier(P.linkingAgentIdentifierType('software'),
                                     P.linkingAgentIdentifierValue(agent)),
            P.linkingObjectIdentifier(
                P.linkingObjectIdentifierType('repository'),
                P.linkingObjectIdentifierValue(linked_object)))
        # position len - 1: the current last child of the root stays last
        premis_root.insert(len(premis_root) - 1, event)

        xml_bytes = etree.tostring(premis_root,
                                   encoding='UTF-8',
                                   pretty_print=True,
                                   xml_declaration=True)
        # tostring() with an encoding yields encoded bytes, hence binary mode
        with open(premis_path, 'wb') as output_file:
            output_file.write(xml_bytes)

    def createMigrationPremis(self, premis_info):
        """
        Write metadata/preservation/premis.xml (below self.root_path)
        describing the migrations listed in a migrations XML file.

        For every 'migration' element whose 'targetrep' attribute is a suffix
        of self.root_path, one PREMIS object (built by self.addObject for the
        migration output file) and one PREMIS 'migration' event are created.
        The object carries a 'derivation' relationship that points at the
        source file and at the event.

        @type       premis_info: dict
        @param      premis_info: Must contain the key 'info' holding the path
                                 of the migrations XML file

        @return: None
        """
        premis = P.premis({"version": "2.0"})
        premis.attrib['{%s}schemaLocation' % XSI_NS] = (
            "info:lc/xmlns/premis-v2 ../../schemas/premis-v2-2.xsd")

        # creates an object that references the package or representation
        # TODO: identifier!
        premis_id = 'ID' + str(uuid.uuid4())
        premis.append(P.object(
            {
                q(XSI_NS, 'type'): 'representation',
                "xmlID": premis_id
            },
            P.objectIdentifier(
                P.objectIdentifierType('repository'),
                P.objectIdentifierValue('package-id-goes-here-?')),
        ))

        # parse the migration.xml, add events and objects; the file is read
        # as bytes (so the parser honours the XML encoding declaration) and
        # closed as soon as parsing is done
        migration_events = []
        with open(premis_info['info'], 'rb') as migrations_file:
            migrations = etree.iterparse(migrations_file, events=('start', ))
            for _, element in migrations:
                if element.tag != 'migration':
                    continue
                if not self.root_path.endswith(element.attrib['targetrep']):
                    # migration belongs to a different representation
                    continue

                event_id = 'ID' + str(uuid.uuid4())
                source_object_abs = os.path.join(
                    element.attrib['sourcedir'], element.attrib['file'])
                source_object_rel = "file://./%s" % os.path.relpath(
                    source_object_abs, self.root_path)
                target_object_abs = os.path.join(
                    element.attrib['targetdir'], element.attrib['output'])
                target_object_rel = "file://./%s" % os.path.relpath(
                    target_object_abs, self.root_path)

                # event
                migration_events.append(P.event(
                    P.eventIdentifier(P.eventIdentifierType('local'),
                                      P.eventIdentifierValue(event_id)),
                    P.eventType('migration'),
                    P.eventDateTime(
                        element.attrib['starttime']
                    ),  # TODO: use event start or event end time?
                    P.eventOutcomeInformation(P.eventOutcome('success')),
                    P.linkingAgentIdentifier(
                        P.linkingAgentIdentifierType('software'),
                        P.linkingAgentIdentifierValue(
                            'should probably come from migrations.xml')),
                    P.linkingObjectIdentifier(
                        P.linkingObjectIdentifierType('filepath'),
                        P.linkingObjectIdentifierValue(target_object_rel))))

                # object, linked to the migration event and the source file
                target_object = self.addObject(target_object_abs)
                target_object.append(P.relationship(
                    P.relationshipType('derivation'),
                    P.relationshipSubType('has source'),
                    P.relatedObjectIdentification(
                        P.relatedObjectIdentifierType('filepath'),
                        P.relatedObjectIdentifierValue(source_object_rel),
                        P.relatedObjectSequence('0')),
                    P.relatedEventIdentification(
                        P.relatedEventIdentifierType('local'),
                        P.relatedEventIdentifierValue(event_id),
                        P.relatedEventSequence('1')),
                ))
                premis.append(target_object)

        # append all events to premis root - they must be below the objects (due to validation)
        for migration_event in migration_events:
            premis.append(migration_event)

        # add agent
        identifier_value = 'earkweb'
        premis.append(
            P.agent(
                P.agentIdentifier(P.agentIdentifierType('LOCAL'),
                                  P.agentIdentifierValue(identifier_value)),
                P.agentName('E-ARK AIP to DIP Converter'),
                P.agentType('Software')))

        # create the Premis file
        premis_xml = etree.tostring(premis,
                                    encoding='UTF-8',
                                    pretty_print=True,
                                    xml_declaration=True)
        preservation_dir = os.path.join(self.root_path,
                                        'metadata/preservation')
        if not os.path.exists(preservation_dir):
            os.makedirs(preservation_dir)
        path_premis = os.path.join(preservation_dir, 'premis.xml')
        # tostring() with an explicit encoding returns encoded bytes, so the
        # file is written in binary mode
        with open(path_premis, 'wb') as output_file:
            output_file.write(premis_xml)

    def createPremis(self):
        """
        Write metadata/preservation/premis.xml (below self.root_path).

        The document contains one 'representation' object, one object per
        file found below self.root_path/data (built by self.addObject) and a
        single software agent entry.

        @return: None
        """
        premis = P.premis({"version": "2.0"})
        premis.attrib['{%s}schemaLocation' % XSI_NS] = (
            "info:lc/xmlns/premis-v2 ../../schemas/premis-v2-2.xsd")

        # if there are no /data files, this will ensure that there is at least one object (the IP itself)
        premis_id = 'ID' + str(uuid.uuid4())
        premis.append(P.object(
            {
                q(XSI_NS, 'type'): 'representation',
                "xmlID": premis_id
            },
            P.objectIdentifier(
                P.objectIdentifierType('repository'),
                P.objectIdentifierValue('package-id-goes-here-?')),
        ))

        # create premis objects for files in this representation (self.root_path/data)
        data_dir = os.path.join(self.root_path, 'data')
        for directory, _, filenames in os.walk(data_dir):
            for filename in filenames:
                premis.append(self.addObject(os.path.join(directory, filename)))

        # NOTE: no PREMIS event is added here; only objects and the agent
        # are written.

        # add agent
        identifier_value = 'earkweb'
        premis.append(
            P.agent(
                P.agentIdentifier(P.agentIdentifierType('LOCAL'),
                                  P.agentIdentifierValue(identifier_value)),
                P.agentName('E-ARK AIP to DIP Converter'),
                P.agentType('Software')))

        premis_xml = etree.tostring(premis,
                                    encoding='UTF-8',
                                    pretty_print=True,
                                    xml_declaration=True)
        preservation_dir = os.path.join(self.root_path,
                                        'metadata/preservation')
        if not os.path.exists(preservation_dir):
            # makedirs (not mkdir): the intermediate 'metadata' directory may
            # not exist yet
            os.makedirs(preservation_dir)
        path_premis = os.path.join(preservation_dir, 'premis.xml')
        # tostring() with an explicit encoding returns encoded bytes, so the
        # file is written in binary mode
        with open(path_premis, 'wb') as output_file:
            output_file.write(premis_xml)
Exemplo n.º 5
0
class SolrClient(object):

    ffid = None

    def __init__(self, solr_server, collection):
        """
        Set up the collection URL and the file format identification helper

        @type       solr_server: SolrServer
        @param      solr_server: Solr server, queried for its base URL

        @type       collection: string
        @param      collection: Collection identifier, e.g. "samplecollection"
        """
        server_root = solr_server.get_base_url()
        # insert a slash only when the base URL does not already end in one
        separator = '' if server_root[-1] == '/' else '/'
        self.url = server_root + separator + collection
        self.ffid = FormatIdentification()

    def select_params_suffix(self, params_suffix, rows=1000, start=0):
        """
        Search Solr with a raw query string, return URL and JSON response

        @type       params_suffix: string
        @param      params_suffix: Query; it is URL-quoted and sent as the
                                   "q" parameter

        @type       rows: int
        @param      rows: Value of the "rows" request parameter

        @type       start: int
        @param      start: Value of the "start" request parameter

        @rtype: string, object
        @return: Request URL and the decoded JSON response body
        """
        url = self.url + '/select?q=%s&rows=%d&start=%d&wt=json' % (
            urllib.quote(params_suffix), rows, start)
        conn = urllib2.urlopen(url)
        try:
            return url, json.load(conn)
        finally:
            # release the connection even when decoding the body fails
            conn.close()

    def select(self, params):
        """
        Search Solr, return URL and JSON response

        @type       params: dict
        @param      params: Query parameters; "wt" is always sent as "json".
                            The passed mapping itself is left unmodified.

        @rtype: string, object
        @return: Request URL and the decoded JSON response body
        """
        # work on a copy so the caller's mapping does not silently gain 'wt'
        query = dict(params)
        query['wt'] = 'json'
        url = self.url + '/select?' + urllib.urlencode(query)
        conn = urllib2.urlopen(url)
        try:
            return url, json.load(conn)
        finally:
            # release the connection even when decoding the body fails
            conn.close()

    def delete(self, query):
        """
        Delete query result documents

        @type       query: string
        @param      query: Plain (not XML-escaped) query selecting the
                           documents to delete

        @rtype: string, string
        @return: Request URL and the text of the first "lst/int" element of
                 the XML response (presumably Solr's response header status)
        """
        from xml.sax.saxutils import escape

        url = self.url + '/update?'
        request = urllib2.Request(url)
        request.add_header('Content-Type', 'text/xml; charset=utf-8')
        # escape the query: characters such as '&' or '<' would otherwise
        # yield a malformed (or altered) delete message
        request.add_data(
            '<delete><query>{0}</query></delete>'.format(escape(query)))
        response = urllib2.urlopen(request)
        try:
            body = response.read()
        finally:
            response.close()
        status = etree.XML(body).findtext('lst/int')
        return url, status

    def update(self, docs):
        """
        Post a list of documents as one XML "add" message and commit

        @type       docs: list
        @param      docs: List of solr documents (dictionaries mapping field
                          names to values); fields with a falsy value are
                          left out of the message

        @rtype: string, string
        @return: Request URL and the text of the first "lst/int" element of
                 the XML response
        """
        add_xml = etree.Element('add')
        for doc in docs:
            doc_node = etree.SubElement(add_xml, 'doc')
            for field_name, content in doc.iteritems():
                if not content:
                    continue
                field_node = etree.SubElement(doc_node, 'field',
                                              name=field_name)
                if isinstance(content, unicode):
                    field_node.text = content
                else:
                    field_node.text = str(content)

        url = self.url + '/update?commit=true'
        request = urllib2.Request(url)
        request.add_header('Content-Type', 'text/xml; charset=utf-8')
        request.add_data(etree.tostring(add_xml, pretty_print=True))
        body = urllib2.urlopen(request).read()
        return url, etree.XML(body).findtext('lst/int')

    def post_file_document(self, file_path, identifier, entry):
        """
        Post a plain document (package, path, content type) for a single file

        The content type is looked up from the format identifier (PUID) that
        self.ffid determines for the file.

        @type       file_path: string
        @param      file_path: Absolute path to file

        @type       identifier: string
        @param      identifier: Identifier of the tar package

        @type       entry: string
        @param      entry: entry name

        @return: Status value returned by update()
        """
        mime_type = self.ffid.get_mime_for_puid(
            self.ffid.identify_file(file_path))
        plain_document = {
            "package": identifier,
            "path": entry,
            "content_type": mime_type,
        }
        return self.update([plain_document])[1]

    def post_tar_file(self, tar_file_path, identifier, progress_reporter=default_reporter):
        """
        Iterate over tar file and post documents it contains to Solr API (extract)

        Every regular file of the archive is extracted into a temporary
        directory and posted to the "update/extract" handler. When that
        request does not answer with HTTP status 200, a plain document is
        posted instead (see post_file_document). The temporary directory is
        removed and the tar file closed even when an error occurs.

        @type       tar_file_path: string
        @param      tar_file_path: Absolute path to tar file

        @type       identifier: string
        @param      identifier: Identifier of the tar package

        @type       progress_reporter: callable
        @param      progress_reporter: Called with the progress in percent

        @rtype: list(dict(string, int))
        @return: Return list of urls and return codes
        """
        import tarfile
        import tempfile

        progress_reporter(0)
        results = []
        # mkdtemp creates a private directory with an unpredictable name and
        # guarantees it exists when it is removed again below
        extract_dir = tempfile.mkdtemp(prefix='temp-')
        try:
            tfile = tarfile.open(tar_file_path, 'r')
            try:
                numfiles = sum(1 for tarinfo in tfile if tarinfo.isreg())
                logger.debug("Number of files in tarfile: %s " % numfiles)

                package_type = self._detect_package_type(
                    tfile, extract_dir, identifier)

                num = 0
                # NOTE(review): member names are used as stored in the
                # archive; confirm packages come from a trusted source or
                # validate the paths before extracting.
                for t in tfile:
                    # only regular files are counted in numfiles and can be
                    # opened for posting; skip directories, links, etc.
                    if not t.isreg():
                        continue
                    tfile.extract(t, extract_dir)
                    afile = os.path.join(extract_dir, t.name)
                    if not os.path.exists(afile):
                        continue

                    params = SolrDocParams(afile).get_params()
                    params['literal.packagetype'] = package_type
                    params['literal.package'] = identifier
                    params['literal.path'] = t.name
                    post_url = '%s/update/extract?%s' % (self.url, urllib.urlencode(params))
                    with open(afile, 'rb') as payload:
                        response = requests.post(
                            post_url, files={'file': ('userfile', payload)})
                    result = {"url": post_url, "status": response.status_code}
                    if response.status_code != 200:
                        status = self.post_file_document(afile, identifier, t.name)
                        # update() hands back the text of the response's
                        # first "lst/int" element, not an HTTP code;
                        # presumably Solr's header status, where 0 = success
                        if str(status) == '0':
                            logger.info("posting file failed for url '%s' with status code: %d (posted plain document instead)" % (post_url, response.status_code))
                        else:
                            logger.info("Unable to create document for url '%s'" % (post_url))
                    results.append(result)
                    num += 1
                    progress_reporter(num * 100 // numfiles)
            finally:
                tfile.close()
            self.commit()
            logger.debug("Files extracted to %s" % extract_dir)
        finally:
            # best-effort cleanup; must not mask an error raised above
            shutil.rmtree(extract_dir, ignore_errors=True)
        progress_reporter(100)
        return results

    def _detect_package_type(self, tfile, extract_dir, identifier):
        """
        Read the package type from "<identifier>/METS.xml" of an open tar file

        @type       tfile: tarfile.TarFile
        @param      tfile: Opened tar file

        @type       extract_dir: string
        @param      extract_dir: Directory the METS entry is extracted to

        @type       identifier: string
        @param      identifier: Identifier of the tar package

        @rtype: string
        @return: Package type from the METS file, "IP" if it cannot be read
        """
        package_type = "IP"
        mets_entry = "%s/METS.xml" % identifier
        try:
            tfile.extract(mets_entry, extract_dir)
        except KeyError:
            logger.warn("METS entry does not exist in TAR file: %s" % mets_entry)
            return package_type

        mets_path = os.path.join(extract_dir, identifier, "METS.xml")
        if not os.path.exists(mets_path):
            logger.warn("METS file does not exist: %s" % mets_path)
            return package_type

        try:
            mets = ParsedMets(extract_dir)
            mets.load_mets(mets_path)
            package_type = mets.get_package_type()
        except Exception:
            # deliberate best effort: indexing continues with the default
            logger.warn("Error loading METS from package during indexing, assigning default package type instead.")
        return package_type

    def commit(self):
        """
        Commit changes to Solr

        @rtype: string, int
        @return: Return url and return code
        """
        url = self.url + '/update?commit=true'
        response = urllib2.urlopen(url)
        try:
            return url, response.code
        finally:
            # only the status code is needed; release the connection
            response.close()