Example #1
    def createMetadata(self, form):
        entry = sword2.Entry()
        p = self.paper
        entry.add_field('title', p.title)
        for a in p.authors:
            if a.orcid:
                entry.add_author(unicode(a),
                                 uri='http://{}/{}'.format(
                                     settings.ORCID_BASE_DOMAIN, a.orcid))
            else:
                entry.add_author(unicode(a))
        if p.abstract:
            entry.add_field('dcterms_abstract', p.abstract)
        entry.add_field('dcterms_issued', p.pubdate.isoformat())
        for pub in p.publications:
            entry.add_field('dcterms_identifier', 'doi:' + pub.doi)
            if pub.journal and pub.journal.issn:
                entry.add_field('dcterms_isPartOf', 'issn:' + pub.journal.issn)

        for rec in p.oairecords:
            entry.add_field('dcterms_source', rec.splash_url)

        entry.add_field('dcterms_type', p.doctype)

        return entry
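For orientation, an entry built this way is normally handed to a sword2 Connection for deposit, as the later examples do. The snippet below is only a sketch: the service document URL, credentials and collection IRI are placeholders, not values from this project.

# Illustrative sketch only: depositing an Entry like the one built above.
import sword2

conn = sword2.Connection('https://repo.example.org/sword2/servicedocument',
                         user_name='depositor', user_pass='secret')

entry = sword2.Entry()
entry.add_field('title', 'An example paper')
entry.add_field('dcterms_issued', '2015-01-01')

# Connection.create() is the same call used in Examples #4 and #8 below.
receipt = conn.create(col_iri='https://repo.example.org/sword2/collection/1',
                      metadata_entry=entry,
                      in_progress=True)
print(receipt.edit, receipt.edit_media)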
Example #2
    def __init__(self, entry=None, title=None, dataverse=None, edit_uri=None,
                 edit_media_uri=None, statement_uri=None, **kwargs):

        # Generate sword entry
        sword_entry = sword2.Entry(entry)
        if not get_elements(sword_entry.pretty_print(), namespace='dcterms', tag='title'):
            # Append title to entry
            if isinstance(title, basestring):
                sword_entry.add_field(format_term('title'), title)
            else:
                raise DataverseException('Study needs a single, valid title.')
        if kwargs:
            # Updates sword entry from keyword arguments
            for k in kwargs.keys():
                if isinstance(kwargs[k], list):
                    for item in kwargs[k]:
                        sword_entry.add_field(format_term(k), item)
                else:
                    sword_entry.add_field(format_term(k), kwargs[k])

        self.entry = sword_entry.pretty_print()
        self.dataverse = dataverse

        self.edit_uri = edit_uri
        self.edit_media_uri = edit_media_uri
        self.statement_uri = statement_uri
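The format_term helper used above is not part of this excerpt. A plausible stand-in, assuming it only maps a bare term such as 'title' onto the underscore-prefixed dcterms field names that sword2.Entry.add_field accepts in the other examples, could look like this; the project's real helper may differ.

# Hypothetical stand-in for format_term(); the project's real helper may differ.
def format_term(term, namespace='dcterms'):
    """Map a bare term like 'title' to the 'dcterms_title' form passed to
    sword2.Entry.add_field in the examples on this page."""
    return '{}_{}'.format(namespace, term)

# format_term('title')   -> 'dcterms_title'
# format_term('creator') -> 'dcterms_creator'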
Example #3
File: dspace.py  Project: GypsyBud/pyrdm
 def replace_deposit_metadata(self, receipt, **metadata_kwargs):
     """ Replace a deposit's metadata with that defined by **metadata_kwargs.
   Return a Receipt object for this replacement action. """
     e = sword2.Entry()
     e.add_fields(**metadata_kwargs)
     replace_receipt = self.connection.update(metadata_entry=e,
                                              dr=receipt,
                                              in_progress=True)
     return replace_receipt
Example #4
File: dspace.py  Project: GypsyBud/pyrdm
 def create_deposit_from_metadata(self,
                                  collection,
                                  in_progress=True,
                                  **metadata_kwargs):
     """ Create a deposit in a specified collection by providing metadata in **metadata_kwargs.
   Return a Receipt object for this transaction. """
     e = sword2.Entry()
     e.add_fields(**metadata_kwargs)
     receipt = self.connection.create(col_iri=collection.href,
                                      in_progress=in_progress,
                                      metadata_entry=e)
     return receipt
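As a rough usage sketch for Examples #3 and #4: the keyword arguments are forwarded unchanged to sword2.Entry.add_fields, so callers pass dcterms-style names. The dspace and collection objects below are placeholders for instances the project builds elsewhere, and the metadata values are made up.

# Illustrative calls only; `dspace` and `collection` are placeholders.
receipt = dspace.create_deposit_from_metadata(
    collection,
    in_progress=True,
    dcterms_title='Example dataset',
    dcterms_creator='Jane Doe',
    dcterms_abstract='A short description of the deposit.')

# The returned Receipt can later be used to replace the deposit's metadata.
dspace.replace_deposit_metadata(receipt, dcterms_title='Corrected title')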
Example #5
    def _delete_update_lom(self, package, delete_lom_ids):
        """
        Notifies LOM that AUs with `delete_lom_ids` will be deleted.

        Helper to update_package_status.
        """
        # Update LOM that local copies will be deleted
        entry = sword2.Entry(id="urn:uuid:{}".format(package.uuid))
        entry.register_namespace("lom", utils.NSMAP["lom"])
        for lom_id in delete_lom_ids:
            if lom_id:
                etree.SubElement(entry.entry,
                                 utils.PREFIX_NS["lom"] + "content",
                                 recrawl="false").text = lom_id
        LOGGER.debug("edit entry: %s", entry)
        # SWORD2 client doesn't handle 202 response correctly - implementing here
        # Correct function is self.sword_connection.update_metadata_for_resource
        headers = {
            "Content-Type": "application/atom+xml;type=entry",
            "Content-Length": str(len(str(entry))),
            "On-Behalf-Of": str(self.content_provider_id),
        }
        response, content = self.sword_connection.h.request(
            uri=package.misc_attributes["edit_iri"],
            method="PUT",
            headers=headers,
            payload=str(entry),
        )

        # Return with error message if response not 200
        LOGGER.debug("response code: %s", response["status"])
        if response["status"] != 200:
            if response["status"] == 202:  # Accepted - pushing new config
                return _(
                    "Lockss-o-matic is updating the config to stop harvesting.  Please try again to delete local files."
                )
            if response["status"] == 204:  # No Content - no matching AIP
                return _("Package %(uuid)s is not found in LOCKSS") % {
                    "uuid": package.uuid
                }
            if response["status"] == 409:  # Conflict - Files in AU with recrawl
                return _(
                    "There are files in the LOCKSS Archival Unit (AU) that do not have 'recrawl=false'."
                )
            return _(
                "Error %(error)s when requesting LOCKSS stop harvesting deleted files."
            ) % {
                "error": response["status"]
            }
        return None
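To see what the edit entry above serializes to, the standalone sketch below rebuilds the same lom:content pattern. The namespace URI and UUID are placeholders, since utils.NSMAP and the package are not shown in this excerpt.

# Standalone sketch of the lom:content construction used above.
import sword2
from lxml import etree

LOM_NS = 'http://example.org/lom'  # placeholder for utils.NSMAP['lom']

entry = sword2.Entry(id='urn:uuid:00000000-0000-0000-0000-000000000000')
entry.register_namespace('lom', LOM_NS)
for lom_id in ['au-id-1', 'au-id-2']:
    # One lom:content element per AU, with recrawl="false" so LOCKSS stops
    # harvesting that content.
    etree.SubElement(entry.entry, '{%s}content' % LOM_NS,
                     recrawl='false').text = lom_id
print(str(entry))  # the Atom XML that is PUT to the package's edit IRI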
Example #6
    def _delete_update_lom(self, package, delete_lom_ids):
        """
        Notifies LOM that AUs with `delete_lom_ids` will be deleted.

        Helper to update_package_status.
        """
        # Update LOM that local copies will be deleted
        entry = sword2.Entry(id='urn:uuid:{}'.format(package.uuid))
        entry.register_namespace('lom', utils.NSMAP['lom'])
        for lom_id in delete_lom_ids:
            if lom_id:
                etree.SubElement(entry.entry,
                                 utils.PREFIX_NS['lom'] + 'content',
                                 recrawl='false').text = lom_id
        LOGGER.debug('edit entry: %s', entry)
        # SWORD2 client doesn't handle 202 response correctly - implementing here
        # Correct function is self.sword_connection.update_metadata_for_resource
        headers = {
            'Content-Type': "application/atom+xml;type=entry",
            'Content-Length': str(len(str(entry))),
            'On-Behalf-Of': str(self.content_provider_id),
        }
        response, content = self.sword_connection.h.request(
            uri=package.misc_attributes['edit_iri'],
            method='PUT',
            headers=headers,
            payload=str(entry))

        # Return with error message if response not 200
        LOGGER.debug('response code: %s', response['status'])
        if response['status'] != 200:
            if response['status'] == 202:  # Accepted - pushing new config
                return 'Lockss-o-matic is updating the config to stop harvesting.  Please try again to delete local files.'
            if response['status'] == 204:  # No Content - no matching AIP
                return 'Package {} is not found in LOCKSS'.format(package.uuid)
            if response['status'] == 409:  # Conflict - Files in AU with recrawl
                return "There are files in the LOCKSS Archival Unit (AU) that do not have 'recrawl=false'."
            return 'Error {} when requesting LOCKSS stop harvesting deleted files.'.format(
                response['status'])
        return None
Example #7
    def _create_resource(self, package, output_files):
        """ Given a package, create an Atom resource entry to send to LOCKSS.

        Parses metadata for the Atom entry from the METS file, uses
        LOCKSS-o-matic-specific tags to describe size and checksums.
        """

        # Parse METS to get information for atom entry
        relative_mets_path = os.path.join(
            os.path.splitext(os.path.basename(package.current_path))[0],
            "data",
            'METS.{}.xml'.format(package.uuid))
        (mets_path, temp_dir) = package.extract_file(relative_mets_path)
        mets = etree.parse(mets_path)
        # Delete temp dir if created
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

        # Parse out name and description if found
        slug = str(package.uuid)
        title = os.path.basename(package.current_path)
        summary = 'AIP generated by Archivematica with uuid {}'.format(package.uuid)
        dublincore = mets.find('mets:dmdSec/mets:mdWrap[@MDTYPE="DC"]/mets:xmlData/dcterms:dublincore', namespaces=utils.NSMAP)
        if dublincore is not None:
            title = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=title)
            slug = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=slug)
            summary = dublincore.findtext('dcterms:description', namespaces=utils.NSMAP, default=summary)
        # Parse out Agent for author
        authors = mets.xpath(".//mets:mdWrap[@MDTYPE='PREMIS:AGENT']//mets:agentType[text()='organization']/ancestor::mets:agent/*/mets:agentIdentifierValue", namespaces=utils.NSMAP)
        author = authors[0].text if authors else None

        # Create atom entry
        entry = sword2.Entry(
            title=title,
            id='urn:uuid:' + package.uuid,
            author={'name': author},
            summary=summary)

        # Add each chunk to the atom entry
        if not self.pointer_root:
            self.pointer_root = etree.parse(package.full_pointer_file_path)
        entry.register_namespace('lom', utils.NSMAP['lom'])
        for index, file_path in enumerate(output_files):
            # Get external URL
            if len(output_files) == 1:
                external_url = self._download_url(package.uuid)
            else:
                external_url = self._download_url(package.uuid, index + 1)

            # Get checksum and size from pointer file (or generate if not found)
            file_e = self.pointer_root.find(".//mets:fileGrp[@USE='LOCKSS chunk']/mets:file[@ID='{}']".format(os.path.basename(file_path)), namespaces=utils.NSMAP)
            if file_e is not None:
                checksum_name = file_e.get('CHECKSUMTYPE')
                checksum_value = file_e.get('CHECKSUM')
                size = int(file_e.get('SIZE'))
            else:
                # Not split, generate
                try:
                    checksum = utils.generate_checksum(file_path,
                        self.checksum_type)
                except ValueError:  # Invalid checksum type
                    checksum = utils.generate_checksum(file_path, 'md5')
                checksum_name = checksum.name.upper().replace('SHA', 'SHA-')
                checksum_value = checksum.hexdigest()
                size = os.path.getsize(file_path)

            # Convert size to kB
            size = str(math.ceil(size / 1000))

            # Add new content entry and values
            entry.add_field('lom_content', external_url)
            content_entry = entry.entry[-1]
            content_entry.set('size', size)
            content_entry.set('checksumType', checksum_name)
            content_entry.set('checksumValue', checksum_value)

        LOGGER.debug('LOCKSS atom entry: %s', entry)
        return entry, slug
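The fallback branch above calls utils.generate_checksum, which is not shown here; judging from the .name and .hexdigest() accesses, it returns a hashlib-style object. A minimal sketch under that assumption (the project's real helper may differ):

# Minimal sketch of a generate_checksum-style helper, assuming it returns a
# hashlib object; the project's utils.generate_checksum may differ.
import hashlib

def generate_checksum(file_path, checksum_type='md5'):
    # hashlib.new() raises ValueError for unknown algorithm names, which the
    # caller above catches and retries with 'md5'.
    checksum = hashlib.new(checksum_type)
    with open(file_path, 'rb') as f:
        for block in iter(lambda: f.read(1024 * 1024), b''):
            checksum.update(block)
    return checksum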
Example #8
    def move_from_storage_service(self,
                                  source_path,
                                  destination_path,
                                  package=None):
        LOGGER.info('source_path: %s, destination_path: %s, package: %s',
                    source_path, destination_path, package)
        if package is None:
            LOGGER.warning('DSpace requires package param')
            return

        # This only handles compressed AIPs
        if not os.path.isfile(source_path):
            raise NotImplementedError(
                _('Storing in DSpace does not support uncompressed AIPs'))

        self._get_sword_connection()
        # Create item by depositing AtoM doc
        LOGGER.debug('Create SWORD2 entry')
        kwargs = self._get_metadata(source_path, package.uuid)
        entry = sword2.Entry(title=kwargs.get('dcterms_title'), **kwargs)

        destination_path = package.current_location.relative_path
        LOGGER.debug('POST SWORD2 entry %s %s', destination_path, entry)
        entry_receipt = self.sword_connection.create(
            col_iri=destination_path,
            in_progress=True,
            metadata_entry=entry,
        )

        # TODO store these in Package.misc_attributes
        LOGGER.info('Edit IRI: %s', entry_receipt.edit)
        LOGGER.info('Edit Media IRI: %s', entry_receipt.edit_media)
        LOGGER.info('Statement IRI: %s', entry_receipt.atom_statement_iri)

        # Split package
        upload_paths = self._split_package(source_path)

        for upload_path in upload_paths:
            LOGGER.info('Add file %s to %s', upload_path,
                        entry_receipt.edit_media)
            # Add file to DSpace item
            with open(upload_path, 'rb') as f:  # binary: AIP chunks are not text
                content = f.read()  # sword2 iterates over this twice

            # Note: Using sword2 here is problematic because httplib2 first
            # sends each request without credentials and only retries with
            # basic auth after getting a 401. This breaks with files over
            # 2097152 bytes.
            # A possible solution is to use a different http_impl in the
            # connection, but that returns incorrect URIs in the deposit
            # receipt.
            # LOGGER.debug('Using sword2')
            # self.sword_connection.add_file_to_resource(
            #     edit_media_iri=entry_receipt.edit_media,
            #     payload=content,
            #     filename=os.path.basename(upload_path),
            #     mimetype=mimetypes.guess_type(upload_path),
            # )

            # This replicates the sword2 behaviour but using requests for the basic auth
            LOGGER.debug('Using requests')
            headers = {
                # mimetypes.guess_type() returns a (type, encoding) tuple;
                # only the type belongs in Content-Type.
                'Content-Type': mimetypes.guess_type(upload_path)[0],
                # 'Content-MD5': str(md5sum),
                'Content-Length': str(os.path.getsize(upload_path)),
                'Content-Disposition': "attachment; filename=%s" % urllib.quote(
                    os.path.basename(upload_path)),
            }
            requests.post(entry_receipt.edit_media,
                          headers=headers,
                          data=content,
                          auth=(self.user, self.password))

        # Finalize deposit
        LOGGER.info('Complete deposit for %s', entry_receipt.edit)
        try:
            complete_receipt = self.sword_connection.complete_deposit(
                dr=entry_receipt)
        except Exception:
            LOGGER.error(
                'Error creating item: Status: %s, response: %s',
                self.sword_connection.history[-1]['payload']['response'].status,
                self.sword_connection.history[-1]['payload']['response'].resp)
            LOGGER.error(self.sword_connection.history[-1])
            raise
        LOGGER.info('Complete receipt: %s', complete_receipt)

        package.current_path = entry_receipt.atom_statement_iri
        package.save()

        # Fetch statement
        LOGGER.info(
            'Request Atom serialisation of the deposit statement from %s',
            entry_receipt.atom_statement_iri)
        try:
            statement = self.sword_connection.get_atom_sword_statement(
                entry_receipt.atom_statement_iri)
        except Exception:
            LOGGER.error(
                'Error creating item: Status: %s, response: %s',
                self.sword_connection.history[-1]['payload']['response'].status,
                self.sword_connection.history[-1]['payload']['response'].resp)
            LOGGER.error(self.sword_connection.history[-1])
            raise
        LOGGER.info('Statement: %s', statement.xml_document)

        # Get DSpace handle
        regex = r'bitstream/(?P<handle>\d+/\d+)/'  # regex to extract the DSpace handle
        match = re.search(regex, statement.original_deposits[0].id)
        if match:
            LOGGER.info('Handle: %s', match.group('handle'))
            handle = match.group('handle')
        else:
            LOGGER.warning('No match found in %s',
                           statement.original_deposits[0].id)
            return

        package.misc_attributes.update({'handle': handle})
        package.save()

        # Set permissions on metadata bitstreams
        self._set_permissions(package)
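For reference, the handle regex above extracts the 'prefix/suffix' pair from a bitstream URL; a quick check with a made-up URL:

# Quick illustration of the handle regex, using a made-up bitstream URL.
import re

url = 'https://dspace.example.org/bitstream/123456789/42/1/aip-chunk-001.zip'
match = re.search(r'bitstream/(?P<handle>\d+/\d+)/', url)
print(match.group('handle'))  # -> 123456789/42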