def test_get_checksum():
    """Test the function _get_checksum."""
    with pytest.raises(AttributeError):
        BagItArchiver._get_checksum('sha1:12')
    with pytest.raises(AttributeError):
        BagItArchiver._get_checksum('md5')
    assert BagItArchiver._get_checksum('md5:12') == '12'
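# The assertions above imply that ``_get_checksum`` accepts only values of
# the form 'md5:<value>' and returns the part after the colon. A minimal
# sketch consistent with the test (hypothetical -- not the actual
# invenio-sipstore implementation):
def _get_checksum_sketch(checksum, expected='md5'):
    """Return the checksum value from an '<algo>:<value>' string."""
    algo, _, value = checksum.partition(':')
    if algo != expected or not value:
        # Matches the behaviour asserted above: unsupported algorithms and
        # malformed values are rejected.
        raise AttributeError('Unsupported checksum: {0}'.format(checksum))
    return value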
def archive_sip(sip_uuid):
    """Send the SIP for archiving.

    Retries every 4 hours, six times, which should work for up to 24 hours
    archiving system downtime.

    :param sip_uuid: UUID of the SIP for archiving.
    :type sip_uuid: str
    """
    try:
        sip = SIPApi(SIP.query.get(sip_uuid))
        archiver = BagItArchiver(sip)
        bagmeta = archiver.get_bagit_metadata(sip)
        if bagmeta is None:
            raise ArchivingError(
                'Bagit metadata does not exist for SIP: {0}.'.format(sip.id))
        if sip.archived:
            raise ArchivingError(
                'SIP was already archived {0}.'.format(sip.id))
        archiver.write_all_files()
        sip.archived = True
        db.session.commit()
    except Exception as exc:
        # On ArchivingError (see above), do not retry, but re-raise
        if not isinstance(exc, ArchivingError):
            archive_sip.retry(exc=exc)
        raise
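# The retry schedule in the docstring of ``archive_sip`` is configured on the
# Celery task decorator rather than in the function body. A hedged sketch of
# what such a decorator could look like (illustrative assumption; the exact
# decorator arguments used by the real module are not shown here):
#
#     from celery import shared_task
#
#     @shared_task(ignore_result=True, max_retries=6,
#                  default_retry_delay=4 * 60 * 60)  # retry every 4 hours
#     def archive_sip(sip_uuid):
#         ...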
def test_constructor(sips):
    """Test the archiver constructor."""
    s = BaseArchiver(sips[0].model).sip
    s2 = BaseArchiver(sips[0]).sip
    assert isinstance(s, SIPApi)
    assert isinstance(s2, SIPApi)
    a = BagItArchiver(sips[1], patch_of=sips[0])
    a2 = BagItArchiver(sips[1].model, patch_of=sips[0].model)
    assert isinstance(a.sip, SIPApi)
    assert isinstance(a.patch_of, SIPApi)
    assert isinstance(a2.sip, SIPApi)
    assert isinstance(a2.patch_of, SIPApi)
def test_write_all_files(sips, archive_fs):
    """Test the functions used to create an export of the SIP."""
    sip = sips[0]
    archiver = BagItArchiver(sip)
    assert not len(archive_fs.listdir())
    archiver.write_all_files()
    assert len(archive_fs.listdir()) == 1
    fs = archive_fs.opendir(archiver.get_archive_subpath())
    assert set(fs.listdir()) == \
        set(['tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt',
             'bag-info.txt', 'data', ])
    assert set(fs.listdir('data')) == \
        set(['metadata', 'files', 'filenames.txt'])
    assert set(fs.listdir('data/metadata')) == \
        set(['marcxml-test.xml', 'json-test.json', ])
    assert set(fs.listdir('data/files')) == set(['foobar.txt', ])
def publish(self, pid=None, id_=None, user_id=None, sip_agent=None,
            spam_check=True):
    """Publish the Zenodo deposit."""
    self['owners'] = self['_deposit']['owners']
    self.validate_publish()
    if spam_check:
        self.spam_check()
    is_first_publishing = not self.is_published()

    deposit = super(ZenodoDeposit, self).publish(pid, id_)
    recid, record = deposit.fetch_published()
    pv = PIDVersioning(child=recid)
    is_new_version = pv.children.count() > 1
    # a) Fetch the last SIP from the previous version if it's a new version
    # b) Fetch the previous SIP if publishing the metadata edit
    if is_new_version or (not is_first_publishing):
        if is_new_version:
            sip_recid = pv.children.all()[-2]
        else:  # (not is_first_publishing)
            sip_recid = recid
        # Get the last SIP of the relevant recid, i.e.: either last
        # version or the current one
        sip_patch_of = (
            db.session.query(SIPModel)
            .join(RecordSIPModel, RecordSIPModel.sip_id == SIPModel.id)
            .filter(RecordSIPModel.pid_id == sip_recid.id)
            .order_by(SIPModel.created.desc())
            .first()
        )
    else:
        sip_patch_of = None

    recordsip = RecordSIP.create(
        recid, record, archivable=True,
        create_sip_files=is_first_publishing, user_id=user_id,
        agent=sip_agent)
    archiver = BagItArchiver(
        recordsip.sip, include_all_previous=(not is_first_publishing),
        patch_of=sip_patch_of)
    archiver.save_bagit_metadata()
    return deposit
def test_save_bagit_metadata(sips):
    """Test saving of bagit metadata."""
    sip = sips[0]
    assert not BagItArchiver.get_bagit_metadata(sip)
    archiver = BagItArchiver(sip)
    archiver.save_bagit_metadata()
    bmeta = BagItArchiver.get_bagit_metadata(sip, as_dict=True)
    file_m = next(f for f in bmeta['files'] if 'sipfilepath' in f)
    assert file_m['sipfilepath'] == 'foobar.txt'
    assert file_m['filepath'] == 'data/files/foobar.txt'

    sip.model.sip_files[0].filepath = 'changed.txt'
    with pytest.raises(Exception) as excinfo:
        archiver.save_bagit_metadata()
    assert 'Attempting to save' in str(excinfo.value)
    archiver.save_bagit_metadata(overwrite=True)
    bmeta = BagItArchiver.get_bagit_metadata(sip, as_dict=True)
    file_m = next(f for f in bmeta['files'] if 'sipfilepath' in f)
    assert file_m['sipfilepath'] == 'changed.txt'
    assert file_m['filepath'] == 'data/files/changed.txt'
def test_get_all_files(sips):
    """Test the function get_all_files."""
    archiver = BagItArchiver(sips[0])
    files = archiver.get_all_files()
    assert len(files) == 8
def test_write_patched(mocker, sips, archive_fs,
                       secure_sipfile_name_formatter):
    """Test the BagIt archiving with previous SIP as a base."""
    # Mock the bagging date generation so the 'Bagging-Date' tag is predefined
    dt = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")
    mocker.patch('invenio_sipstore.archivers.bagit_archiver.BagItArchiver.'
                 '_generate_bagging_date', return_value=dt)
    arch1 = BagItArchiver(sips[0])
    arch1.write_all_files()
    arch2 = BagItArchiver(sips[1], patch_of=sips[0])
    arch2.write_all_files()
    arch3 = BagItArchiver(sips[2], patch_of=sips[1],
                          include_all_previous=True)
    arch3.write_all_files()
    arch5 = BagItArchiver(sips[4], patch_of=sips[2],
                          include_all_previous=True)
    arch5.write_all_files()
    # NOTE: We take only SIP-1, SIP-2, SIP-3 and SIP-5.
    # Enumeration of related objects follows the "sips" fixture naming
    fs1 = archive_fs.opendir(arch1.get_archive_subpath())
    fs2 = archive_fs.opendir(arch2.get_archive_subpath())
    fs3 = archive_fs.opendir(arch3.get_archive_subpath())
    fs5 = archive_fs.opendir(arch5.get_archive_subpath())
    assert len(fs1.listdir()) == 5
    assert len(fs2.listdir()) == 6  # Includes 'fetch.txt'
    assert len(fs3.listdir()) == 6  # Includes 'fetch.txt'
    assert len(fs5.listdir()) == 6  # Includes 'fetch.txt'

    # Check SIP-1,2,3,5 data contents
    assert set(fs1.listdir('data')) == \
        set(['files', 'metadata', 'filenames.txt'])
    assert len(fs1.listdir('data/files')) == 1
    assert len(fs1.listdir('data/metadata')) == 2
    assert set(fs2.listdir('data')) == \
        set(['files', 'metadata', 'filenames.txt'])
    assert len(fs2.listdir('data/files')) == 1
    assert len(fs2.listdir('data/metadata')) == 2
    assert set(fs3.listdir('data')) == \
        set(['files', 'metadata', 'filenames.txt'])
    assert len(fs3.listdir('data/files')) == 1
    assert len(fs3.listdir('data/metadata')) == 2
    assert set(fs5.listdir('data')) == \
        set(['metadata', 'filenames.txt'])
    assert len(fs5.listdir('data/metadata')) == 1

    # Fetch the filenames for easier fixture formatting below
    file1_fn = '{0}-foobar.txt'.format(
        fetch_file_endswith(sips[0], 'foobar.txt').file_id)
    file2_fn = '{0}-foobar2.txt'.format(
        fetch_file_endswith(sips[1], 'foobar2.txt').file_id)
    file3_fn = '{0}-foobar3.txt'.format(
        fetch_file_endswith(sips[2], 'foobar3.txt').file_id)
    file2_rn_fn = '{0}-foobar2-renamed.txt'.format(
        fetch_file_endswith(sips[2], 'foobar2-renamed.txt').file_id)
    assert file2_fn[:36] == file2_rn_fn[:36]
    # Both file2_fn and file2_rn_fn are referring to the same FileInstance,
    # so their UUID prefix should match

    expected_sip1 = [
        ('data/files/{0}'.format(file1_fn), 'test'),
        ('data/metadata/marcxml-test.xml', '<p>XML 1</p>'),
        ('data/metadata/json-test.json', '{"title": "JSON 1"}'),
        ('bagit.txt',
         'BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8'),
        ('manifest-md5.txt', set([
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/files/{0}'.format(file1_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/metadata/marcxml-test.xml')),
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/metadata/json-test.json')),
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/filenames.txt')),
        ])),
        ('data/filenames.txt', set([
            '{0} foobar.txt'.format(file1_fn),
        ])),
        ('bag-info.txt',
         ("Source-Organization: European Organization for Nuclear Research\n"
          "Organization-Address: CERN, CH-1211 Geneva 23, Switzerland\n"
          "Bagging-Date: {0}\n".format(dt) +
          "Payload-Oxum: 93.4\n"
          "External-Identifier: {0}/SIPBagIt-v1.0.0\n".format(sips[0].id) +
          "External-Description: BagIt archive of SIP.")),
    ]

    expected_sip2 = [
        ('data/files/{0}'.format(file2_fn), 'test-second'),
        ('data/metadata/marcxml-test.xml', '<p>XML 2</p>'),
        ('data/metadata/json-test.json', '{"title": "JSON 2"}'),
        ('bagit.txt',
         'BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8'),
        ('fetch.txt', set([
            "{0} {1} {2}".format(
                fs1.getsyspath('data/files/{0}'.format(file1_fn)), 4,
                'data/files/{0}'.format(file1_fn)),
        ])),
        ('manifest-md5.txt', set([
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/files/{0}'.format(file1_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs2, 'data/files/{0}'.format(file2_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs2, 'data/metadata/marcxml-test.xml')),
            "{checksum} {filepath}".format(
                **_read_file(fs2, 'data/metadata/json-test.json')),
            "{checksum} {filepath}".format(
                **_read_file(fs2, 'data/filenames.txt')),
        ])),
        ('data/filenames.txt', set([
            '{0} foobar.txt'.format(file1_fn),
            '{0} foobar2.txt'.format(file2_fn),
        ])),
        ('bag-info.txt',
         ("Source-Organization: European Organization for Nuclear Research\n"
          "Organization-Address: CERN, CH-1211 Geneva 23, Switzerland\n"
          "Bagging-Date: {0}\n".format(dt) +
          "Payload-Oxum: 165.5\n"
          "External-Identifier: {0}/SIPBagIt-v1.0.0\n".format(sips[1].id) +
          "External-Description: BagIt archive of SIP.")),
    ]

    expected_sip3 = [
        ('data/files/{0}'.format(file3_fn), 'test-third'),
        ('data/metadata/marcxml-test.xml', '<p>XML 3</p>'),
        ('data/metadata/json-test.json', '{"title": "JSON 3"}'),
        ('bagit.txt',
         'BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8'),
        ('fetch.txt', set([
            "{0} {1} {2}".format(
                fs1.getsyspath('data/files/{0}'.format(file1_fn)), 4,
                'data/files/{0}'.format(file1_fn)),
            # Explanation on entry below: The file is fetched using the
            # original filename (file2_fn) as it will be archived in SIP-2,
            # however the new destination has the 'renamed' filename
            # (file2_rn_fn). This is correct and expected behaviour.
            "{0} {1} {2}".format(
                fs2.getsyspath('data/files/{0}'.format(file2_fn)), 11,
                'data/files/{0}'.format(file2_rn_fn)),
        ])),
        ('manifest-md5.txt', set([
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/files/{0}'.format(file1_fn))),
            # Manifest also specifies the renamed filename for File-2
            "{checksum} data/files/{newfilename}".format(
                newfilename=file2_rn_fn,
                **_read_file(fs2, 'data/files/{0}'.format(file2_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/files/{0}'.format(file3_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/metadata/marcxml-test.xml')),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/metadata/json-test.json')),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/filenames.txt')),
        ])),
        ('data/filenames.txt', set([
            '{0} foobar.txt'.format(file1_fn),
            '{0} foobar2.txt'.format(file2_fn),
            '{0} foobar3.txt'.format(file3_fn),
        ])),
        ('bag-info.txt',
         ("Source-Organization: European Organization for Nuclear Research\n"
          "Organization-Address: CERN, CH-1211 Geneva 23, Switzerland\n"
          "Bagging-Date: {0}\n".format(dt) +
          "Payload-Oxum: 236.6\n"
          "External-Identifier: {0}/SIPBagIt-v1.0.0\n".format(sips[2].id) +
          "External-Description: BagIt archive of SIP.")),
    ]

    expected_sip5 = [
        ('data/metadata/marcxml-test.xml', '<p>XML 5 Meta Only</p>'),
        ('bagit.txt',
         'BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8'),
        ('fetch.txt', set([
            "{0} {1} {2}".format(
                fs1.getsyspath('data/files/{0}'.format(file1_fn)), 4,
                'data/files/{0}'.format(file1_fn)),
            # As in "expected_sip3" above, the file is fetched using the
            # original filename (file2_fn) as it will be archived in SIP-2,
            # however the new destination has the 'renamed' filename
            # (file2_rn_fn). This is correct and expected behaviour.
            "{0} {1} {2}".format(
                fs2.getsyspath('data/files/{0}'.format(file2_fn)), 11,
                'data/files/{0}'.format(file2_rn_fn)),
            "{0} {1} {2}".format(
                fs3.getsyspath('data/files/{0}'.format(file3_fn)), 10,
                'data/files/{0}'.format(file3_fn)),
        ])),
        ('manifest-md5.txt', set([
            "{checksum} {filepath}".format(
                **_read_file(fs1, 'data/files/{0}'.format(file1_fn))),
            # Manifest also specifies the renamed filename for File-2
            "{checksum} data/files/{newfilename}".format(
                newfilename=file2_rn_fn,
                **_read_file(fs2, 'data/files/{0}'.format(file2_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs3, 'data/files/{0}'.format(file3_fn))),
            "{checksum} {filepath}".format(
                **_read_file(fs5, 'data/metadata/marcxml-test.xml')),
            "{checksum} {filepath}".format(
                **_read_file(fs5, 'data/filenames.txt')),
        ])),
        ('data/filenames.txt', set([
            '{0} foobar.txt'.format(file1_fn),
            '{0} foobar2.txt'.format(file2_fn),
            '{0} foobar3.txt'.format(file3_fn),
        ])),
        ('bag-info.txt',
         ("Source-Organization: European Organization for Nuclear Research\n"
          "Organization-Address: CERN, CH-1211 Geneva 23, Switzerland\n"
          "Bagging-Date: {0}\n".format(dt) +
          "Payload-Oxum: 227.5\n"
          "External-Identifier: {0}/SIPBagIt-v1.0.0\n".format(sips[4].id) +
          "External-Description: BagIt archive of SIP.")),
    ]

    for fs, expected in [(fs1, expected_sip1), (fs2, expected_sip2),
                         (fs3, expected_sip3), (fs5, expected_sip5)]:
        for fn, exp_content in expected:
            with fs.open(fn) as fp:
                if isinstance(exp_content, set):
                    content = set(fp.read().splitlines())
                else:
                    content = fp.read()
                assert content == exp_content
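# The fixtures above build 'fetch.txt' entries using the BagIt line format
# '<url> <size-in-bytes> <destination filepath>'. A small helper for parsing
# such lines (illustrative sketch only; it assumes sizes are always numeric
# as in these fixtures and is not part of invenio-sipstore):
def parse_fetch_line(line):
    """Split a fetch.txt line into (url, size, filepath)."""
    url, size, filepath = line.split(' ', 2)
    return url, int(size), filepath


# Example shaped like the entries asserted above:
# parse_fetch_line('/archive/1/r/<ts>/data/files/<uuid>-foobar.txt 4 '
#                  'data/files/<uuid>-foobar.txt')
# -> ('/archive/1/r/<ts>/data/files/<uuid>-foobar.txt', 4,
#     'data/files/<uuid>-foobar.txt')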
def test_archiving(app, db, deposit, deposit_file, locations, archive_fs):
    """Test ZenodoSIP archiving."""
    # Stash the configuration and enable writing
    orig = app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED']
    app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED'] = True
    deposit.files['test2.txt'] = BytesIO(b'test-two')
    deposit_v1 = publish_and_expunge(db, deposit)
    recid_v1, record_v1 = deposit_v1.fetch_published()
    recid_v1_id = recid_v1.id
    # Record files after publishing: 'test.txt', 'test2.txt'
    sip1 = SIP(SIPModel.query.one())
    sip1_id = sip1.id

    # Edit the metadata
    deposit_v1 = deposit_v1.edit()
    deposit_v1['title'] = "New title"
    deposit_v1 = publish_and_expunge(db, deposit_v1)
    # Record files after publishing: 'test.txt', 'test2.txt'
    sip2_id = SIPModel.query.order_by(SIPModel.created.desc()).first().id

    # Create a new version
    deposit_v1.newversion()
    recid_v1 = PersistentIdentifier.query.get(recid_v1_id)
    pv = PIDVersioning(child=recid_v1)
    depid_v2 = pv.draft_child_deposit
    deposit_v2 = ZenodoDeposit.get_record(depid_v2.object_uuid)
    del deposit_v2.files['test.txt']
    deposit_v2.files['test3.txt'] = BytesIO(b('test-three'))
    deposit_v2 = publish_and_expunge(db, deposit_v2)
    # Record files after publishing: 'test2.txt', 'test3.txt'

    sip1 = SIP(SIPModel.query.get(sip1_id))
    sip2 = SIP(SIPModel.query.get(sip2_id))
    sip3 = SIP(SIPModel.query.order_by(SIPModel.created.desc()).first())

    # Because we are using secure_filename when writing SIPFiles we need to
    # generate the correct names: <SIPFile.id>-<secure_filename>
    s1_file1_fn = '{0}-test.txt'.format(fetch_suff(sip1, 'test.txt').file_id)
    s1_file1_fp = 'data/files/{0}'.format(s1_file1_fn)

    s1_file2_fn = '{0}-test2.txt'.format(
        fetch_suff(sip1, 'test2.txt').file_id)
    s1_file2_fp = 'data/files/{0}'.format(s1_file2_fn)

    s3_file2_fn = '{0}-test2.txt'.format(
        fetch_suff(sip3, 'test2.txt').file_id)
    s3_file2_fp = 'data/files/{0}'.format(s3_file2_fn)

    s3_file3_fn = '{0}-test3.txt'.format(
        fetch_suff(sip3, 'test3.txt').file_id)
    s3_file3_fp = 'data/files/{0}'.format(s3_file3_fn)

    sip1_bagmeta = json.loads(next(
        m.content for m in sip1.metadata
        if m.type.name == 'bagit'))['files']
    sip2_bagmeta = json.loads(next(
        m.content for m in sip2.metadata
        if m.type.name == 'bagit'))['files']
    sip3_bagmeta = json.loads(next(
        m.content for m in sip3.metadata
        if m.type.name == 'bagit'))['files']

    # Check if Bagit metadata contains the correct file-fetching information
    assert set([f['filepath'] for f in sip1_bagmeta]) == \
        set([s1_file1_fp, s1_file2_fp, 'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt'])
    assert not BagItArchiver._is_fetched(
        get_m_item(sip1_bagmeta, s1_file1_fp))
    assert not BagItArchiver._is_fetched(
        get_m_item(sip1_bagmeta, s1_file2_fp))

    assert set([f['filepath'] for f in sip2_bagmeta]) == \
        set([s1_file1_fp, s1_file2_fp, 'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt',
             'fetch.txt'])
    # Both files should be fetched since it's only a metadata-edit submission
    assert BagItArchiver._is_fetched(
        get_m_item(sip2_bagmeta, s1_file1_fp))
    assert BagItArchiver._is_fetched(
        get_m_item(sip2_bagmeta, s1_file2_fp))

    assert set([f['filepath'] for f in sip3_bagmeta]) == \
        set([s3_file2_fp, s3_file3_fp, 'data/filenames.txt',
             'data/metadata/record-json.json', 'bag-info.txt',
             'manifest-md5.txt', 'bagit.txt', 'tagmanifest-md5.txt',
             'fetch.txt'])
    # The first file should be fetched from the previous version and the new
    # file should be archived in this bag.
    assert BagItArchiver._is_fetched(
        get_m_item(sip3_bagmeta, s3_file2_fp))
    assert not BagItArchiver._is_fetched(
        get_m_item(sip3_bagmeta, s3_file3_fp))

    archiver1 = BagItArchiver(sip1)
    archiver2 = BagItArchiver(sip2)
    archiver3 = BagItArchiver(sip3)

    # Each archiver subpath follows: '<recid>/r/<ISO-8601-SIP-timestamp>'
    sip1_ts = arrow.get(sip1.model.created).isoformat()
    sip2_ts = arrow.get(sip2.model.created).isoformat()
    sip3_ts = arrow.get(sip3.model.created).isoformat()
    assert archiver1.get_archive_subpath() == '2/r/{0}'.format(sip1_ts)
    assert archiver2.get_archive_subpath() == '2/r/{0}'.format(sip2_ts)
    assert archiver3.get_archive_subpath() == '3/r/{0}'.format(sip3_ts)

    # As a test, write the SIPs in reverse chronological order
    assert not sip1.archived
    assert not sip2.archived
    assert not sip3.archived
    archive_sip.delay(sip3.id)
    archive_sip.delay(sip2.id)
    archive_sip.delay(sip1.id)
    assert sip1.archived
    assert sip2.archived
    assert sip3.archived

    fs1 = archive_fs.opendir(archiver1.get_archive_subpath())
    assert set(fs1.listdir()) == set(
        ['tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt',
         'bag-info.txt', 'data'])
    assert set(fs1.listdir('data')) == set(
        ['metadata', 'files', 'filenames.txt'])
    assert fs1.listdir('data/metadata') == ['record-json.json', ]
    assert set(fs1.listdir('data/files')) == set([s1_file1_fn, s1_file2_fn])

    fs2 = archive_fs.opendir(archiver2.get_archive_subpath())
    assert set(fs2.listdir()) == set(
        ['tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt',
         'bag-info.txt', 'data', 'fetch.txt'])
    # The second SIP has written only the metadata, so there should be no
    # 'files/' directory, but 'filenames.txt' should still be there because
    # of the fetch.txt
    assert set(fs2.listdir('data')) == set(['metadata', 'filenames.txt'])
    assert fs2.listdir('data/metadata') == ['record-json.json', ]
    with fs2.open('fetch.txt') as fp:
        cnt = fp.read().splitlines()
    # The fetch.txt entries should correctly refer to the files from the
    # first archive
    base_uri = archiver1.get_archive_base_uri()
    assert set(cnt) == set([
        '{base}/2/r/{s1ts}/{fn} 4 {fn}'.format(fn=s1_file1_fp, base=base_uri,
                                               s1ts=sip1_ts),
        '{base}/2/r/{s1ts}/{fn} 8 {fn}'.format(fn=s1_file2_fp, base=base_uri,
                                               s1ts=sip1_ts),
    ])

    fs3 = archive_fs.opendir(archiver3.get_archive_subpath())
    assert set(fs3.listdir()) == set(
        ['tagmanifest-md5.txt', 'bagit.txt', 'manifest-md5.txt',
         'bag-info.txt', 'data', 'fetch.txt'])
    # The third SIP should write only the extra 'test3.txt' file
    assert set(fs3.listdir('data')) == set(
        ['metadata', 'files', 'filenames.txt'])
    assert fs3.listdir('data/metadata') == ['record-json.json', ]
    assert fs3.listdir('data/files') == [s3_file3_fn, ]
    with fs3.open('fetch.txt') as fp:
        cnt = fp.read().splitlines()
    # Since 'test.txt' was removed in the third SIP, we should only fetch
    # 'test2.txt', also from the first archive, since that's where this
    # file resides physically.
    base_uri = archiver1.get_archive_base_uri()
    assert set(cnt) == set([
        '{base}/2/r/{s1ts}/{fn} 8 {fn}'.format(fn=s3_file2_fp, base=base_uri,
                                               s1ts=sip1_ts),
    ])
    app.config['SIPSTORE_ARCHIVER_WRITING_ENABLED'] = orig