def test_fileinstance_copy_contents(app, db, dummy_location):
    """Test copying the contents of one file instance into another.

    A source instance is filled with data, an empty destination copies
    from it, and the test then checks that size and checksum match, that
    the URIs differ, that the progress callback fired and that the copied
    bytes can be read back.
    """
    progress = dict(called=False)

    def callback(total, size):
        progress['called'] = True

    # Create source and set data.
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Create destination - and use it to copy_contents from another object.
    dst = FileInstance.create()
    assert dst.size == 0
    assert dst.uri is None
    db.session.commit()

    # Copy contents
    dst.copy_contents(src, progress_callback=callback,
                      location=dummy_location)
    db.session.commit()
    assert dst.size == src.size
    assert dst.checksum == src.checksum
    assert dst.uri != src.uri
    assert progress['called']

    # Read data back. The handle is closed in ``finally`` so that a failing
    # comparison does not leak the open file.
    fp = dst.storage().open()
    try:
        assert data == fp.read()
    finally:
        fp.close()
def test_b2share_storage_with_pid(base_app, app, tmp_location, login_user,
                                  test_users):
    """Check that the storage class will redirect pid files.

    A file instance whose URI is a handle PID is attached to a bucket with
    storage class ``'B'``; fetching the object through the files REST API
    must answer with a 302 redirect to that PID.
    """
    pid = 'http://hdl.handle.net/11304/74c66f0b-f814-4202-9dcb-4889ba9b1047'
    with app.app_context():
        tmp_location = Location.query.first()
        with db.session.begin_nested():
            bucket = Bucket.create(tmp_location, storage_class='B')
            pid_file = FileInstance.create()
            pid_file.set_uri(pid, 1, 0, storage_class='B')
            ObjectVersion.create(bucket, 'test.txt', pid_file.id)

        db.session.commit()
        url = url_for('invenio_files_rest.object_api',
                      bucket_id=bucket.id,
                      key='test.txt')

    # Remember the original permission factory *before* entering ``try``:
    # if it were captured inside, an early failure would make the
    # ``finally`` clause raise NameError and hide the real error.
    with app.app_context():
        permission = current_files_rest.permission_factory

    try:
        # Disable access control for this test
        with app.app_context():
            current_files_rest.permission_factory = allow_all
        # Check that accessing the file redirects to the PID
        with app.test_client() as client:
            resp = client.get(url)
            assert resp.headers['Location'] == pid
            assert resp.status_code == 302
    finally:
        # Always restore the original permission factory.
        with app.app_context():
            current_files_rest.permission_factory = permission
def handle_record_files(data, bucket, files, skip_files):
    """Attach the files described in ``files`` to ``bucket``.

    For every entry a ``FileInstance`` pointing at the entry's ``uri`` is
    created and linked to a new ``ObjectVersion`` whose key is the last
    path segment of the URI. The entry dict is then updated in place with
    the ``bucket``, ``checksum``, ``key`` and ``version_id`` of the created
    object.

    :param data: Record metadata; only ``data.get('recid')`` is read, for
        the error message.
    :param bucket: Bucket passed to ``ObjectVersion.create``.
    :param files: Iterable of dicts, each with ``uri``, ``size`` and
        ``checksum`` keys.
    :param skip_files: When truthy nothing is processed at all.
    :raises AssertionError: If an entry lacks one of the required keys.
    """
    # The flag does not change while iterating, so check it once.
    if skip_files:
        return

    for file in files:
        assert 'uri' in file
        assert 'size' in file
        assert 'checksum' in file

        uri = file.get("uri")
        # Fallback label for the error message. Without it ``filename``
        # could be unbound (first iteration) or left over from the previous
        # file when the failure happens before the name is derived.
        filename = uri
        try:
            filename = uri.split('/')[-1]
            f = FileInstance.create()
            f.set_uri(uri, file.get("size"), file.get("checksum"))
            obj = ObjectVersion.create(
                bucket,
                filename,
                _file_id=f.id
            )
            file.update({
                'bucket': str(obj.bucket_id),
                'checksum': obj.file.checksum,
                'key': obj.key,
                'version_id': str(obj.version_id),
            })
        except Exception as e:
            # Best effort: report the failure and go on with the next file.
            click.echo(
                'Recid {0} file {1} could not be loaded due '
                'to {2}.'.format(data.get('recid'), filename, str(e)))
            continue
def test_object_relink_all(app, db, dummy_location):
    """Check that ``relink_all`` moves every object version to a new file."""

    def versions_linked_to(file_instance):
        # Number of object versions pointing at the given file instance.
        return ObjectVersion.query.filter_by(
            file_id=file_instance.id).count()

    bucket = Bucket.create()
    target = ObjectVersion.create(
        bucket, "relink-test", stream=BytesIO(b('relinkthis')))
    ObjectVersion.create(bucket, "do-not-touch", stream=BytesIO(b('na')))
    bucket.snapshot()
    db.session.commit()

    assert ObjectVersion.query.count() == 4
    assert FileInstance.query.count() == 2

    # New file instance holding a copy of the target's contents.
    replacement = FileInstance.create()
    replacement.copy_contents(target.file, location=bucket.location)
    db.session.commit()

    previous = target.file
    assert versions_linked_to(previous) == 2
    assert versions_linked_to(replacement) == 0

    ObjectVersion.relink_all(target.file, replacement)
    db.session.commit()

    assert versions_linked_to(previous) == 0
    assert versions_linked_to(replacement) == 2
def test_pyfilesystemstorage(app, db, dummy_location):
    """Save a stream through the pyfs storage and verify the result."""
    # Bucket and object whose file instance backs the storage.
    with db.session.begin_nested():
        bucket = Bucket.create()
        obj = ObjectVersion.create(bucket, "LICENSE")
        obj.file = FileInstance.create()

    storage = PyFilesystemStorage(obj.file, base_uri=obj.bucket.location.uri)
    progress = dict(size=0)

    def callback(total, size):
        progress['size'] = size

    payload = b("this is some content")
    loc, size, checksum = storage.save(
        BytesIO(payload), progress_callback=callback)

    # Verify checksum, size and location.
    digest = hashlib.md5()
    digest.update(payload)
    expected_checksum = "md5:{0}".format(digest.hexdigest())
    expected_location = join(dummy_location.uri, str(obj.file.id), "data")

    assert expected_checksum == checksum
    assert size == len(payload)
    assert loc == expected_location
def test_fileinstance_set_contents(app, db, dummy_location):
    """Check state transitions of a file instance when contents are set."""
    progress = {'called': False}

    def callback(total, size):
        progress['called'] = True

    instance = FileInstance.create()
    db.session.commit()
    # A fresh instance can be written to but not read.
    assert instance.readable is False
    assert instance.writable is True

    stream = BytesIO(b("test file instance set contents"))
    instance.set_contents(
        stream,
        default_location=dummy_location.uri,
        progress_callback=callback,
    )
    db.session.commit()
    # After writing, the flags are flipped and the callback has fired.
    assert instance.readable is True
    assert instance.writable is False
    assert progress['called']

    # Setting contents a second time is rejected.
    other_stream = BytesIO(b("different content"))
    with pytest.raises(ValueError):
        instance.set_contents(other_stream, location=dummy_location)
def test_pyfilesystemstorage_make_path():
    """Check how the storage partitions the file id into a path."""
    instance = FileInstance.create()
    instance.id = uuid.uuid5(uuid.NAMESPACE_DNS, 'Testing-')
    storage = PyFilesystemStorage(instance, base_uri='Base')

    # (positional arguments for make_path, expected path)
    expected_paths = (
        ((), 'Base/45/629316-6e69-5006-82ba-1ee2f18df5b2'),
        ((1, 1), 'Base/4/5629316-6e69-5006-82ba-1ee2f18df5b2'),
        ((3, 1), 'Base/4/5/6/29316-6e69-5006-82ba-1ee2f18df5b2'),
        ((1, 3), 'Base/456/29316-6e69-5006-82ba-1ee2f18df5b2'),
        # If length 0, it should take the default value.
        ((1, 0), 'Base/45/629316-6e69-5006-82ba-1ee2f18df5b2'),
        # If dimensions are 0, it should take the default value.
        ((0, 1), 'Base/4/5629316-6e69-5006-82ba-1ee2f18df5b2'),
    )
    for args, expected in expected_paths:
        assert expected == storage.make_path(*args)

    rejected = (
        (1, 50),   # Length of each partition is too long.
        (50, 1),   # Number of partitions is too high.
        (50, 50),  # Both values produce the exception.
    )
    for args in rejected:
        with pytest.raises(AssertionError):
            storage.make_path(*args)
def test_fileinstance_get(app, db, dummy_location):
    """Check lookup of file instances by id."""
    stored = FileInstance.create()
    db.session.commit()

    # An id that was just committed resolves to an instance.
    found = FileInstance.get(stored.id)
    assert found is not None

    # A random, never-stored id resolves to None.
    missing = FileInstance.get(uuid.uuid4())
    assert missing is None
def test_fileinstance_create(app, db, dummy_location):
    """Check the default state of a newly created file instance."""
    instance = FileInstance.create()
    assert instance.id
    assert instance.readable is False
    assert instance.writable is True
    assert instance.size == 0
    # Nothing has been stored or checked yet.
    assert instance.uri is None
    assert instance.checksum is None
    assert instance.last_check_at is None
    assert instance.last_check is None
    db.session.commit()

    # Check unique constraint on URI with none values: committing two more
    # instances without a URI must not fail.
    for _ in range(2):
        FileInstance.create()
    db.session.commit()
def data_policies(skip_files):
    """Load demo Data Policy records.

    Every ``*.json`` fixture found in the package's
    ``modules/fixtures/data`` directory is read; each entry becomes a
    record with its own bucket, is committed and then indexed.

    :param skip_files: When truthy, the ``files`` entries of a record are
        not attached to its bucket.
    """
    from invenio_db import db
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_records_files.api import Record
    from invenio_records.models import RecordMetadata

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/data-policies-v1.0.0.json'
    )
    fixtures_dir = pkg_resources.resource_filename(
        'cernopendata', 'modules/fixtures/data')
    fixture_paths = glob.glob(os.path.join(fixtures_dir, '*.json'))

    for fixture_path in fixture_paths:
        click.echo(
            'Loading data-policies from {0} ...'.format(fixture_path))

        with open(fixture_path, 'rb') as source:
            for metadata in json.load(source):
                attachments = metadata.pop('files', [])

                record_uuid = uuid.uuid4()
                cernopendata_recid_minter(record_uuid, metadata)
                metadata['$schema'] = schema
                record = Record.create(metadata, id_=record_uuid)

                bucket = Bucket.create()
                RecordsBuckets.create(
                    record=record.model, bucket=bucket)

                if not skip_files:
                    for attachment in attachments:
                        assert 'uri' in attachment
                        assert 'size' in attachment
                        assert 'checksum' in attachment

                        file_instance = FileInstance.create()
                        uri = attachment.get("uri")
                        # Object key is the last path segment of the URI.
                        key = uri.split('/')[-1]
                        file_instance.set_uri(
                            uri,
                            attachment.get("size"),
                            attachment.get("checksum"),
                        )
                        ObjectVersion.create(
                            bucket, key, _file_id=file_instance.id)

                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def test_storage_interface():
    """Check that the base storage leaves its operations unimplemented."""
    storage = Storage(FileInstance.create())

    # (bound method, positional arguments to call it with)
    unimplemented = (
        (storage.open, ()),
        (storage.send_file, ()),
        (storage.save, (None,)),
        (storage.compute_checksum, (None,)),
    )
    for method, args in unimplemented:
        with pytest.raises(NotImplementedError):
            method(*args)
def test_fileinstance_get_by_uri(app, db, dummy_location):
    """Test looking up a file instance by its URI.

    Covers a stored URI, an unknown URI and a ``None`` URI (expected to
    trip an assertion inside ``get_by_uri``).
    """
    f = FileInstance.create()
    f.uri = "LICENSE"
    db.session.commit()

    assert FileInstance.get_by_uri("LICENSE") is not None
    # This comparison previously had no ``assert`` in front of it, so the
    # unknown-URI case was evaluated but never actually checked.
    assert FileInstance.get_by_uri("NOTVALID") is None
    pytest.raises(AssertionError, FileInstance.get_by_uri, None)
def test_pyfs_send_file_fail(app, db, dummy_location):
    """Check that an OS error while streaming becomes a StorageError."""
    instance = FileInstance.create()
    instance.set_contents(BytesIO(b("test")), location=dummy_location)

    permission_error = OSError(errno.EPERM, "Permission problem")
    with patch('invenio_files_rest.storage.send_stream') as send_stream:
        # Make the patched streaming helper fail.
        send_stream.side_effect = permission_error
        with app.test_request_context():
            with pytest.raises(StorageError):
                instance.send_file()
def test_fileinstance_copy_contents_invalid(app, db, dummy_location):
    """Check the preconditions enforced by ``copy_contents``."""

    def assert_copy_rejected(destination, source):
        with pytest.raises(ValueError):
            destination.copy_contents(source)

    # Source not readable
    empty_source = FileInstance.create()
    dst = FileInstance.create()
    assert_copy_rejected(dst, empty_source)

    # Create valid source
    payload = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(payload), location=dummy_location)
    db.session.commit()

    # Destination not writable
    dst.writable = False
    assert_copy_rejected(dst, src)

    # Size is not 0
    dst.writable = True
    dst.size = 1
    assert_copy_rejected(dst, src)
def test_sip_file_model(db):
    """Check that a SIPFile linking a SIP and a file can be stored."""
    sip = SIP.create('json', '{}')
    file_instance = FileInstance.create()

    sip_file = SIPFile(
        sip_id=sip.id,
        filepath="foobar.zip",
        file_id=file_instance.id,
    )
    db.session.add(sip_file)
    db.session.commit()

    # Exactly one row of each kind exists afterwards.
    assert SIP.query.count() == 1
    assert SIPFile.query.count() == 1
def test_pyfilesystemstorage_checksum_fail(app, db, dummy_location):
    """Check that an OS error during checksumming becomes a StorageError."""

    def failing_callback(total, size):
        # Raise an error during checksum calculation
        raise OSError(errno.EPERM, "Permission")

    instance = FileInstance.create()
    instance.set_contents(BytesIO(b("test")), location=dummy_location)

    # The storage is built outside the ``raises`` block so that only the
    # checksum computation itself is expected to fail.
    storage = PyFilesystemStorage(instance)
    with pytest.raises(StorageError):
        storage.compute_checksum(progress_callback=failing_callback)
def test_fileinstance_send_file(app, db, dummy_location):
    """Check sending a file instance as a response."""
    instance = FileInstance.create()

    # File not readable
    with pytest.raises(ValueError):
        instance.send_file()

    # Write data
    payload = b("test file instance set contents")
    instance.set_contents(BytesIO(payload), location=dummy_location)
    db.session.commit()

    # Send data
    with app.test_request_context():
        response = instance.send_file()

    content_length = int(response.headers['Content-Length'])
    assert content_length == len(payload)
def create_file(self, bucket, key, file_versions):
    """Create one object per file version under ``key`` and return the last.

    :param bucket: Bucket passed to ``ObjectVersion.create``.
    :param key: Object key shared by all versions.
    :param file_versions: Sequence of dicts providing ``full_path``,
        ``size``, ``checksum`` and ``creation_date``.
    :returns: The object created for the last entry of ``file_versions``.
    """
    created = []
    for version in file_versions:
        checksum = 'md5:{0}'.format(version['checksum'])
        file_instance = FileInstance.create().set_uri(
            version['full_path'], version['size'], checksum)

        obj = ObjectVersion.create(bucket, key).set_file(file_instance)
        # Store the original creation date as a naive datetime.
        created_at = arrow.get(version['creation_date']).datetime
        obj.created = created_at.replace(tzinfo=None)
        created.append(obj)

    # Set head version
    db.session.commit()
    return created[-1]
def loaddemofiles(source, force=False):
    """Load a demo file and point every object version at it.

    A single ``FileInstance`` is registered for ``source`` (URI, size and
    MD5 checksum) and the ``file_id`` of all ``ObjectVersion`` rows is
    updated to reference it.

    :param source: Path of the file to register.
    :param force: Accepted for interface compatibility; not used here.
    """
    s = stat(source)

    # Hash in fixed-size chunks instead of reading the whole file at once,
    # so that large demo files do not have to fit in memory.
    m = hashlib.md5()
    with open(source, 'rb') as fp:
        for chunk in iter(lambda: fp.read(1024 * 1024), b''):
            m.update(chunk)
    checksum = "md5:{0}".format(m.hexdigest())

    # Create a file instance
    with db.session.begin_nested():
        f = FileInstance.create()
        f.set_uri(source, s.st_size, checksum)

        # Replace all objects associated files.
        ObjectVersion.query.update({ObjectVersion.file_id: str(f.id)})
    db.session.commit()
def create_b2safe_file(external_pids, bucket):
    """Create a FileInstance which contains a PID in its uri.

    ``external_pids`` is validated against a schema and checked for
    duplicate keys. Each entry's ``ePIC_PID`` is then normalised in place
    to start with the handle prefix, and a file instance with that URI is
    looked up or created and linked into ``bucket`` under the entry's key
    unless it is already there.

    :raises InvalidDepositError: On duplicate keys, on a key starting with
        ``/``, or when an ``IntegrityError`` is raised while creating the
        file instance or object version.
    """
    handle_prefix = 'http://hdl.handle.net/'
    pid_schema = {
        'type': 'array',
        'items': {
            'type': 'object',
            'properties': {
                'ePIC_PID': {'type': 'string'},
                'key': {'type': 'string'}
            },
            'additionalProperties': False,
            'required': ['ePIC_PID', 'key']
        }
    }
    validate_schema(external_pids, pid_schema)

    keys = [entry['key'] for entry in external_pids]
    if len(set(keys)) != len(keys):
        raise InvalidDepositError([
            FieldError('external_pids',
                       'Field external_pids contains duplicate keys.')])

    for entry in external_pids:
        if not entry['ePIC_PID'].startswith(handle_prefix):
            entry['ePIC_PID'] = handle_prefix + entry['ePIC_PID']
        if entry['key'].startswith('/'):
            raise InvalidDepositError([
                FieldError('external_pids',
                           'File key cannot start with a "/".')])

        pid_uri = entry['ePIC_PID']
        try:
            # Create the file instance if it does not already exist
            file_instance = FileInstance.get_by_uri(pid_uri)
            if file_instance is None:
                file_instance = FileInstance.create()
                file_instance.set_uri(pid_uri, 1, 0, storage_class='B')
            assert file_instance.storage_class == 'B'

            # Add the file to the bucket if it is not already in it
            current_version = ObjectVersion.get(bucket, entry['key'])
            already_linked = (
                current_version and
                current_version.file_id == file_instance.id
            )
            if not already_linked:
                ObjectVersion.create(bucket, entry['key'], file_instance.id)
        except IntegrityError:
            raise InvalidDepositError([
                FieldError('external_pids', 'File URI already exists.')])
def test_pyfilesystemstorage(app, db, dummy_location):
    """Save through the pyfs storage with and without a declared size."""
    # Create bucket and object
    with db.session.begin_nested():
        bucket = Bucket.create()
        obj = ObjectVersion.create(bucket, "LICENSE")
        obj.file = FileInstance.create()

    storage = PyFilesystemStorage(obj.file, base_uri=obj.bucket.location.uri)
    progress = dict(size=0)

    def callback(total, size):
        progress['size'] = size

    def save_and_verify(payload, **kwargs):
        loc, size, checksum = storage.save(
            BytesIO(payload), progress_callback=callback, **kwargs)

        # Verify checksum, size and location.
        digest = hashlib.md5()
        digest.update(payload)
        assert "md5:{0}".format(digest.hexdigest()) == checksum
        assert size == len(payload)
        file_id = str(obj.file.id)
        assert loc == join(
            dummy_location.uri, file_id[0:2], file_id[2:], 'data')

    content = b("this is some content")
    actual_size = len(content)

    # test without size
    save_and_verify(content)
    # test with correct size
    save_and_verify(content, size=actual_size)
    # test with wrong sizes (one byte short, then one byte over)
    for wrong_size in (actual_size - 1, actual_size + 1):
        with pytest.raises(UnexpectedFileSizeError):
            save_and_verify(content, size=wrong_size)
def upload(self, pid=None, *args, **kwargs):
    """Upload action for file/repository.

    Reads ``url`` and ``type`` from the JSON request body, registers an
    empty file under the derived filename in the record's bucket and then
    queues the matching download task.

    :raises FileUploadError: When the URL does not start with one of the
        accepted prefixes for the requested type.
    :returns: ``self``.
    """
    repo_prefixes = ('https://github', 'https://gitlab.cern.ch')
    file_prefixes = repo_prefixes + ('root://',)

    with UpdateDepositPermission(self).require(403):
        data = request.get_json()
        fileinfo = self._construct_fileinfo(data['url'], data['type'])

        if not request:
            return self

        _, record = request.view_args.get('pid_value').data
        record_id = str(record.id)
        filename = fileinfo['filename']

        # Register a placeholder object; the queued task downloads the data.
        obj = ObjectVersion.create(bucket=record.files.bucket, key=filename)
        obj.file = FileInstance.create()
        record.files.flush()
        record.files[filename]['source_url'] = data['url']

        source_url = data['url']
        if data['type'] == 'url':
            if not source_url.startswith(file_prefixes):
                raise FileUploadError('Please provide a valid file url.')
            download_url.delay(record_id, source_url, fileinfo)
        else:
            if not source_url.startswith(repo_prefixes):
                raise FileUploadError(
                    'Please provide a valid repository url.')
            download_repo.delay(record_id, source_url, filename)

    return self
def run(self, preset_quality, sleep_time=5, *args, **kwargs):
    """Launch video transcoding.

    For each of the presets generate a new ``ObjectVersion`` tagged as
    slave with the preset name as key and a link to the master version.

    The method creates the target ``FileInstance`` and ``ObjectVersion``,
    starts the encoding job, polls its status until it reports
    ``'Finished'`` and finally records URI, size and MD5 checksum of the
    transcoded file on the file instance.

    :param self: reference to instance of task base class
    :param preset_quality: preset quality to use for transcoding.
    :param sleep_time: time interval between requests for the Sorenson
        status.
    :raises Ignore: after setting the task state to ``REVOKED`` when
        starting the encoding raises ``InvalidResolutionError`` or
        ``TooHighResolutionError``.
    :raises RuntimeError: when the polled job status is ``'Error'``.
    """
    self._base_payload.update(preset_quality=preset_quality)

    # Get master file's bucket_id
    bucket_id = self.object.bucket_id
    bucket_location = self.object.bucket.location.uri
    # Get master file's key
    master_key = self.object.key

    tags = self.object.get_tags()
    # Get master file's aspect ratio
    aspect_ratio = tags['display_aspect_ratio']
    # Get master file's width x height (None when the tag is absent)
    width = int(tags['width']) if 'width' in tags else None
    height = int(tags['height']) if 'height' in tags else None

    with db.session.begin_nested():
        # Create FileInstance
        file_instance = FileInstance.create()

        # Create ObjectVersion
        obj_key = self._build_slave_key(preset_quality=preset_quality,
                                        master_key=master_key)
        obj = ObjectVersion.create(bucket=bucket_id, key=obj_key)

        # Extract new location
        storage = file_instance.storage(default_location=bucket_location)
        directory, filename = storage._get_fs()

        input_file = self.object.file.uri
        # XRootDPyFS doesn't implement root_path
        # NOTE(review): the first form is tried and ``root_path`` is only
        # used when an attribute is missing - presumably the XRootD case is
        # the first branch; confirm against the storage backends.
        try:
            output_file = os.path.join(
                directory.root_url + directory.base_path, filename)
        except AttributeError:
            output_file = os.path.join(directory.root_path, filename)

        try:
            # Start Sorenson
            job_id = start_encoding(input_file, output_file,
                                    preset_quality, aspect_ratio,
                                    max_height=height, max_width=width)
        except (InvalidResolutionError, TooHighResolutionError) as e:
            # Report the problem through the task state, then abort.
            exception = self._meta_exception_envelope(exc=e)
            self.update_state(state=REVOKED, meta=exception)
            raise Ignore()

        # Set revoke handler, in case of an abrupt execution halt.
        self.set_revoke_handler(partial(stop_encoding, job_id))

        # Create ObjectVersionTags
        ObjectVersionTag.create(obj, 'master', self.obj_id)
        ObjectVersionTag.create(obj, '_sorenson_job_id', job_id)
        ObjectVersionTag.create(obj, 'preset_quality', preset_quality)
        ObjectVersionTag.create(obj, 'media_type', 'video')
        ObjectVersionTag.create(obj, 'context_type', 'subformat')
        # Every key/value of the preset info becomes a tag as well.
        preset_info = get_preset_info(aspect_ratio, preset_quality)
        for key, value in preset_info.items():
            ObjectVersionTag.create(obj, key, value)

        # Information necessary for monitoring
        job_info = dict(
            preset_quality=preset_quality,
            job_id=job_id,
            file_instance=str(file_instance.id),
            uri=output_file,
            version_id=str(obj.version_id),
            key=obj_key,
            tags=obj.get_tags(),
            percentage=0,
        )

    db.session.commit()

    self.update_state(state=STARTED,
                      meta=dict(payload=dict(**job_info),
                                message='Started transcoding.'))

    status = ''
    # Monitor job and report accordingly
    while status != 'Finished':
        # Get job status
        status, percentage = get_encoding_status(job_id)
        if status == 'Error':
            raise RuntimeError('Error transcoding')
        job_info['percentage'] = percentage

        # Update task's state for this preset
        # The format string swaps the arguments, so the message reads
        # "<percentage> <status>".
        self.update_state(state=STARTED,
                          meta=dict(payload=dict(**job_info),
                                    message='{1} {0}'.format(
                                        status, percentage)))

        # The sleep also runs after the poll that reported 'Finished'.
        time.sleep(sleep_time)

    # Set file's location, if job has completed
    self._clean_file_name(output_file)

    with db.session.begin_nested():
        uri = output_file
        # ``read()`` is called without a size, so the transcoded file is
        # hashed from a single read.
        with file_opener_xrootd(uri, 'rb') as transcoded_file:
            digest = hashlib.md5(transcoded_file.read()).hexdigest()
        size = os.path.getsize(replace_xrootd(uri))
        checksum = '{0}:{1}'.format('md5', digest)
        file_instance.set_uri(uri, size, checksum)
        as_object_version(job_info['version_id']).set_file(file_instance)
    db.session.commit()
def test_fileinstance_validation(app, db, dummy_location):
    """Check the URI length limit enforced by ``set_uri``."""
    instance = FileInstance.create()

    # Should not raise
    longest_accepted = 'x' * 255
    instance.set_uri(longest_accepted, 1000, 1000)

    # One character more is rejected.
    too_long = 'x' * 256
    with pytest.raises(ValueError):
        instance.set_uri(too_long, 1000, 1000)
def sips(db, locations, sip_metadata_types):
    """Fixture for the SIP objects sharing multiple files.

    Five SIPs are created; the first three share files as follows:

    SIP-1: File1
    SIP-2: File1, File2
    SIP-3: File2(renamed on SIPFile, but same FileInstance), File3
    SIP-4: File4, File5, File6 (stored under unusual file paths)
    SIP-5: metadata only, no files

    :returns: List with one ``SIPApi`` per SIP, in creation order.
    """

    def make_file(payload):
        # New file instance holding ``payload`` in the default location.
        instance = FileInstance.create()
        instance.set_contents(BytesIO(payload),
                              default_location=locations['default'].uri)
        return instance

    def attach(sip, filepath, instance):
        # Link ``instance`` to ``sip`` under ``filepath``.
        db_.session.add(
            SIPFile(sip_id=sip.id, filepath=filepath, file_id=instance.id))

    sip1 = SIP.create()
    sip1api = SIPApi(sip1)
    sip1api.attach_metadata('marcxml-test', '<p>XML 1</p>')
    sip1api.attach_metadata('json-test', '{"title": "JSON 1"}')
    # Metadata 'txt-test', although attached should not be archived
    # (see conftest configuration)
    sip1api.attach_metadata('txt-test', 'Title: TXT 1')
    file1 = make_file(b('test'))
    attach(sip1, "foobar.txt", file1)

    sip2 = SIP.create()
    sip2api = SIPApi(sip2)
    sip2api.attach_metadata('marcxml-test', '<p>XML 2</p>')
    sip2api.attach_metadata('json-test', '{"title": "JSON 2"}')
    file2 = make_file(b'test-second')
    attach(sip2, "foobar.txt", file1)
    attach(sip2, "foobar2.txt", file2)

    sip3 = SIP.create()
    sip3api = SIPApi(sip3)
    sip3api.attach_metadata('marcxml-test', '<p>XML 3</p>')
    sip3api.attach_metadata('json-test', '{"title": "JSON 3"}')
    file3 = make_file(b'test-third')
    attach(sip3, "foobar2-renamed.txt", file2)
    attach(sip3, "foobar3.txt", file3)

    # A SIP with naughty filenames
    sip4 = SIP.create()
    sip4api = SIPApi(sip4)
    sip4api.attach_metadata('marcxml-test', '<p>XML 4 żółć</p>')
    sip4api.attach_metadata('json-test', '{"title": "JSON 4 żółć"}')
    file4 = make_file('test-fourth żółć'.encode('utf-8'))
    file5 = make_file('test-fifth ąęćźə'.encode('utf-8'))
    file6 = make_file('test-sixth π'.encode('utf-8'))
    attach(sip4, "../../foobar.txt", file4)
    attach(sip4, "http://maliciouswebsite.com/hack.js", file5)
    attach(sip4, "łóżźćąę.dat", file6)

    # A SIP with metadata-only changes
    sip5 = SIP.create()
    sip5api = SIPApi(sip5)
    sip5api.attach_metadata('marcxml-test', '<p>XML 5 Meta Only</p>')

    db_.session.commit()
    return [sip1api, sip2api, sip3api, sip4api, sip5api]
def create_files_and_sip(deposit, dep_pid):
    """Create deposit Bucket, Files and SIPs.

    Steps, in order:

    1. Create a bucket for the deposit and make sure the ``_deposit``,
       ``_buckets`` and ``_files`` keys exist.
    2. When the deposit has exactly one draft with ``prereserve_doi``
       values, make sure the referenced ``recid`` and ``doi`` persistent
       identifiers exist (reserving them when missing).
    3. Register every entry of ``deposit['files']`` as a file instance and
       object version and append its metadata to ``deposit['_files']``.
    4. Create one SIP per entry of ``deposit['sips']``, link it to the
       record's ``recid`` PID (reserving the PID when missing) and attach
       all deposit files to the first SIP.

    :param deposit: Deposit being migrated; it is modified in place and
        committed.
    :param dep_pid: Persistent identifier of the deposit; used in log
        messages and errors.
    :raises DepositMultipleRecids: When the SIPs refer to more than one
        recid.
    :returns: The deposit.
    """
    from invenio_pidstore.errors import PIDDoesNotExistError
    from invenio_pidstore.models import PersistentIdentifier, PIDStatus
    from invenio_sipstore.errors import SIPUserDoesNotExist
    from invenio_sipstore.models import SIP, RecordSIP, SIPFile
    from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_db import db

    buc = Bucket.create()
    recbuc = RecordsBuckets(record_id=deposit.id, bucket_id=buc.id)
    db.session.add(recbuc)
    deposit.setdefault('_deposit', dict())
    deposit.setdefault('_buckets', dict(deposit=str(buc.id)))
    deposit.setdefault('_files', list())
    files = deposit.get('files', [])
    sips = deposit.get('sips', [])

    # Look for prereserved DOI (and recid)
    if 'drafts' in deposit:
        drafts = list(deposit['drafts'].items())
        if len(drafts) != 1:
            # No exception is being handled here, so log with ``error``;
            # ``exception`` would append an empty traceback.
            logger.error('Deposit {dep_pid} has multiple drafts'.format(
                dep_pid=dep_pid))
        if len(drafts) == 1:
            _, draft = drafts[0]
            draft_v = draft['values']
            if 'prereserve_doi' in draft_v:
                pre_recid = str(draft_v['prereserve_doi']['recid'])
                pre_doi = str(draft_v['prereserve_doi']['doi'])

                # If pre-reserve info available, try to reserve 'recid'
                try:
                    PersistentIdentifier.get(pid_type='recid',
                                             pid_value=str(pre_recid))
                except PIDDoesNotExistError:
                    # Reserve recid
                    PersistentIdentifier.create(
                        pid_type='recid',
                        pid_value=str(pre_recid),
                        object_type='rec',
                        status=PIDStatus.RESERVED)

                # If pre-reserve info available, try to reserve 'doi'
                try:
                    PersistentIdentifier.get(pid_type='doi',
                                             pid_value=str(pre_doi))
                except PIDDoesNotExistError:
                    # Reserve DOI
                    PersistentIdentifier.create(
                        pid_type='doi',
                        pid_value=str(pre_doi),
                        object_type='rec',
                        status=PIDStatus.RESERVED)

                if RecordIdentifier.query.get(int(pre_recid)) is None:
                    RecordIdentifier.insert(int(pre_recid))

    # Store the path -> FileInstance mappings for SIPFile creation later
    dep_file_instances = list()

    for file_ in files:
        size = file_['size']
        key = file_['name']
        # Warning: Assumes all checksums are MD5!
        checksum = 'md5:{0}'.format(file_['checksum'])
        fi = FileInstance.create()
        fi.set_uri(file_['path'], size, checksum)
        ov = ObjectVersion.create(buc, key, _file_id=fi.id)
        # File type is the lower-cased extension without the leading dot.
        ext = splitext(ov.key)[1].lower()
        if ext.startswith('.'):
            ext = ext[1:]
        file_meta = dict(
            bucket=str(ov.bucket.id),
            key=ov.key,
            checksum=ov.file.checksum,
            size=ov.file.size,
            version_id=str(ov.version_id),
            type=ext,
        )
        deposit['_files'].append(file_meta)
        dep_file_instances.append((file_['path'], fi))

    # Get a recid from SIP information
    recid = None
    if sips:
        recids = [int(sip['metadata']['recid']) for sip in sips]
        if len(set(recids)) > 1:
            logger.error('Multiple recids ({recids}) found in deposit '
                         '{depid}.'.format(recids=recids,
                                           depid=dep_pid.pid_value))
            raise DepositMultipleRecids(dep_pid.pid_value, list(set(recids)))
        elif recids:  # If only one recid
            recid = recids[0]

    for idx, sip in enumerate(sips):
        agent = None
        user_id = None
        if sip['agents']:
            agent = dict(
                ip_address=empty_str_if_none(sip['agents'][0].get(
                    'ip_address', "")),
                email=empty_str_if_none(sip['agents'][0].get(
                    'email_address', "")),
            )
            user_id = sip['agents'][0]['user_id']
        # A user id of 0 is treated as "no user".
        if user_id == 0:
            user_id = None
        content = sip['package']
        sip_format = 'marcxml'
        try:
            sip = SIP.create(sip_format,
                             content,
                             user_id=user_id,
                             agent=agent)
        except SIPUserDoesNotExist:
            logger.exception('User ID {user_id} referred in deposit {depid} '
                             'does not exists.'.format(
                                 user_id=user_id, depid=dep_pid.pid_value))
            # Fall back to a SIP without a user.
            sip = SIP.create(sip_format,
                             content,
                             agent=agent)

        # Attach recid to SIP
        if recid:
            try:
                pid = PersistentIdentifier.get(pid_type='recid',
                                               pid_value=str(recid))
                record_sip = RecordSIP(sip_id=sip.id, pid_id=pid.id)
                db.session.add(record_sip)
            except PIDDoesNotExistError:
                logger.exception('Record {recid} referred in '
                                 'Deposit {depid} does not exists.'.format(
                                     recid=recid, depid=dep_pid.pid_value))
                if deposit['_p']['submitted'] == True:
                    logger.exception('Pair {recid}/{depid} was submitted,'
                                     ' (should it be unpublished?).'.format(
                                         recid=recid,
                                         depid=dep_pid.pid_value))
                else:
                    logger.exception(
                        'Pair {recid}/{depid} was not submitted.'.format(
                            recid=recid, depid=dep_pid.pid_value))

                # Reserve recid
                PersistentIdentifier.create(pid_type='recid',
                                            pid_value=str(recid),
                                            object_type='rec',
                                            status=PIDStatus.RESERVED)

                if RecordIdentifier.query.get(int(recid)) is None:
                    RecordIdentifier.insert(int(recid))

        # Only the first SIP gets the deposit files attached.
        if idx == 0:
            for fp, fi in dep_file_instances:
                sipf = SIPFile(sip_id=sip.id, filepath=fp, file_id=fi.id)
                db.session.add(sipf)
    deposit.commit()
    return deposit
def create_files_and_sip(deposit, dep_pid):
    """Create deposit Bucket, Files and SIPs.

    A bucket is created for the deposit, every entry of
    ``deposit['files']`` is registered as a file instance and object
    version, and one SIP is created per entry of ``deposit['sips']``. Each
    SIP is linked to the record's ``recid`` PID when one is known; the
    deposit files are attached to the first SIP only.

    :param deposit: Deposit being migrated; it is modified in place and
        committed.
    :param dep_pid: Persistent identifier of the deposit; used in log
        messages and errors.
    :raises DepositMultipleRecids: When the SIPs refer to more than one
        recid.
    :raises DepositSIPUserDoesNotExist: When the user referenced by a SIP
        does not exist.
    :raises DepositRecidDoesNotExist: When the recid referenced by the
        SIPs has no persistent identifier.
    :returns: The deposit.
    """
    from invenio_pidstore.errors import PIDDoesNotExistError
    from invenio_pidstore.models import PersistentIdentifier
    from invenio_sipstore.errors import SIPUserDoesNotExist
    from invenio_sipstore.models import SIP, RecordSIP, SIPFile
    from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_db import db

    buc = Bucket.create()
    recbuc = RecordsBuckets(record_id=deposit.id, bucket_id=buc.id)
    db.session.add(recbuc)
    deposit.setdefault('_deposit', dict())
    deposit.setdefault('_files', list())
    files = deposit.get('files', [])
    sips = deposit.get('sips', [])

    recid = None
    if sips:
        recids = [int(sip['metadata']['recid']) for sip in sips]
        if len(set(recids)) > 1:
            logger.error('Multiple recids ({recids}) found in deposit '
                         '{depid}.'.format(recids=recids,
                                           depid=dep_pid.pid_value))
            raise DepositMultipleRecids(dep_pid.pid_value, list(set(recids)))
        elif recids:  # If only one recid
            recid = recids[0]

    # Store the path -> FileInstance mappings for SIPFile creation later
    dep_file_instances = list()

    for file_ in files:
        fi = FileInstance.create()
        fi.set_uri(file_['path'], file_['size'], file_['checksum'])
        ov = ObjectVersion.create(buc, file_['name'], _file_id=fi.id)
        file_meta = dict(
            bucket=str(buc.id),
            key=file_['name'],
            checksum=file_['checksum'],
            size=file_['size'],
            version_id=str(ov.version_id),
        )
        deposit['_files'].append(file_meta)
        dep_file_instances.append((file_['path'], fi))

    for idx, sip in enumerate(sips):
        agent = None
        user_id = None
        if sip['agents']:
            agent = dict(
                ip_address=sip['agents'][0].get('ip_address', ""),
                email=sip['agents'][0].get('email_address', ""),
            )
            user_id = sip['agents'][0]['user_id']
        content = sip['package']
        sip_format = 'marcxml'
        try:
            sip = SIP.create(sip_format,
                             content,
                             user_id=user_id,
                             agent=agent)
        except SIPUserDoesNotExist:
            logger.exception('User ID {user_id} referred in deposit {depid} '
                             'does not exists.'.format(
                                 user_id=user_id, depid=dep_pid.pid_value))
            raise DepositSIPUserDoesNotExist(dep_pid.pid_value, user_id)

        # If recid was found, attach it to SIP
        # TODO: This is always uses the first recid, as we quit if multiple
        # recids are found in the sips information
        if recid:
            try:
                # ``recid`` is an int here; the PID value is looked up as a
                # string, matching how recids are passed to the PID store
                # elsewhere.
                pid = PersistentIdentifier.get(pid_type='recid',
                                               pid_value=str(recid))
                record_sip = RecordSIP(sip_id=sip.id, pid_id=pid.id)
                db.session.add(record_sip)
            except PIDDoesNotExistError:
                logger.exception('Record {recid} referred in '
                                 'Deposit {depid} does not exists.'.format(
                                     recid=recid, depid=dep_pid.pid_value))
                raise DepositRecidDoesNotExist(dep_pid.pid_value, recid)

        # Only the first SIP gets the deposit files attached.
        if idx == 0:
            for fp, fi in dep_file_instances:
                sipf = SIPFile(sip_id=sip.id, filepath=fp, file_id=fi.id)
                db.session.add(sipf)
    deposit.commit()
    db.session.commit()
    return deposit
def datasets(skip_files):
    """Load demo datasets records.

    Every ``*.json`` fixture found in the package's
    ``modules/fixtures/data/datasets`` directory is read; each entry
    becomes a record with its own bucket, is committed and then indexed.

    :param skip_files: When truthy, the ``files`` entries of a record are
        not attached to its bucket.
    """
    from invenio_db import db
    from invenio_records_files.api import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter
    from cernopendata.modules.records.minters.datasetid import \
        cernopendata_datasetid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/datasets-v1.0.0.json'
    )
    fixtures_dir = pkg_resources.resource_filename(
        'cernopendata', 'modules/fixtures/data/datasets')
    fixture_paths = glob.glob(os.path.join(fixtures_dir, '*.json'))

    # FIXME: change the treatment of `files` according to `records` fixtures.
    for fixture_path in fixture_paths:
        click.echo('Loading datasets from {0} ...'.format(fixture_path))

        with open(fixture_path, 'rb') as source:
            for metadata in json.load(source):
                attachments = metadata.pop('files', [])
                record_uuid = uuid.uuid4()

                # (TOFIX) Remove if statement in production
                # as every dataset record should have a doi
                if metadata.get('doi', None):
                    minter = cernopendata_datasetid_minter
                else:
                    minter = cernopendata_recid_minter
                minter(record_uuid, metadata)

                metadata['$schema'] = schema
                record = Record.create(metadata, id_=record_uuid)

                bucket = Bucket.create()
                RecordsBuckets.create(
                    record=record.model, bucket=bucket)

                if not skip_files:
                    for attachment in attachments:
                        assert 'uri' in attachment
                        assert 'size' in attachment
                        assert 'checksum' in attachment

                        file_instance = FileInstance.create()
                        uri = attachment.get("uri")
                        # Object key is the last path segment of the URI.
                        key = uri.split('/')[-1]
                        file_instance.set_uri(
                            uri,
                            attachment.get("size"),
                            attachment.get("checksum"),
                        )
                        ObjectVersion.create(
                            bucket, key, _file_id=file_instance.id)

                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def test_fileinstance_validation(app, db, dummy_location):
    """Check the URI length limit enforced by ``set_uri``."""
    instance = FileInstance.create()

    # Should not raise
    longest_accepted = 'x' * 255
    instance.set_uri(longest_accepted, 1000, 1000)

    # One character more is rejected.
    too_long = 'x' * 256
    with pytest.raises(ValueError):
        instance.set_uri(too_long, 1000, 1000)
def sips(db, locations, sip_metadata_types):
    """Fixture for the SIP objects sharing multiple files.

    Five SIPs are created; the first three share files as follows:

    SIP-1: File1 (this SIP also carries agent information)
    SIP-2: File1, File2
    SIP-3: File2(renamed on SIPFile, but same FileInstance), File3
    SIP-4: File4, File5, File6 (stored under unusual file paths)
    SIP-5: metadata only, no files

    :returns: List with one ``SIPApi`` per SIP, in creation order.
    """

    def make_file(payload):
        # New file instance holding ``payload`` in the default location.
        instance = FileInstance.create()
        instance.set_contents(BytesIO(payload),
                              default_location=locations['default'].uri)
        return instance

    def attach(sip, filepath, instance):
        # Link ``instance`` to ``sip`` under ``filepath``.
        db_.session.add(
            SIPFile(sip_id=sip.id, filepath=filepath, file_id=instance.id))

    # A SIP with agent info
    agent_info = {
        'email': '*****@*****.**',
        'orcid': '1111-1111-1111-1111',
        'ip_address': '1.1.1.1'
    }
    sip1 = SIP.create(agent=agent_info)
    sip1api = SIPApi(sip1)
    sip1api.attach_metadata('marcxml-test', '<p>XML 1</p>')
    sip1api.attach_metadata('json-test', '{"title": "JSON 1"}')
    # Metadata 'txt-test', although attached should not be archived
    # (see conftest configuration)
    sip1api.attach_metadata('txt-test', 'Title: TXT 1')
    file1 = make_file(b('test'))
    attach(sip1, "foobar.txt", file1)

    sip2 = SIP.create()
    sip2api = SIPApi(sip2)
    sip2api.attach_metadata('marcxml-test', '<p>XML 2</p>')
    sip2api.attach_metadata('json-test', '{"title": "JSON 2"}')
    file2 = make_file(b'test-second')
    attach(sip2, "foobar.txt", file1)
    attach(sip2, "foobar2.txt", file2)

    sip3 = SIP.create()
    sip3api = SIPApi(sip3)
    sip3api.attach_metadata('marcxml-test', '<p>XML 3</p>')
    sip3api.attach_metadata('json-test', '{"title": "JSON 3"}')
    file3 = make_file(b'test-third')
    attach(sip3, "foobar2-renamed.txt", file2)
    attach(sip3, "foobar3.txt", file3)

    # A SIP with naughty filenames
    sip4 = SIP.create()
    sip4api = SIPApi(sip4)
    sip4api.attach_metadata('marcxml-test', '<p>XML 4 żółć</p>')
    sip4api.attach_metadata('json-test', '{"title": "JSON 4 żółć"}')
    file4 = make_file('test-fourth żółć'.encode('utf-8'))
    file5 = make_file('test-fifth ąęćźə'.encode('utf-8'))
    file6 = make_file('test-sixth π'.encode('utf-8'))
    attach(sip4, "../../foobar.txt", file4)
    attach(sip4, "http://maliciouswebsite.com/hack.js", file5)
    attach(sip4, "łóżźćąę.dat", file6)

    # A SIP with metadata-only changes
    sip5 = SIP.create()
    sip5api = SIPApi(sip5)
    sip5api.attach_metadata('marcxml-test', '<p>XML 5 Meta Only</p>')

    db_.session.commit()
    return [sip1api, sip2api, sip3api, sip4api, sip5api]