Пример #1
0
def test_fileinstance_copy_contents(app, db, dummy_location):
    """Test copy contents."""
    counter = dict(called=False)

    def callback(total, size):
        counter['called'] = True

    # Create source and set data.
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Create destination - and use it to copy_contents from another object.
    dst = FileInstance.create()
    assert dst.size == 0
    assert dst.uri is None
    db.session.commit()

    # Copy contents
    dst.copy_contents(src, progress_callback=callback, location=dummy_location)
    db.session.commit()
    assert dst.size == src.size
    assert dst.checksum == src.checksum
    assert dst.uri != src.uri
    assert counter['called']

    # Read data
    fp = dst.storage().open()
    assert data == fp.read()
    fp.close()
Пример #2
0
def test_b2share_storage_with_pid(base_app, app, tmp_location, login_user, test_users):
    """Check that the storage class will redirect pid files."""
    pid = 'http://hdl.handle.net/11304/74c66f0b-f814-4202-9dcb-4889ba9b1047'
    with app.app_context():
        # Disable access control for this test
        tmp_location = Location.query.first()
        with db.session.begin_nested():
            bucket = Bucket.create(tmp_location, storage_class='B')
            pid_file = FileInstance.create()
            pid_file.set_uri(pid, 1, 0, storage_class='B')
            ObjectVersion.create(bucket, 'test.txt', pid_file.id)

        db.session.commit()
        url = url_for('invenio_files_rest.object_api',
                        bucket_id=bucket.id,
                        key='test.txt')
    try:
        with app.app_context():
            permission = current_files_rest.permission_factory
            current_files_rest.permission_factory = allow_all
        # Check that accessing the file redirects to the PID
        with app.test_client() as client:
            resp = client.get(url)
            assert resp.headers['Location'] == pid
            assert resp.status_code == 302
    finally:
        with app.app_context():
            current_files_rest.permission_factory = permission
Пример #3
0
def handle_record_files(data, bucket, files, skip_files):
    """Handles record files."""
    for file in files:
        if skip_files:
            break
        assert 'uri' in file
        assert 'size' in file
        assert 'checksum' in file

        try:
            f = FileInstance.create()
            filename = file.get("uri").split('/')[-1:][0]
            f.set_uri(file.get("uri"), file.get(
                "size"), file.get("checksum"))
            obj = ObjectVersion.create(
                bucket,
                filename,
                _file_id=f.id
            )

            file.update({
                'bucket': str(obj.bucket_id),
                'checksum': obj.file.checksum,
                'key': obj.key,
                'version_id': str(obj.version_id),
            })

        except Exception as e:
            click.echo(
                'Recid {0} file {1} could not be loaded due '
                'to {2}.'.format(data.get('recid'), filename,
                                 str(e)))
            continue
Пример #4
0
def test_object_relink_all(app, db, dummy_location):
    """Test relinking files."""
    b1 = Bucket.create()
    obj1 = ObjectVersion.create(
        b1, "relink-test", stream=BytesIO(b('relinkthis')))
    ObjectVersion.create(b1, "do-not-touch", stream=BytesIO(b('na')))
    b1.snapshot()
    db.session.commit()

    assert ObjectVersion.query.count() == 4
    assert FileInstance.query.count() == 2

    fnew = FileInstance.create()
    fnew.copy_contents(obj1.file, location=b1.location)
    db.session.commit()

    fold = obj1.file

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 2
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 0

    ObjectVersion.relink_all(obj1.file, fnew)
    db.session.commit()

    assert ObjectVersion.query.filter_by(file_id=fold.id).count() == 0
    assert ObjectVersion.query.filter_by(file_id=fnew.id).count() == 2
Пример #5
0
def test_pyfilesystemstorage(app, db, dummy_location):
    """Test pyfs storage."""
    # Create bucket and object
    with db.session.begin_nested():
        b1 = Bucket.create()
        obj = ObjectVersion.create(b1, "LICENSE")
        obj.file = FileInstance.create()

    storage = PyFilesystemStorage(obj.file, base_uri=obj.bucket.location.uri)
    counter = dict(size=0)

    def callback(total, size):
        counter['size'] = size

    data = b("this is some content")
    stream = BytesIO(data)
    loc, size, checksum = storage.save(stream, progress_callback=callback)

    # Verify checksum, size and location.
    m = hashlib.md5()
    m.update(data)
    assert "md5:{0}".format(m.hexdigest()) == checksum

    assert size == len(data)
    assert loc == join(
        dummy_location.uri,
        str(obj.file.id),
        "data")
Пример #6
0
def test_fileinstance_set_contents(app, db, dummy_location):
    """Test file instance create."""
    counter = dict(called=False)

    def callback(total, size):
        counter['called'] = True

    f = FileInstance.create()
    db.session.commit()
    assert f.readable is False
    assert f.writable is True
    data = BytesIO(b("test file instance set contents"))
    f.set_contents(
        data, default_location=dummy_location.uri, progress_callback=callback)
    db.session.commit()
    assert f.readable is True
    assert f.writable is False
    assert counter['called']

    pytest.raises(
        ValueError,
        f.set_contents,
        BytesIO(b("different content")),
        location=dummy_location,
    )
Пример #7
0
def test_pyfilesystemstorage_make_path():
    """Test path for files."""
    fi = FileInstance.create()
    fi.id = uuid.uuid5(uuid.NAMESPACE_DNS, 'Testing-')
    fs = PyFilesystemStorage(fi, base_uri='Base')
    assert 'Base/45/629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path()
    assert 'Base/4/5629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(1, 1)
    assert 'Base/4/5/6/29316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(3, 1)
    assert 'Base/456/29316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(1, 3)

    # If length 0, it should take the default value.
    assert 'Base/45/629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(1, 0)

    # If dimensions are 0, it should take the default value.
    assert 'Base/4/5629316-6e69-5006-82ba-1ee2f18df5b2' == fs.make_path(0, 1)

    # Length of each partition is too long.
    with pytest.raises(AssertionError):
        fs.make_path(1, 50)

    # Number of partitions is too high.
    with pytest.raises(AssertionError):
        fs.make_path(50, 1)

    # Both values produce the exception.
    with pytest.raises(AssertionError):
        fs.make_path(50, 50)
Пример #8
0
def test_fileinstance_get(app, db, dummy_location):
    """Test fileinstance get."""
    f = FileInstance.create()
    db.session.commit()
    # Get existing file.
    assert FileInstance.get(f.id) is not None
    # Non-existing files returns none
    assert FileInstance.get(uuid.uuid4()) is None
Пример #9
0
def test_fileinstance_create(app, db, dummy_location):
    """Test file instance create."""
    f = FileInstance.create()
    assert f.id
    assert f.readable is False
    assert f.writable is True
    assert f.uri is None
    assert f.size == 0
    assert f.checksum is None
    assert f.last_check_at is None
    assert f.last_check is None
    db.session.commit()

    # Check unique constraint on URI with none values.
    f = FileInstance.create()
    f = FileInstance.create()
    db.session.commit()
Пример #10
0
def data_policies(skip_files):
    """Load demo Data Policy records."""
    from invenio_db import db
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_records_files.api import Record

    from invenio_records.models import RecordMetadata

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/data-policies-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data')
    data_policies_json = glob.glob(os.path.join(data, '*.json'))

    for filename in data_policies_json:

        click.echo('Loading data-policies from {0} ...'.format(filename))

        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                cernopendata_recid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)

                bucket = Bucket.create()
                RecordsBuckets.create(
                    record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get(
                        "size"), file.get("checksum"))
                    ObjectVersion.create(
                        bucket,
                        filename,
                        _file_id=f.id
                    )
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
def test_storage_interface():
    """Test storage interface."""
    f = FileInstance.create()
    s = Storage(f)

    pytest.raises(NotImplementedError, s.open)
    pytest.raises(NotImplementedError, s.send_file)
    pytest.raises(NotImplementedError, s.save, None)
    pytest.raises(NotImplementedError, s.compute_checksum, None)
Пример #12
0
def test_fileinstance_get_by_uri(app, db, dummy_location):
    """Test file get by uri."""
    f = FileInstance.create()
    f.uri = "LICENSE"
    db.session.commit()

    assert FileInstance.get_by_uri("LICENSE") is not None
    FileInstance.get_by_uri("NOTVALID") is None
    pytest.raises(AssertionError, FileInstance.get_by_uri, None)
def test_pyfs_send_file_fail(app, db, dummy_location):
    """Test send file."""
    f = FileInstance.create()
    f.set_contents(BytesIO(b("test")), location=dummy_location)

    with patch('invenio_files_rest.storage.send_stream') as send_stream:
        send_stream.side_effect = OSError(errno.EPERM, "Permission problem")
        with app.test_request_context():
            pytest.raises(StorageError, f.send_file)
Пример #14
0
def test_fileinstance_copy_contents_invalid(app, db, dummy_location):
    """Test invalid copy contents."""
    # Source not readable
    src = FileInstance.create()
    dst = FileInstance.create()
    pytest.raises(ValueError, dst.copy_contents, src)

    # Create valid source
    data = b('this is some data')
    src = FileInstance.create()
    src.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Destination not writable
    dst.writable = False
    pytest.raises(ValueError, dst.copy_contents, src)
    # Size is not 0
    dst.writable = True
    dst.size = 1
    pytest.raises(ValueError, dst.copy_contents, src)
def test_sip_file_model(db):
    """Test the SIPFile model."""
    sip1 = SIP.create('json', '{}')
    file1 = FileInstance.create()
    sipfile1 = SIPFile(sip_id=sip1.id, filepath="foobar.zip",
                       file_id=file1.id)

    db.session.add(sipfile1)
    db.session.commit()
    assert SIP.query.count() == 1
    assert SIPFile.query.count() == 1
def test_pyfilesystemstorage_checksum_fail(app, db, dummy_location):
    """Test fixity problems."""
    # Raise an error during checksum calculation
    def callback(total, size):
        raise OSError(errno.EPERM, "Permission")

    f = FileInstance.create()
    f.set_contents(BytesIO(b("test")), location=dummy_location)

    pytest.raises(
        StorageError, PyFilesystemStorage(f).compute_checksum,
        progress_callback=callback)
Пример #17
0
def test_fileinstance_send_file(app, db, dummy_location):
    """Test file instance send file."""
    f = FileInstance.create()
    # File not readable
    pytest.raises(ValueError, f.send_file)

    # Write data
    data = b("test file instance set contents")
    f.set_contents(BytesIO(data), location=dummy_location)
    db.session.commit()

    # Send data
    with app.test_request_context():
        res = f.send_file()
        assert int(res.headers['Content-Length']) == len(data)
Пример #18
0
    def create_file(self, bucket, key, file_versions):
        """Create a single file with all versions."""
        objs = []
        for file_ver in file_versions:
            f = FileInstance.create().set_uri(
                file_ver['full_path'],
                file_ver['size'],
                'md5:{0}'.format(file_ver['checksum']),
            )
            obj = ObjectVersion.create(bucket, key).set_file(f)
            obj.created = arrow.get(
                file_ver['creation_date']).datetime.replace(tzinfo=None)
            objs.append(obj)

        # Set head version
        db.session.commit()
        return objs[-1]
Пример #19
0
def loaddemofiles(source, force=False):
    """Load demo files."""
    s = stat(source)

    with open(source, 'rb') as fp:
        m = hashlib.md5()
        m.update(fp.read())
        checksum = "md5:{0}".format(m.hexdigest())

    # Create a file instance
    with db.session.begin_nested():
        f = FileInstance.create()
        f.set_uri(source, s.st_size, checksum)

    # Replace all objects associated files.
    ObjectVersion.query.update({ObjectVersion.file_id: str(f.id)})
    db.session.commit()
Пример #20
0
def create_b2safe_file(external_pids, bucket):
    """Create a FileInstance which contains a PID in its uri."""
    validate_schema(external_pids, {
        'type': 'array',
        'items': {
            'type': 'object',
            'properties': {
                'ePIC_PID': {'type': 'string'},
                'key': {'type': 'string'}
            },
            'additionalProperties': False,
            'required': ['ePIC_PID', 'key']
        }
    })

    keys_list = [e['key'] for e in external_pids]
    keys_set = set(keys_list)
    if len(keys_list) != len(keys_set):
        raise InvalidDepositError([FieldError('external_pids',
            'Field external_pids contains duplicate keys.')])
    for external_pid in external_pids:
        if not external_pid['ePIC_PID'].startswith('http://hdl.handle.net/'):
            external_pid['ePIC_PID'] = 'http://hdl.handle.net/' + \
                external_pid['ePIC_PID']
        if external_pid['key'].startswith('/'):
            raise InvalidDepositError(
                [FieldError('external_pids',
                            'File key cannot start with a "/".')])
        try:
            # Create the file instance if it does not already exist
            file_instance = FileInstance.get_by_uri(external_pid['ePIC_PID'])
            if file_instance is None:
                file_instance = FileInstance.create()
                file_instance.set_uri(
                    external_pid['ePIC_PID'], 1, 0, storage_class='B')
            assert file_instance.storage_class == 'B'
            # Add the file to the bucket if it is not already in it
            current_version = ObjectVersion.get(bucket, external_pid['key'])
            if not current_version or \
                    current_version.file_id != file_instance.id:
                ObjectVersion.create(bucket, external_pid['key'],
                                     file_instance.id)
        except IntegrityError as e:
            raise InvalidDepositError(
                [FieldError('external_pids', 'File URI already exists.')])
Пример #21
0
def test_pyfilesystemstorage(app, db, dummy_location):
    """Test pyfs storage."""
    # Create bucket and object
    with db.session.begin_nested():
        b1 = Bucket.create()
        obj = ObjectVersion.create(b1, "LICENSE")
        obj.file = FileInstance.create()

    storage = PyFilesystemStorage(obj.file, base_uri=obj.bucket.location.uri)
    counter = dict(size=0)

    def callback(total, size):
        counter['size'] = size

    def test_file_save(data, **kwargs):
        stream = BytesIO(data)
        loc, size, checksum = storage.save(stream, progress_callback=callback,
                                           **kwargs)

        # Verify checksum, size and location.
        m = hashlib.md5()
        m.update(data)
        assert "md5:{0}".format(m.hexdigest()) == checksum

        assert size == len(data)
        assert loc == join(dummy_location.uri, str(obj.file.id)[0:2],
                           str(obj.file.id)[2:], 'data')

    data = b("this is some content")
    # test without size
    test_file_save(data)
    # test with correct size
    test_file_save(data, size=len(data))
    # test with wrong sizes
    with pytest.raises(UnexpectedFileSizeError):
        test_file_save(data, size=len(data) - 1)
    with pytest.raises(UnexpectedFileSizeError):
        test_file_save(data, size=len(data) + 1)
    def upload(self, pid=None, *args, **kwargs):
        """Upload action for file/repository."""
        with UpdateDepositPermission(self).require(403):
            data = request.get_json()

            fileinfo = self._construct_fileinfo(data['url'],
                                                data['type'])
            if request:
                _, record = request.view_args.get('pid_value').data
                record_id = str(record.id)
                filename = fileinfo['filename']
                obj = ObjectVersion.create(
                    bucket=record.files.bucket, key=filename
                )
                obj.file = FileInstance.create()
                record.files.flush()
                record.files[filename]['source_url'] = data['url']

                if data['type'] == 'url':
                    if data['url'].startswith(
                            ('https://github',
                             'https://gitlab.cern.ch',
                             'root://')):
                        download_url.delay(record_id, data['url'], fileinfo)
                    else:
                        raise FileUploadError(
                            'Please provide a valid file url.')
                else:
                    if data['url'].startswith(
                            ('https://github', 'https://gitlab.cern.ch')):
                        download_repo.delay(record_id, data['url'], filename)
                    else:
                        raise FileUploadError(
                            'Please provide a valid repository url.')

            return self
Пример #23
0
    def run(self, preset_quality, sleep_time=5, *args, **kwargs):
        """Launch video transcoding.

        For each of the presets generate a new ``ObjectVersion`` tagged as
        slave with the preset name as key and a link to the master version.

        :param self: reference to instance of task base class
        :param preset_quality: preset quality to use for transcoding.
        :param sleep_time: time interval between requests for the Sorenson
            status.
        """
        self._base_payload.update(preset_quality=preset_quality)

        # Get master file's bucket_id
        bucket_id = self.object.bucket_id
        bucket_location = self.object.bucket.location.uri
        # Get master file's key
        master_key = self.object.key

        tags = self.object.get_tags()
        # Get master file's aspect ratio
        aspect_ratio = tags['display_aspect_ratio']
        # Get master file's width x height
        width = int(tags['width']) if 'width' in tags else None
        height = int(tags['height']) if 'height' in tags else None

        with db.session.begin_nested():
            # Create FileInstance
            file_instance = FileInstance.create()

            # Create ObjectVersion
            obj_key = self._build_slave_key(preset_quality=preset_quality,
                                            master_key=master_key)
            obj = ObjectVersion.create(bucket=bucket_id, key=obj_key)

            # Extract new location
            storage = file_instance.storage(default_location=bucket_location)
            directory, filename = storage._get_fs()

            input_file = self.object.file.uri
            # XRootDPyFS doesn't implement root_path
            try:
                output_file = os.path.join(
                    directory.root_url + directory.base_path, filename)
            except AttributeError:
                output_file = os.path.join(directory.root_path, filename)

            try:
                # Start Sorenson
                job_id = start_encoding(input_file,
                                        output_file,
                                        preset_quality,
                                        aspect_ratio,
                                        max_height=height,
                                        max_width=width)
            except (InvalidResolutionError, TooHighResolutionError) as e:
                exception = self._meta_exception_envelope(exc=e)
                self.update_state(state=REVOKED, meta=exception)
                raise Ignore()

            # Set revoke handler, in case of an abrupt execution halt.
            self.set_revoke_handler(partial(stop_encoding, job_id))

            # Create ObjectVersionTags
            ObjectVersionTag.create(obj, 'master', self.obj_id)
            ObjectVersionTag.create(obj, '_sorenson_job_id', job_id)
            ObjectVersionTag.create(obj, 'preset_quality', preset_quality)
            ObjectVersionTag.create(obj, 'media_type', 'video')
            ObjectVersionTag.create(obj, 'context_type', 'subformat')
            preset_info = get_preset_info(aspect_ratio, preset_quality)
            for key, value in preset_info.items():
                ObjectVersionTag.create(obj, key, value)

            # Information necessary for monitoring
            job_info = dict(
                preset_quality=preset_quality,
                job_id=job_id,
                file_instance=str(file_instance.id),
                uri=output_file,
                version_id=str(obj.version_id),
                key=obj_key,
                tags=obj.get_tags(),
                percentage=0,
            )

        db.session.commit()

        self.update_state(state=STARTED,
                          meta=dict(payload=dict(**job_info),
                                    message='Started transcoding.'))

        status = ''
        # Monitor job and report accordingly
        while status != 'Finished':
            # Get job status
            status, percentage = get_encoding_status(job_id)
            if status == 'Error':
                raise RuntimeError('Error transcoding')
            job_info['percentage'] = percentage

            # Update task's state for this preset
            self.update_state(state=STARTED,
                              meta=dict(payload=dict(**job_info),
                                        message='{1} {0}'.format(
                                            status, percentage)))

            time.sleep(sleep_time)

        # Set file's location, if job has completed
        self._clean_file_name(output_file)
        with db.session.begin_nested():
            uri = output_file
            with file_opener_xrootd(uri, 'rb') as transcoded_file:
                digest = hashlib.md5(transcoded_file.read()).hexdigest()
            size = os.path.getsize(replace_xrootd(uri))
            checksum = '{0}:{1}'.format('md5', digest)
            file_instance.set_uri(uri, size, checksum)
            as_object_version(job_info['version_id']).set_file(file_instance)
        db.session.commit()
Пример #24
0
def test_fileinstance_validation(app, db, dummy_location):
    """Test validating the FileInstance."""
    f = FileInstance.create()
    f.set_uri('x' * 255, 1000, 1000)  # Should not raise
    pytest.raises(ValueError, f.set_uri, 'x' * 256, 1000, 1000)
Пример #25
0
def sips(db, locations, sip_metadata_types):
    """Fixture for the SIP objects sharing multiple files.

    Four SIPs are sharing three files in the following way:
    SIP-1: File1
    SIP-2: File1, File2
    SIP-3: File2(renamed on SIPFile, but same FileInstance), File3
    SIP-4: File4, File5, File6
    """
    sip1 = SIP.create()
    sip1api = SIPApi(sip1)
    sip1api.attach_metadata('marcxml-test', '<p>XML 1</p>')
    sip1api.attach_metadata('json-test', '{"title": "JSON 1"}')
    # Metadata 'txt-test', although attached should not be archived
    # (see conftest configuration)
    sip1api.attach_metadata('txt-test', 'Title: TXT 1')
    file1 = FileInstance.create()
    file1.set_contents(BytesIO(b('test')),
                       default_location=locations['default'].uri)
    sip1file1 = SIPFile(sip_id=sip1.id,
                        filepath="foobar.txt",
                        file_id=file1.id)

    db_.session.add(sip1file1)

    sip2 = SIP.create()
    sip2api = SIPApi(sip2)
    sip2api.attach_metadata('marcxml-test', '<p>XML 2</p>')
    sip2api.attach_metadata('json-test', '{"title": "JSON 2"}')
    file2 = FileInstance.create()
    file2.set_contents(BytesIO(b'test-second'),
                       default_location=locations['default'].uri)
    sip2file1 = SIPFile(sip_id=sip2.id,
                        filepath="foobar.txt",
                        file_id=file1.id)
    sip2file2 = SIPFile(sip_id=sip2.id,
                        filepath="foobar2.txt",
                        file_id=file2.id)

    db_.session.add(sip2file1)
    db_.session.add(sip2file2)

    sip3 = SIP.create()
    sip3api = SIPApi(sip3)
    sip3api.attach_metadata('marcxml-test', '<p>XML 3</p>')
    sip3api.attach_metadata('json-test', '{"title": "JSON 3"}')
    file3 = FileInstance.create()
    file3.set_contents(BytesIO(b'test-third'),
                       default_location=locations['default'].uri)
    sip3file2 = SIPFile(sip_id=sip3.id,
                        filepath="foobar2-renamed.txt",
                        file_id=file2.id)
    sip3file3 = SIPFile(sip_id=sip3.id,
                        filepath="foobar3.txt",
                        file_id=file3.id)

    db_.session.add(sip3file2)
    db_.session.add(sip3file3)

    # A SIP with naughty filenames
    sip4 = SIP.create()
    sip4api = SIPApi(sip4)
    sip4api.attach_metadata('marcxml-test', '<p>XML 4 żółć</p>')
    sip4api.attach_metadata('json-test', '{"title": "JSON 4 żółć"}')
    file4 = FileInstance.create()
    file4.set_contents(BytesIO('test-fourth żółć'.encode('utf-8')),
                       default_location=locations['default'].uri)
    file5 = FileInstance.create()
    file5.set_contents(BytesIO('test-fifth ąęćźə'.encode('utf-8')),
                       default_location=locations['default'].uri)

    file6 = FileInstance.create()
    file6.set_contents(BytesIO('test-sixth π'.encode('utf-8')),
                       default_location=locations['default'].uri)
    sip5file4 = SIPFile(sip_id=sip4.id,
                        filepath="../../foobar.txt",
                        file_id=file4.id)

    sip5file5 = SIPFile(sip_id=sip4.id,
                        filepath="http://maliciouswebsite.com/hack.js",
                        file_id=file5.id)

    sip5file6 = SIPFile(sip_id=sip4.id,
                        filepath="łóżźćąę.dat",
                        file_id=file6.id)

    db_.session.add(sip5file4)
    db_.session.add(sip5file5)
    db_.session.add(sip5file6)

    # A SIP with metadata-only changes
    sip5 = SIP.create()
    sip5api = SIPApi(sip5)
    sip5api.attach_metadata('marcxml-test', '<p>XML 5 Meta Only</p>')

    db_.session.commit()
    return [sip1api, sip2api, sip3api, sip4api, sip5api]
Пример #26
0
def create_files_and_sip(deposit, dep_pid):
    """Create deposit Bucket, Files and SIPs."""
    from invenio_pidstore.errors import PIDDoesNotExistError
    from invenio_pidstore.models import PersistentIdentifier, PIDStatus
    from invenio_sipstore.errors import SIPUserDoesNotExist
    from invenio_sipstore.models import SIP, RecordSIP, SIPFile
    from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_db import db
    buc = Bucket.create()
    recbuc = RecordsBuckets(record_id=deposit.id, bucket_id=buc.id)
    db.session.add(recbuc)
    deposit.setdefault('_deposit', dict())
    deposit.setdefault('_buckets', dict(deposit=str(buc.id)))
    deposit.setdefault('_files', list())
    files = deposit.get('files', [])
    sips = deposit.get('sips', [])

    # Look for prereserved DOI (and recid)
    if 'drafts' in deposit:
        drafts = list(deposit['drafts'].items())
        if len(drafts) != 1:
            logger.exception('Deposit {dep_pid} has multiple drafts'.format(
                dep_pid=dep_pid))
        if len(drafts) == 1:
            draft_type, draft = drafts[0]
            draft_v = draft['values']
            if 'prereserve_doi' in draft_v:
                pre_recid = str(draft_v['prereserve_doi']['recid'])
                pre_doi = str(draft_v['prereserve_doi']['doi'])

                # If pre-reserve info available, try to reserve 'recid'
                try:
                    pid = PersistentIdentifier.get(pid_type='recid',
                                                   pid_value=str(pre_recid))
                except PIDDoesNotExistError:
                    # Reserve recid
                    pid = PersistentIdentifier.create(
                        pid_type='recid',
                        pid_value=str(pre_recid),
                        object_type='rec',
                        status=PIDStatus.RESERVED)

                # If pre-reserve info available, try to reserve 'doi'
                try:
                    pid = PersistentIdentifier.get(pid_type='doi',
                                                   pid_value=str(pre_doi))
                except PIDDoesNotExistError:
                    # Reserve DOI
                    pid = PersistentIdentifier.create(
                        pid_type='doi',
                        pid_value=str(pre_doi),
                        object_type='rec',
                        status=PIDStatus.RESERVED)

                if RecordIdentifier.query.get(int(pre_recid)) is None:
                    RecordIdentifier.insert(int(pre_recid))

    # Store the path -> FileInstance mappings for SIPFile creation later
    dep_file_instances = list()

    for file_ in files:
        size = file_['size']
        key = file_['name']
        # Warning: Assumes all checksums are MD5!
        checksum = 'md5:{0}'.format(file_['checksum'])
        fi = FileInstance.create()
        fi.set_uri(file_['path'], size, checksum)
        ov = ObjectVersion.create(buc, key, _file_id=fi.id)
        ext = splitext(ov.key)[1].lower()
        if ext.startswith('.'):
            ext = ext[1:]
        file_meta = dict(
            bucket=str(ov.bucket.id),
            key=ov.key,
            checksum=ov.file.checksum,
            size=ov.file.size,
            version_id=str(ov.version_id),
            type=ext,
        )
        deposit['_files'].append(file_meta)
        dep_file_instances.append((file_['path'], fi))

    # Get a recid from SIP information
    recid = None
    if sips:
        recids = [int(sip['metadata']['recid']) for sip in sips]
        if len(set(recids)) > 1:
            logger.error('Multiple recids ({recids}) found in deposit {depid}'
                         ' does not exists.'.format(recids=recids,
                                                    depid=dep_pid.pid_value))
            raise DepositMultipleRecids(dep_pid.pid_value, list(set(recids)))
        elif recids:  # If only one recid
            recid = recids[0]

    for idx, sip in enumerate(sips):
        agent = None
        user_id = None
        if sip['agents']:
            agent = dict(
                ip_address=empty_str_if_none(sip['agents'][0].get(
                    'ip_address', "")),
                email=empty_str_if_none(sip['agents'][0].get(
                    'email_address', "")),
            )
            user_id = sip['agents'][0]['user_id']
        if user_id == 0:
            user_id = None
        content = sip['package']
        sip_format = 'marcxml'
        try:
            sip = SIP.create(sip_format, content, user_id=user_id, agent=agent)
        except SIPUserDoesNotExist:
            logger.exception('User ID {user_id} referred in deposit {depid} '
                             'does not exists.'.format(
                                 user_id=user_id, depid=dep_pid.pid_value))
            sip = SIP.create(sip_format, content, agent=agent)

        # Attach recid to SIP
        if recid:
            try:
                pid = PersistentIdentifier.get(pid_type='recid',
                                               pid_value=str(recid))
                record_sip = RecordSIP(sip_id=sip.id, pid_id=pid.id)
                db.session.add(record_sip)
            except PIDDoesNotExistError:
                logger.exception('Record {recid} referred in '
                                 'Deposit {depid} does not exists.'.format(
                                     recid=recid, depid=dep_pid.pid_value))
                if deposit['_p']['submitted'] == True:
                    logger.exception('Pair {recid}/{depid} was submitted,'
                                     ' (should it be unpublished?).'.format(
                                         recid=recid, depid=dep_pid.pid_value))
                else:
                    logger.exception(
                        'Pair {recid}/{depid} was not submitted.'.format(
                            recid=recid, depid=dep_pid.pid_value))

                # Reserve recid
                pid = PersistentIdentifier.create(pid_type='recid',
                                                  pid_value=str(recid),
                                                  object_type='rec',
                                                  status=PIDStatus.RESERVED)

                if RecordIdentifier.query.get(int(recid)) is None:
                    RecordIdentifier.insert(int(recid))
        if idx == 0:
            for fp, fi in dep_file_instances:
                sipf = SIPFile(sip_id=sip.id, filepath=fp, file_id=fi.id)
                db.session.add(sipf)
    deposit.commit()
    return deposit
Пример #27
0
def create_files_and_sip(deposit, dep_pid):
    """Create deposit Bucket, Files and SIPs."""
    from invenio_pidstore.errors import PIDDoesNotExistError
    from invenio_pidstore.models import PersistentIdentifier
    from invenio_sipstore.errors import SIPUserDoesNotExist
    from invenio_sipstore.models import SIP, RecordSIP, SIPFile
    from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets
    from invenio_db import db
    buc = Bucket.create()
    recbuc = RecordsBuckets(record_id=deposit.id, bucket_id=buc.id)
    db.session.add(recbuc)
    deposit.setdefault('_deposit', dict())
    deposit.setdefault('_files', list())
    files = deposit.get('files', [])
    sips = deposit.get('sips', [])
    recid = None

    if sips:
        recids = [int(sip['metadata']['recid']) for sip in sips]
        if len(set(recids)) > 1:
            logger.error('Multiple recids ({recids}) found in deposit {depid}'
                         ' does not exists.'.format(recids=recids,
                                                    depid=dep_pid.pid_value))
            raise DepositMultipleRecids(dep_pid.pid_value, list(set(recids)))
        elif recids:  # If only one recid
            recid = recids[0]

    # Store the path -> FileInstance mappings for SIPFile creation later
    dep_file_instances = list()

    for file_ in files:
        fi = FileInstance.create()
        fi.set_uri(file_['path'], file_['size'], file_['checksum'])
        ov = ObjectVersion.create(buc, file_['name'], _file_id=fi.id)
        file_meta = dict(
            bucket=str(buc.id),
            key=file_['name'],
            checksum=file_['checksum'],
            size=file_['size'],
            version_id=str(ov.version_id),
        )
        deposit['_files'].append(file_meta)
        dep_file_instances.append((file_['path'], fi))

    for idx, sip in enumerate(sips):
        agent = None
        user_id = None
        if sip['agents']:
            agent = dict(
                ip_address=sip['agents'][0].get('ip_address', ""),
                email=sip['agents'][0].get('email_address', ""),
            )
            user_id = sip['agents'][0]['user_id']
        content = sip['package']
        sip_format = 'marcxml'
        try:
            sip = SIP.create(sip_format,
                             content,
                             user_id=user_id,
                             agent=agent)
        except SIPUserDoesNotExist:
            logger.exception('User ID {user_id} referred in deposit {depid} '
                             'does not exists.'.format(
                                 user_id=user_id, depid=dep_pid.pid_value))
            raise DepositSIPUserDoesNotExist(dep_pid.pid_value, user_id)

        # If recid was found, attach it to SIP
        # TODO: This is always uses the first recid, as we quit if multiple
        # recids are found in the sips information
        if recid:
            try:
                pid = PersistentIdentifier.get(pid_type='recid',
                                               pid_value=recid)
                record_sip = RecordSIP(sip_id=sip.id, pid_id=pid.id)
                db.session.add(record_sip)
            except PIDDoesNotExistError:
                logger.exception('Record {recid} referred in '
                                 'Deposit {depid} does not exists.'.format(
                                     recid=recid, depid=dep_pid.pid_value))
                raise DepositRecidDoesNotExist(dep_pid.pid_value, recid)
        if idx == 0:
            for fp, fi in dep_file_instances:
                sipf = SIPFile(sip_id=sip.id, filepath=fp, file_id=fi.id)
                db.session.add(sipf)
    deposit.commit()
    db.session.commit()
    return deposit
Пример #28
0
def datasets(skip_files):
    """Load demo datasets records."""
    from invenio_db import db
    from invenio_records_files.api import Record
    from invenio_indexer.api import RecordIndexer
    from cernopendata.modules.records.minters.recid import \
        cernopendata_recid_minter
    from cernopendata.modules.records.minters.datasetid import \
        cernopendata_datasetid_minter

    from invenio_files_rest.models import \
        Bucket, FileInstance, ObjectVersion
    from invenio_records_files.models import RecordsBuckets

    indexer = RecordIndexer()
    schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
        'records/datasets-v1.0.0.json'
    )
    data = pkg_resources.resource_filename('cernopendata',
                                           'modules/fixtures/data/datasets')
    datasets_json = glob.glob(os.path.join(data, '*.json'))

    # FIXME: change the treatment of `files` according to `records` fixtures.
    for filename in datasets_json:

        click.echo('Loading datasets from {0} ...'.format(filename))

        with open(filename, 'rb') as source:
            for data in json.load(source):
                files = data.pop('files', [])

                id = uuid.uuid4()
                # (TOFIX) Remove if statement in production
                # as every dataset record should have a doi
                if data.get('doi', None):
                    cernopendata_datasetid_minter(id, data)
                else:
                    cernopendata_recid_minter(id, data)
                data['$schema'] = schema
                record = Record.create(data, id_=id)

                bucket = Bucket.create()
                RecordsBuckets.create(
                    record=record.model, bucket=bucket)

                for file in files:
                    if skip_files:
                        break
                    assert 'uri' in file
                    assert 'size' in file
                    assert 'checksum' in file

                    f = FileInstance.create()
                    filename = file.get("uri").split('/')[-1:][0]
                    f.set_uri(file.get("uri"), file.get(
                        "size"), file.get("checksum"))

                    ObjectVersion.create(
                        bucket,
                        filename,
                        _file_id=f.id
                    )
                db.session.commit()
                indexer.index(record)
                db.session.expunge_all()
Пример #29
0
def test_fileinstance_validation(app, db, dummy_location):
    """Test validating the FileInstance."""
    f = FileInstance.create()
    f.set_uri('x' * 255, 1000, 1000)  # Should not raise
    pytest.raises(ValueError, f.set_uri, 'x' * 256, 1000, 1000)
Пример #30
0
def sips(db, locations, sip_metadata_types):
    """Fixture for the SIP objects sharing multiple files.

    Four SIPs are sharing three files in the following way:
    SIP-1: File1
    SIP-2: File1, File2
    SIP-3: File2(renamed on SIPFile, but same FileInstance), File3
    SIP-4: File4, File5, File6
    """
    # A SIP with agent info
    sip1 = SIP.create(agent={
        'email': '*****@*****.**',
        'orcid': '1111-1111-1111-1111',
        'ip_address': '1.1.1.1'
    })
    sip1api = SIPApi(sip1)
    sip1api.attach_metadata('marcxml-test', '<p>XML 1</p>')
    sip1api.attach_metadata('json-test', '{"title": "JSON 1"}')
    # Metadata 'txt-test', although attached should not be archived
    # (see conftest configuration)
    sip1api.attach_metadata('txt-test', 'Title: TXT 1')
    file1 = FileInstance.create()
    file1.set_contents(BytesIO(b('test')),
                       default_location=locations['default'].uri)
    sip1file1 = SIPFile(sip_id=sip1.id, filepath="foobar.txt",
                        file_id=file1.id)

    db_.session.add(sip1file1)

    sip2 = SIP.create()
    sip2api = SIPApi(sip2)
    sip2api.attach_metadata('marcxml-test', '<p>XML 2</p>')
    sip2api.attach_metadata('json-test', '{"title": "JSON 2"}')
    file2 = FileInstance.create()
    file2.set_contents(BytesIO(b'test-second'),
                       default_location=locations['default'].uri)
    sip2file1 = SIPFile(sip_id=sip2.id, filepath="foobar.txt",
                        file_id=file1.id)
    sip2file2 = SIPFile(sip_id=sip2.id, filepath="foobar2.txt",
                        file_id=file2.id)

    db_.session.add(sip2file1)
    db_.session.add(sip2file2)

    sip3 = SIP.create()
    sip3api = SIPApi(sip3)
    sip3api.attach_metadata('marcxml-test', '<p>XML 3</p>')
    sip3api.attach_metadata('json-test', '{"title": "JSON 3"}')
    file3 = FileInstance.create()
    file3.set_contents(BytesIO(b'test-third'),
                       default_location=locations['default'].uri)
    sip3file2 = SIPFile(sip_id=sip3.id, filepath="foobar2-renamed.txt",
                        file_id=file2.id)
    sip3file3 = SIPFile(sip_id=sip3.id, filepath="foobar3.txt",
                        file_id=file3.id)

    db_.session.add(sip3file2)
    db_.session.add(sip3file3)

    # A SIP with naughty filenames
    sip4 = SIP.create()
    sip4api = SIPApi(sip4)
    sip4api.attach_metadata('marcxml-test', '<p>XML 4 żółć</p>')
    sip4api.attach_metadata('json-test', '{"title": "JSON 4 żółć"}')
    file4 = FileInstance.create()
    file4.set_contents(BytesIO('test-fourth żółć'.encode('utf-8')),
                       default_location=locations['default'].uri)
    file5 = FileInstance.create()
    file5.set_contents(BytesIO('test-fifth ąęćźə'.encode('utf-8')),
                       default_location=locations['default'].uri)

    file6 = FileInstance.create()
    file6.set_contents(BytesIO('test-sixth π'.encode('utf-8')),
                       default_location=locations['default'].uri)
    sip5file4 = SIPFile(sip_id=sip4.id, filepath="../../foobar.txt",
                        file_id=file4.id)

    sip5file5 = SIPFile(sip_id=sip4.id,
                        filepath="http://maliciouswebsite.com/hack.js",
                        file_id=file5.id)

    sip5file6 = SIPFile(sip_id=sip4.id,
                        filepath="łóżźćąę.dat",
                        file_id=file6.id)

    db_.session.add(sip5file4)
    db_.session.add(sip5file5)
    db_.session.add(sip5file6)

    # A SIP with metadata-only changes
    sip5 = SIP.create()
    sip5api = SIPApi(sip5)
    sip5api.attach_metadata('marcxml-test', '<p>XML 5 Meta Only</p>')

    db_.session.commit()
    return [sip1api, sip2api, sip3api, sip4api, sip5api]