def _export_object(self, req, pid):
    """
    Provide a FOXML export of the object identified by ``pid``.

    The ``context`` request parameter selects an archival export when it
    equals ``'archive'``.
    """
    want_archive = (req.get_param('context') == 'archive')
    with get_connection() as conn:
        with conn.cursor() as cursor:
            return foxml.generate_foxml(pid, archival=want_archive,
                                        cursor=cursor)
def purge_all(resource_ids):
    """
    Delete the specified resources, both on disk and in the database.

    Each resource is handled in its own transaction, so a failure part-way
    through leaves earlier deletions committed.

    Args:
        resource_ids: An iterable of resource IDs to purge.
    """
    connection = get_connection()
    for resource_id in resource_ids:
        # One transaction per resource; opening the cursor in the "with"
        # ensures it is closed each iteration (the original leaked one
        # cursor per resource).
        with connection, connection.cursor() as cursor:
            uri = datastream_reader.resource_uri(resource_id,
                                                 cursor).fetchone()[0]
            try:
                path = resolve_uri(uri)
            except KeyError:
                # Non-local URI: we can still drop the DB record below.
                path = '(non-local/unresolvable URI)'
                logger.debug('Unknown schema for %s.', uri)
            else:
                if not os.path.exists(path):
                    # NOTE(review): a missing file also skips the DB
                    # deletion ("continue"), matching the original logic.
                    logger.warning(
                        'Skipping deletion: %s (%s) does not appear to exist.',
                        uri, path)
                    continue
                logger.debug('Deleting %s (%s).', uri, path)
                os.remove(path)
                logger.debug('Deleted %s (%s).', uri, path)
            delete_resource(resource_id, cursor)
            logger.info('Resource ID %s (%s, %s) has been deleted.',
                        resource_id, uri, path)
def _get_datastream_versions(self, pid, dsid):
    """
    Get an iterable of datastream versions.

    Profiles are built oldest-to-newest and returned reversed, so the
    current version comes first.
    """
    with get_connection() as conn, conn.cursor() as cursor:
        ds_info = ds_reader.datastream_from_raw(pid, dsid,
                                                cursor=cursor).fetchone()
        if ds_info is None:
            raise DatastreamDoesNotExistError(pid, dsid)
        old_dss = ds_reader.old_datastreams(ds_info['id'],
                                            cursor=cursor).fetchall()
        profiles = []
        # Fold each old row over a rolling copy of the current row.
        rolling = ds_info.copy()
        for version, old_ds in enumerate(old_dss):
            rolling.update(old_ds)
            rolling['modified'] = old_ds['committed']
            profiles.append(fedora_utils.datastream_to_profile(
                rolling, cursor, version=version))
        # The current datastream gets the highest version number.
        profiles.append(fedora_utils.datastream_to_profile(
            ds_info, cursor, version=len(old_dss)))
        return reversed(profiles)
def collect(age):
    """
    Garbage-collect unreferenced resources.

    Args:
        age: Minimum age in seconds; when falsy, the configured
            ``unreferenced_age`` is used instead.
    """
    bootstrap()
    threshold = (timedelta(seconds=age) if age
                 else timedelta(**_config['unreferenced_age']))
    logger.info('Getting unreferenced objects with an age greater than %s.',
                threshold)
    with get_connection() as conn, conn.cursor() as cursor:
        cursor.execute(
            """
            CREATE TEMPORARY TABLE garbage ON COMMIT DROP AS
                SELECT id
                FROM resource_refcounts
                WHERE refcount = 0 and age(now(), touched) > %s
            WITH DATA
            """, (threshold, ))
        # Named (server-side) cursor so IDs stream instead of
        # materializing the whole result set client-side.
        named_cursor = conn.cursor('dgi_repo_gc', scrollable=False)
        named_cursor.execute('SELECT id FROM garbage')
        purge_all(row[0] for row in named_cursor)
        logger.info('Resource garbage collection complete.')
def _delete_datastream(self, req, pid, dsid):
    """
    Purge the datastream (or range of versions).

    @TODO: handle logMessage when audit is dealt with.
    """
    # Optional ISO-8601 bounds limiting which versions are purged.
    start = utils.iso8601_to_datetime(req.get_param('startDT'))
    end = utils.iso8601_to_datetime(req.get_param('endDT'))
    with get_connection() as conn, conn.cursor() as cursor:
        ds_purger.delete_datastream_versions(pid, dsid, start=start,
                                             end=end, cursor=cursor)
        if not cursor.rowcount:
            # Nothing deleted: distinguish a missing object (an error)
            # from "no versions matched the range".
            object_info = object_reader.object_id_from_raw(
                pid, cursor=cursor).fetchone()
            if object_info is None:
                # Only raise if the object is missing because Fedora.
                raise ObjectDoesNotExistError(pid)
        # NOTE(review): presumably rewrites relationship metadata after
        # the purge — confirm against foxml.internalize_rels.
        foxml.internalize_rels(pid, dsid,
                               req.env['wsgi.identity'].source_id,
                               cursor=cursor)
        return (start, end)
def _get_datastreams(self, pid, asOfDateTime=None):
    """
    Retrieve the list of datastreams for an object.
    """
    with get_connection() as conn, conn.cursor() as cursor:
        object_info = object_reader.object_info_from_raw(
            pid, cursor=cursor).fetchone()
        try:
            object_id = object_info['id']
        except TypeError as e:
            # A missing row comes back as None; treat as a missing object.
            raise ObjectDoesNotExistError(pid) from e

        def describe(ds):
            # Build the per-datastream summary dict.
            mime_row = ds_reader.mime_from_resource(
                ds['resource'], cursor=cursor).fetchone()
            return {
                'dsid': ds['dsid'],
                'label': ds['label'],
                'mimeType': mime_row['mime'] if mime_row is not None else '',
            }

        return [describe(ds)
                for ds in ds_reader.datastreams(object_id,
                                                cursor=cursor).fetchall()]
def _update_datastream(self, req, pid, dsid):
    """
    Commit the modification to the datastream.
    """
    conn = get_connection(ISOLATION_LEVEL_READ_COMMITTED)
    with conn, conn.cursor() as cursor:
        ds_reader.datastream_from_raw(pid, dsid, cursor=cursor)
        ds_info = cursor.fetchone()
        if ds_info is None:
            raise DatastreamDoesNotExistError(pid, dsid)
        # Reshape the current row into "old datastream" form: the current
        # modification time becomes the committed time, and the row's own
        # id becomes the 'datastream' key.
        ds = dict(ds_info)
        ds['committed'] = ds['modified']
        ds['datastream'] = ds['id']
        del ds['id']
        # Check modified date param, exiting if needed.
        modified_date = req.get_param('lastModifiedDate')
        if modified_date is not None:
            modified_date = utils.iso8601_to_datetime(modified_date)
            if ds['committed'] > modified_date:
                raise DatastreamConflictsError(pid, dsid, ds['committed'],
                                               modified_date)
        if ds_info['versioned']:
            # Preserve the current version before it is overwritten.
            ds_writer.upsert_old_datastream(ds, cursor=cursor)
        if ds['resource'] is not None:
            ds['mimetype'] = ds_reader.mime_from_resource(
                ds['resource'], cursor=cursor).fetchone()['mime']
        self._upsert_ds(req, pid, dsid, cursor, ds=ds)
    return
def install_schema():
    """
    Install the application schema to the database.

    Reads and executes resources/dgi_repo.sql in a single transaction.
    """
    schema_path = join(dirname(__file__), 'resources', 'dgi_repo.sql')
    db_connection = get_connection()
    with db_connection:
        with open(schema_path, 'r') as schema_file, \
                db_connection.cursor() as cursor:
            cursor.execute(schema_file.read())
    db_connection.close()
    logger.info('Installed schema.')
def install_base_data():
    """
    Install the application's base data to the database.

    Seeds, in order: relationship namespaces/predicates, the "self"
    source and user, the base PID namespaces, and the base objects
    (each with a default DC datastream). Runs in one transaction.
    """
    db_connection = get_connection(
        isolation_level=ISOLATION_LEVEL_READ_COMMITTED
    )
    with db_connection, db_connection.cursor() as cursor:
        for namespace, predicates in rels.RELATIONS.items():
            # Default relationship data.
            relations_writer.upsert_namespace(namespace, cursor=cursor)
            namespace_id, = cursor.fetchone()
            for predicate in predicates:
                relations_writer.upsert_predicate(
                    {'namespace': namespace_id, 'predicate': predicate},
                    cursor=cursor
                )
        # Default user data.
        source_id = source_writer.upsert_source(
            _config['self']['source'],
            cursor=cursor
        ).fetchone()['id']
        user_id = source_writer.upsert_user(
            {'source': source_id, 'name': _config['self']['username']},
            cursor=cursor
        ).fetchone()['id']
        # Default namespace data: map namespace string -> DB id.
        ns_map = {}
        for namespace in BASE_NAMESPACES:
            ns_map[namespace] = object_writer.upsert_namespace(
                namespace,
                cursor=cursor
            ).fetchone()['id']
        # Default object data.
        for obj in BASE_OBJECTS:
            namespace, pid_id = utils.break_pid(obj)
            obj_info = object_writer.upsert_object(
                {
                    'namespace': ns_map[namespace],
                    'owner': user_id,
                    'pid_id': pid_id,
                },
                cursor=cursor
            ).fetchone()
            # Add DC DS as Fedora objects have DC and Islandora expects it.
            foxml.create_default_dc_ds(obj_info['id'], obj, cursor=cursor)
    db_connection.close()
    logger.info('Installed base data.')
def _create_datastream(self, req, pid, dsid):
    """
    Persist the new datastream.

    Raises:
        DatastreamExistsError: A datastream with this dsid already
            exists on the object.
    """
    conn = get_connection(ISOLATION_LEVEL_READ_COMMITTED)
    with conn, conn.cursor() as cursor:
        existing = ds_reader.datastream_from_raw(pid, dsid,
                                                 cursor=cursor).fetchone()
        if existing:
            raise DatastreamExistsError(pid, dsid)
        self._upsert_ds(req, pid, dsid, cursor)
def _get_datastream_info(self, pid, dsid, asOfDateTime=None, **kwargs):
    """
    Get the ds* values in a dict, to build the datastream profile.
    """
    with get_connection() as conn, conn.cursor() as cursor:
        ds_info = ds_reader.datastream_from_raw(pid, dsid,
                                                cursor=cursor).fetchone()
        if ds_info is None:
            raise DatastreamDoesNotExistError(pid, dsid)
        if asOfDateTime is not None:
            # Resolve the version that was current at the given time.
            as_of = utils.iso8601_to_datetime(asOfDateTime)
            ds_info = ds_reader.datastream_as_of_time(ds_info['id'], as_of,
                                                      cursor=cursor)
            if ds_info is None:
                raise DatastreamDoesNotExistError(pid, dsid, as_of)
        return fedora_utils.datastream_to_profile(ds_info, cursor)
def _update_object(self, req, pid):
    """
    Commit the object modification.

    Returns the object's new 'modified' timestamp.
    """
    with get_connection() as conn, conn.cursor() as cursor:
        # Get current object info.
        object_info = object_reader.object_info_from_raw(
            pid, cursor=cursor).fetchone()
        if not object_info:
            raise ObjectDoesNotExistError(pid)
        object_info = dict(object_info)
        # Check modified date param, exiting if needed.
        modified_date = req.get_param('lastModifiedDate')
        if modified_date is not None:
            modified_date = utils.iso8601_to_datetime(modified_date)
            if object_info['modified'] > modified_date:
                raise ObjectConflictsError(pid, object_info['modified'],
                                           modified_date)
        # Create old version of object.
        if object_info['versioned']:
            # NOTE(review): old_object_info aliases object_info (no
            # copy), so the "del ...['id']" below also mutates
            # object_info (and hence new_object_info).
            old_object_info = object_info
            old_object_info['committed'] = object_info['modified']
            old_object_info['object'] = object_info['id']
            del old_object_info['id']
            object_writer.upsert_old_object(old_object_info, cursor=cursor)
            cursor.fetchone()
        # Update object info (new_object_info also aliases object_info).
        new_object_info = object_info
        new_object_info['label'] = req.get_param(
            'label', default=object_info['label'])
        new_object_info['state'] = req.get_param(
            'state', default=object_info['state'])
        if req.get_param('ownerId') is not None:
            new_object_info['owner'] = self._resolve_owner(req, cursor)
        if req.get_param('logMessage') is not None:
            new_object_info['log'] = resolve_log(req, cursor)
        del new_object_info['modified']
        object_id = object_writer.upsert_object(
            new_object_info, cursor=cursor).fetchone()['id']
        return object_reader.object_info(
            object_id, cursor=cursor).fetchone()['modified']
def _purge_object(self, req, pid):
    """
    Purge the object.

    @TODO: handle logMessage when audit is dealt with.
    """
    with get_connection() as conn, conn.cursor() as cursor:
        object_info = object_reader.object_info_from_raw(pid,
                                                         cursor).fetchone()
        if object_info is None:
            raise ObjectDoesNotExistError(pid)
        object_id = object_info['id']
        # Refuse to purge anything another object still references.
        if object_relation_reader.is_object_referenced(object_id, cursor):
            raise ValueError(
                'Not purging {} as it is referenced.'.format(pid))
        object_purger.delete_object(object_id, cursor)
def _get_ds_dissemination(self, req, pid, dsid):
    """
    Provide datastream content.

    Returns:
        None when the datastream has no resource; otherwise a dict that
        may contain:
        - 'mime': the resource's MIME type, when known.
        - 'location': the external URI, for redirect ('R') datastreams.
        - 'stream': an open binary file handle, for local content.

    Raises:
        ObjectDoesNotExistError: No such object.
        DatastreamDoesNotExistError: No such datastream (possibly at the
            requested asOfDateTime).
    """
    with get_connection() as conn, conn.cursor() as cursor:
        object_info = object_reader.object_id_from_raw(
            pid, cursor=cursor).fetchone()
        if object_info is None:
            raise ObjectDoesNotExistError(pid)
        time = utils.iso8601_to_datetime(req.get_param('asOfDateTime'))
        ds_info = ds_reader.datastream(
            {
                'object': object_info['id'],
                'dsid': dsid
            },
            cursor=cursor).fetchone()
        if ds_info is None:
            raise DatastreamDoesNotExistError(pid, dsid)
        if time is not None:
            ds_info = ds_reader.datastream_as_of_time(
                ds_info['id'], time, cursor)
            if ds_info is None:
                raise DatastreamDoesNotExistError(pid, dsid, time)
        # Pass our cursor so the lookup runs in this transaction; the
        # original omitted it here, unlike the identical call in
        # _get_info.
        resource_info = ds_reader.resource(ds_info['resource'],
                                           cursor=cursor).fetchone()
        if resource_info is None:
            return None
        info = {}
        mime_info = ds_reader.mime_from_resource(resource_info['id'],
                                                 cursor=cursor).fetchone()
        if mime_info:
            info['mime'] = mime_info['mime']
        # Redirect if we are a redirect DS.
        if ds_info['control_group'] == 'R':
            info['location'] = resource_info['uri']
        else:
            # Send data if we are not a redirect DS.
            file_path = filestore.resolve_uri(resource_info['uri'])
            info['stream'] = open(file_path, 'rb')
        return info
def _get_info(self, pid, dsid):
    """
    Get the MIME-type and URI of the given datastream.

    Returns:
        A three-tuple comprising:
        - the datastream control group
        - the URI of the resource the datastream represents
        - the MIME type of the datastream's resource

    Raises:
        DatastreamDoesNotExistError: The datastream doesn't exist.
    """
    with get_connection() as conn, conn.cursor() as cursor:
        ds_row = ds_reader.datastream_from_raw(pid, dsid,
                                               cursor=cursor).fetchone()
        if ds_row is None:
            raise DatastreamDoesNotExistError(pid, dsid)
        resource_row = ds_reader.resource(ds_row['resource'],
                                          cursor=cursor).fetchone()
        mime_row = ds_reader.mime(resource_row['mime'],
                                  cursor=cursor).fetchone()
    return (ds_row['control_group'], resource_row['uri'], mime_row['mime'])
def _get_object(self, req, pid):
    """
    Generate the object profile XML.

    This does not respect asOfDateTime from Fedora.
    """
    with get_connection() as conn, conn.cursor() as cursor:
        # Get object info.
        object_info = object_reader.object_info_from_raw(
            pid, cursor=cursor).fetchone()
        if object_info is None:
            raise ObjectDoesNotExistError(pid)
        # Gather the "info:fedora/..." URI of each content model.
        object_relation_reader.read_relationship(
            relations.FEDORA_MODEL_NAMESPACE,
            relations.HAS_MODEL_PREDICATE,
            object_info['id'],
            cursor=cursor)
        models = set()
        for rdf_object_info in cursor.fetchall():
            model_object_info = object_reader.object_info(
                rdf_object_info['rdf_object'], cursor=cursor).fetchone()
            object_reader.namespace_info(model_object_info['namespace'],
                                         cursor=cursor)
            namespace = cursor.fetchone()['namespace']
            model_pid = utils.make_pid(namespace,
                                       model_object_info['pid_id'])
            models.add('info:fedora/{}'.format(model_pid))
        source_reader.user(object_info['owner'], cursor=cursor)
        owner = cursor.fetchone()['name']
        return (pid, object_info['label'], models, object_info['created'],
                object_info['modified'], object_info['state'], owner)
def import_file(info, source, force, index, gsearch_url, gsearch_user,
                gsearch_password):
    """
    Import FOXML from a file or directory tree.

    Args:
        info: Path to a FOXML file, or a directory to scan recursively.
        source: Source ID to ingest as, or None to use the configured
            "self" source.
        force: If truthy, purge and re-ingest objects that already exist.
        index: If truthy, send each ingested PID to GSearch afterwards.
        gsearch_url: GSearch endpoint used when indexing.
        gsearch_user: GSearch user name.
        gsearch_password: GSearch password.
    """
    utils.bootstrap()
    if index:
        pids = list()

        def _import_foxml(*args, **kwargs):
            # Track ingested PIDs so they can be indexed afterwards.
            pid = import_foxml(*args, **kwargs)
            logger.info('Ingested %s.', pid)
            pids.append(pid)
    else:
        def _import_foxml(*args, **kwargs):
            pid = import_foxml(*args, **kwargs)
            logger.info('Ingested %s.', pid)

    def scan(directory):
        # Recursively yield directory entries under "directory".
        for entry in scandir(directory):
            if entry.is_dir():
                yield from scan(entry.path)
            else:
                yield entry

    def get_paths():
        # A directory yields all contained files (sorted); a file, itself.
        if os.path.isdir(info):
            return sorted(ent.path for ent in scan(info) if ent.is_file())
        else:
            return [info]

    conn = get_connection(isolation_level=ISOLATION_LEVEL_READ_COMMITTED)
    savepoint = 'subtransaction'
    with conn, conn.cursor() as cursor:
        if source is None:
            source = upsert_source(_config['self']['source'],
                                   cursor=cursor).fetchone()['id']
        for path in get_paths():
            try:
                # Savepoint lets a single failed import roll back without
                # losing the whole batch's transaction.
                cursor.execute('SAVEPOINT {}'.format(savepoint))
                _import_foxml(path, source, cursor=cursor)
            except ObjectExistsError as e:
                logger.warning('Object already exists "%s".', e.pid)
                cache.clear_cache()
                cursor.execute('ROLLBACK TO SAVEPOINT {}'.format(savepoint))
                if force:
                    logger.debug('Purging and reingesting %s.', e.pid)
                    object_id_from_raw(e.pid, cursor=cursor)
                    object_id = cursor.fetchone()[0]
                    # Out with the old.
                    delete_object(object_id, cursor=cursor)
                    # In with the new.
                    _import_foxml(path, source, cursor=cursor)
            finally:
                cursor.execute('RELEASE SAVEPOINT {}'.format(savepoint))
    if index:
        # Ask GSearch to (re)index each ingested PID.
        s = requests.Session()
        for pid in pids:
            r = s.get(gsearch_url,
                      auth=(gsearch_user, gsearch_password),
                      params={
                          'operation': 'updateIndex',
                          'action': 'fromPid',
                          'value': pid,
                      })
            # GSearch can return 200 with an exception in the body.
            if (r.status_code == requests.codes.okay
                    and 'exception' not in r.text):
                logger.debug('Indexed %s.', pid)
            else:
                logger.warning('Failed to index %s.', pid)
def authenticate(identity):
    """
    Check if the given identity is valid, and set the relevant roles.

    Likely used with talons.auth.external.Authenticator.

    Parameters:
        identity: An talons.auth.interfaces.Identity instance.

    Returns:
        A boolean indicating if the given identity authenticates.
    """
    if not hasattr(identity, 'site') or identity.site is None:
        logger.debug('Got request without site token.')
        return None
    if identity.login == 'anonymous' and identity.key == 'anonymous':
        # Quick anonymous check.
        identity.drupal_user_id = 0
        identity.roles.add('anonymous user')
        cursor = source_reader.source_id(identity.site)
        if not cursor.rowcount:
            # First time this site has been seen; create its source row.
            sources.upsert_source(identity.site, cursor=cursor)
        identity.source_id = cursor.fetchone()['id']
        cursor.close()
        logger.debug('Anonymous user logged in from %s.', identity.site)
        return True
    # Grab the config for the selected site.
    try:
        db_info = _config['drupal_sites'][identity.site]['database']
    except KeyError:
        logger.info('Site not in configuration: %s.', identity.site)
        return False
    # Per-site override query, or the default Drupal user/role lookup.
    query = db_info['query'] if 'query' in db_info else '''SELECT DISTINCT u.uid, r.name
FROM (users u LEFT JOIN users_roles ON u.uid=users_roles.uid)
    LEFT JOIN role r ON r.rid=users_roles.rid
WHERE u.name=%s AND u.pass=%s'''
    try:
        # Get a DB connection and cursor for the selected site.
        conn = get_auth_connection(identity.site)
        auth_cursor = conn.cursor()
        # Check the credentials against the selected site (using provided
        # query or a default).
        auth_cursor.execute(query, (identity.login, identity.key))
        if auth_cursor.rowcount > 0:
            # Every returned row shares the same uid; collect all roles.
            identity.drupal_user_id = None
            for uid, role in auth_cursor:
                if identity.drupal_user_id is None:
                    identity.drupal_user_id = uid
                identity.roles.add(role)
            identity.roles.add('authenticated user')
            logger.info('Authenticated %s:%s with roles: %s',
                        identity.site, identity.login, identity.roles)
            with get_connection() as connection:
                with connection.cursor() as cursor:
                    # Most requests won't be from new users.
                    user_info = source_reader.source_and_user_from_raw(
                        identity.site,
                        identity.login,
                        cursor=cursor
                    ).fetchone()
                    if user_info is not None:
                        identity.source_id = user_info['source_id']
                        identity.user_id = user_info['user_id']
                    else:
                        # New user (and possibly site): create the rows.
                        sources.upsert_source(identity.site, cursor=cursor)
                        identity.source_id = cursor.fetchone()['id']
                        sources.upsert_user(
                            {'name': identity.login,
                             'source': identity.source_id},
                            cursor=cursor
                        )
                        identity.user_id = cursor.fetchone()['id']
            return True
        else:
            logger.info('Failed to authenticate %s:%s.', identity.site,
                        identity.login)
            return False
    except:
        logger.exception('Error while authenticating with Drupal credentials.')
    finally:
        # Best-effort cleanup: either name may be unbound if setup failed
        # part-way through.
        try:
            auth_cursor.close()
        except UnboundLocalError:
            logger.debug('Failed before allocating DB cursor.')
        try:
            conn.close()
        except UnboundLocalError:
            logger.debug('Failed before creating DB connection.')
    # Reached only when the bare except above swallowed an error.
    return False
def _create_object(self, req, pid):
    """
    Create the new object.

    The object may come from an uploaded FOXML file, FOXML in the request
    body, or be created empty with a generated or reserved PID.

    Returns the PID of the created object.

    Raises:
        ObjectExistsError: An object with the resolved PID already exists.
    """
    conn = get_connection(ISOLATION_LEVEL_READ_COMMITTED)
    with conn, conn.cursor() as cursor:
        if not pid or pid == 'new':
            import_pid = None
        else:
            import_pid = pid
        try:
            # Import FOXML, getting PID.
            pid = foxml.import_foxml(req.get_param('file').file,
                                     req.env['wsgi.identity'].source_id,
                                     pid=import_pid,
                                     cursor=cursor)
        except AttributeError:
            # No uploaded 'file' param (get_param returned None).
            if req.content_length:
                # Try to import FOXML from request body.
                pid = foxml.import_foxml(
                    req.stream,
                    req.env['wsgi.identity'].source_id,
                    pid=import_pid,
                    cursor=cursor)
            else:
                if not pid or pid == 'new':
                    # Generate PID.
                    raw_namespace = req.get_param(
                        'namespace', default=_config['default_namespace'])
                    object_writer.get_pid_id(raw_namespace, cursor=cursor)
                    pid_id, namespace = cursor.fetchone()
                    pid = utils.make_pid(raw_namespace, pid_id)
                else:
                    # Reserve given PID in namespace.
                    raw_namespace, pid_id = utils.break_pid(pid)
                    namespace = cache.repo_object_namespace_id(
                        raw_namespace, cursor=cursor)
                    # Jump up PIDs if needed.
                    object_writer.jump_pids(namespace, pid_id,
                                            cursor=cursor)
                # Figure out the owner's DB ID.
                owner = self._resolve_owner(req, cursor)
                # Figure out the log's DB ID.
                log = resolve_log(req, cursor)
                try:
                    object_writer.write_object(
                        {
                            'namespace': namespace,
                            'state': req.get_param('state', default='A'),
                            'label': req.get_param('label'),
                            'log': log,
                            'pid_id': pid_id,
                            'owner': owner,
                        },
                        cursor=cursor)
                except IntegrityError as e:
                    raise ObjectExistsError(pid) from e
                # Add DC DS to the freshly written object row.
                foxml.create_default_dc_ds(cursor.fetchone()[0], pid,
                                           cursor=cursor)
    conn.close()
    return pid
def stash(data, destination_scheme=UPLOAD_SCHEME,
          mimetype='application/octet-stream'):
    """
    Persist data, likely in our data directory.

    Args:
        data: Either a file-like object or a (byte)string to dump into a
            file. Please make all use of it before passing to stash as
            the stashed copy will not be updated. If data is a file-like
            object it will be closed.
        destination_scheme: One of URI_MAP's keys. Defaults to
            UPLOADED_URI.
        mimetype: The MIME-type of the file.

    Returns:
        The resource_id and URI of the stashed resource.
    """
    def streamify():
        """
        Get the "data" as a file-like object.
        """
        if hasattr(data, 'read'):
            logger.debug('Data appears file-like.')
            # A readable item may not have an exit, so lets read and wrap
            # it.
            if not hasattr(data, '__exit__'):
                return BytesIO(data.read())
            return data
        elif hasattr(data, 'encode'):
            logger.debug('Data appears to be an (encodable) string.')
            return BytesIO(data.encode())
        else:
            logger.debug('Unknown data type: attempting to wrap in a'
                         ' BytesIO.')
            return BytesIO(data)

    destination = _URI_MAP[destination_scheme]
    connection = get_connection()
    # delete=False: the stashed file must outlive this function.
    with NamedTemporaryFile(delete=False, **destination) as dest:
        try:
            name = os.path.relpath(dest.name, destination['dir'])
            uri = '{}://{}'.format(destination_scheme, name)
            with streamify() as src:
                with connection:
                    # XXX: This _must_ happen as a separate transaction,
                    # so we know that the resource is tracked when it is
                    # present in the relevant directory (and so might be
                    # garbage collected).
                    cursor = connection.cursor()
                    cursor = datastream_writer.upsert_mime(mimetype,
                                                           cursor)
                    mime_id = cursor.fetchone()[0]
                    datastream_writer.upsert_resource(
                        {
                            'uri': uri,
                            'mime': mime_id,
                        },
                        cursor=cursor)
                logger.debug('Stashing data as %s.', dest.name)
                copyfileobj(src, dest)
                # This is our Raison d'etre, make sure the file is out.
                dest.flush()
                os.fsync(dest.fileno())
        except:
            # Best-effort cleanup of the partial file before re-raising.
            logger.exception('Attempting to delete %s (%s) due to'
                             ' exception.', uri, dest.name)
            os.remove(dest.name)
            raise
        else:
            resource_id = cursor.fetchone()[0]
            logger.debug('%s got resource id %s', uri, resource_id)
            return resource_id, uri
    # NOTE(review): appears unreachable (the try either raises or the
    # else returns) — confirm before removing.
    return