class Subscription(db.Model, IdModel, SoftDeleteModel):
    """Links a role to a notification channel it wants updates from."""

    channel = db.Column(db.String(255), index=True)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    role = db.relationship(Role)

    @classmethod
    def find(cls, channel=None, role_id=None, deleted=False):
        """Return the first subscription matching the given filters.

        Either filter may be omitted; ``deleted`` includes soft-deleted rows.
        """
        query = cls.all(deleted=deleted)
        if channel is not None:
            query = query.filter(cls.channel == channel)
        if role_id is not None:
            query = query.filter(cls.role_id == role_id)
        return query.first()

    @classmethod
    def subscribe(cls, role, channel):
        """Create — or re-activate — a subscription of ``role`` to ``channel``."""
        subscription = cls.find(channel=channel, role_id=role.id)
        if subscription is None:
            subscription = cls()
            subscription.channel = channel
            subscription.role_id = role.id
        # Clearing the tombstone re-activates a previously soft-deleted row.
        subscription.deleted_at = None
        db.session.add(subscription)
        return subscription

    @classmethod
    def unsubscribe(cls, role=None, channel=None, deleted_at=None):
        """Soft-delete every subscription matching ``role`` and/or ``channel``.

        At least one of the two filters must be given.
        """
        assert role is not None or channel is not None
        deleted_at = deleted_at or datetime.utcnow()
        query = db.session.query(cls)
        if role is not None:
            query = query.filter(cls.role_id == role.id)
        if channel is not None:
            query = query.filter(cls.channel == channel)
        query.update({cls.deleted_at: deleted_at}, synchronize_session=False)
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access control is
    enforced."""

    # Category schema for collections.
    # TODO: should this be configurable?
    CATEGORIES = {
        'news': lazy_gettext('News archives'),
        'leak': lazy_gettext('Leaks'),
        'land': lazy_gettext('Land registry'),
        'gazette': lazy_gettext('Gazettes'),
        'court': lazy_gettext('Court archives'),
        'company': lazy_gettext('Company registries'),
        'sanctions': lazy_gettext('Sanctions lists'),
        'procurement': lazy_gettext('Procurement'),
        'finance': lazy_gettext('Financial records'),
        'grey': lazy_gettext('Grey literature'),
        'library': lazy_gettext('Document libraries'),
        'license': lazy_gettext('Licenses and concessions'),
        'regulatory': lazy_gettext('Regulatory filings'),
        'poi': lazy_gettext('Persons of interest'),
        'customs': lazy_gettext('Customs declarations'),
        'census': lazy_gettext('Population census'),
        'transport': lazy_gettext('Air and maritime registers'),
        'other': lazy_gettext('Other material')
    }

    # Fallback category when none of the above is set.
    DEFAULT = 'other'

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    # FIX: use a callable default so each new row gets a fresh list instead
    # of all rows sharing a single module-level list object.
    countries = db.Column(ARRAY(db.Unicode()), default=list)
    languages = db.Column(ARRAY(db.Unicode()), default=list)
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)
    publisher = db.Column(db.Unicode, nullable=True)
    publisher_url = db.Column(db.Unicode, nullable=True)
    info_url = db.Column(db.Unicode, nullable=True)
    data_url = db.Column(db.Unicode, nullable=True)

    # A casefile is a type of collection which is used to manage the state
    # of an investigation. Unlike normal collections, cases do not serve
    # as source material, but as a mechanism of analysis.
    casefile = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'),
                           nullable=True)
    creator = db.relationship(Role)

    def touch(self):
        """Bump the modification timestamp and mark the row dirty."""
        # https://www.youtube.com/watch?v=wv-34w8kGPM
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    def update(self, data, authz):
        """Apply user-submitted fields from ``data``.

        Admin-only fields (category, casefile flag, creator) are applied
        only when ``authz`` has admin rights. Grants full permissions to
        the creator after flushing.
        """
        self.label = data.get('label', self.label)
        # FIX: the summary assignment was duplicated in the original.
        self.summary = data.get('summary', self.summary)
        self.publisher = data.get('publisher', self.publisher)
        self.publisher_url = data.get('publisher_url', self.publisher_url)
        self.info_url = data.get('info_url', self.info_url)
        self.data_url = data.get('data_url', self.data_url)
        self.countries = ensure_list(data.get('countries', self.countries))
        self.languages = ensure_list(data.get('languages', self.languages))
        # Some fields are editable only by admins in order to have
        # a strict separation between source evidence and case
        # material.
        if authz.is_admin:
            self.category = data.get('category', self.category)
            self.casefile = as_bool(data.get('casefile'),
                                    default=self.casefile)
            creator = Role.by_id(data.get('creator_id'))
            if creator is not None:
                self.creator = creator
        self.touch()
        db.session.flush()
        if self.creator is not None:
            Permission.grant(self, self.creator, True, True)

    @property
    def team_id(self):
        """String IDs of all non-system roles with (non-deleted) read
        permission on this collection."""
        role = aliased(Role)
        perm = aliased(Permission)
        q = db.session.query(role.id)
        q = q.filter(role.type != Role.SYSTEM)
        q = q.filter(role.id == perm.role_id)
        q = q.filter(perm.collection_id == self.id)
        q = q.filter(perm.read == True)  # noqa
        q = q.filter(role.deleted_at == None)  # noqa
        q = q.filter(perm.deleted_at == None)  # noqa
        return [stringify(i) for (i, ) in q.all()]

    @property
    def secret(self):
        """True when no public role holds read permission."""
        q = db.session.query(Permission.id)
        q = q.filter(Permission.role_id.in_(Role.public_roles()))
        q = q.filter(Permission.collection_id == self.id)
        q = q.filter(Permission.read == True)  # noqa
        q = q.filter(Permission.deleted_at == None)  # noqa
        return q.count() < 1

    @property
    def ns(self):
        # Lazily cache the namespace derived from the foreign ID.
        if not hasattr(self, '_ns'):
            self._ns = Namespace(self.foreign_id)
        return self._ns

    def to_dict(self):
        """Serialize for the API; falls back to DEFAULT for unknown
        categories."""
        data = self.to_dict_dates()
        data['category'] = self.DEFAULT
        if self.category in self.CATEGORIES:
            data['category'] = self.category
        data['kind'] = 'casefile' if self.casefile else 'source'
        data.update({
            'id': stringify(self.id),
            'collection_id': stringify(self.id),
            'foreign_id': self.foreign_id,
            'creator_id': stringify(self.creator_id),
            'team_id': self.team_id,
            'label': self.label,
            'summary': self.summary,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'info_url': self.info_url,
            'data_url': self.data_url,
            'casefile': self.casefile,
            'secret': self.secret
        })
        return data

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Fetch by foreign ID; returns None for a None argument."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def _apply_authz(cls, q, authz):
        # Admins (and authz=None, i.e. internal calls) see everything.
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def all_authz(cls, authz, deleted=False):
        """All collections readable under ``authz``."""
        q = super(Collection, cls).all(deleted=deleted)
        return cls._apply_authz(q, authz)

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Collections with the given IDs, filtered by ``authz``."""
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        return cls._apply_authz(q, authz)

    @classmethod
    def create(cls, data, authz, created_at=None):
        """Create (or un-delete and update) a collection.

        A soft-deleted collection with the same foreign ID is revived
        instead of creating a duplicate.
        """
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.created_at = created_at
            collection.foreign_id = foreign_id
            collection.category = cls.DEFAULT
            collection.casefile = True
            collection.creator_id = authz.id
        collection.update(data, authz)
        collection.deleted_at = None
        return collection

    def __repr__(self):
        fmt = '<Collection(%r, %r, %r)>'
        return fmt % (self.id, self.foreign_id, self.label)
# NOTE(review): this chunk appears to be the tail of the Role class (the
# methods reference `self.password_digest` etc.) with the class header
# outside this view, followed by a module-level relationship definition.
@classmethod
def all_system(cls):
    # All roles of the built-in "system" type.
    return cls.all().filter(Role.type == Role.SYSTEM)

def set_password(self, secret):
    """Hashes and sets the role password.

    :param str secret: The password to be set.
    """
    self.password_digest = generate_password_hash(secret)

def check_password(self, secret):
    """Checks the password if it matches the role password hash.

    :param str secret: The password to be checked.
    :rtype: bool
    """
    # A missing digest is treated as an empty hash so the check fails
    # instead of raising.
    digest = self.password_digest or ''
    return check_password_hash(digest, secret)

def __repr__(self):
    return '<Role(%r,%r)>' % (self.id, self.foreign_id)


# Self-referential many-to-many: groups contain member roles via the
# `membership` association table; the backref exposes each member's
# groups as `role.roles`.
Role.members = db.relationship(Role, secondary=membership,
                               primaryjoin=Role.id == membership.c.group_id,
                               secondaryjoin=Role.id == membership.c.member_id,
                               backref="roles")
class Notification(db.Model, IdModel, DatedModel):
    """An event notification, fanned out to roles via their channel
    subscriptions."""

    GLOBAL = 'Global'

    # The event is stored by name; the `event` hybrid property maps it
    # to the Events registry object.
    _event = db.Column('event', db.String(255), nullable=False)
    channels = db.Column(ARRAY(db.String(255)), index=True)
    params = db.Column(JSONB)
    actor_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    actor = db.relationship(Role)

    @hybrid_property
    def event(self):
        return Events.get(self._event)

    @event.setter
    def event(self, event):
        self._event = event.name

    @property
    def recipients(self):
        """Query for distinct, live roles (with an email address) that are
        subscribed to any of this notification's channels."""
        q = db.session.query(Role)
        q = q.join(Subscription, Subscription.role_id == Role.id)
        q = q.filter(Subscription.channel.in_(self.channels))
        q = q.filter(Role.email != None)  # noqa
        q = q.filter(Role.deleted_at == None)  # noqa
        q = q.filter(Subscription.deleted_at == None)  # noqa
        q = q.distinct()
        return q

    def iterparams(self):
        """Yield (name, type, value) triples for the actor and all set
        event parameters."""
        if self.actor_id is not None:
            yield 'actor', Role, self.actor_id
        if self.event is None:
            return
        for name, clazz in self.event.params.items():
            value = self.params.get(name)
            if value is not None:
                yield name, clazz, value

    @classmethod
    def publish(cls, event, actor_id=None, channels=None, params=None):
        """Create and stage a notification for the given event.

        FIX: ``channels`` and ``params`` previously used mutable default
        arguments (``[]`` / ``{}``), which are shared across calls and a
        classic Python pitfall; they now default to None.
        """
        notf = cls()
        notf.event = event
        notf.actor_id = actor_id
        notf.params = params if params is not None else {}
        channels = channels if channels is not None else []
        # De-duplicate channels and drop None entries.
        notf.channels = list(set([c for c in channels if c is not None]))
        db.session.add(notf)
        return notf

    @classmethod
    def by_role(cls, role, since=None):
        """Notifications matching a role's subscriptions, newest first.

        Excludes notifications the role itself caused, and anything
        older than ``since`` or the role's last-notified timestamp.
        """
        columns = array_agg(Subscription.channel).label('channels')
        sq = db.session.query(columns)
        sq = sq.filter(Subscription.deleted_at == None)  # noqa
        sq = sq.filter(Subscription.role_id == role.id)
        sq = sq.cte('sq')
        q = cls.all()
        q = q.filter(or_(
            cls.actor_id != role.id,
            cls.actor_id == None  # noqa
        ))
        q = q.filter(cls.channels.overlap(sq.c.channels))
        q = q.filter(cls._event.in_(Events.names()))
        if since is not None:
            q = q.filter(cls.created_at >= since)
        if role.notified_at is not None:
            q = q.filter(cls.created_at >= role.notified_at)
        q = q.order_by(cls.created_at.desc())
        q = q.order_by(cls.id.desc())
        return q

    @classmethod
    def by_channel(cls, channel):
        """All known-event notifications on ``channel``, newest first."""
        q = cls.all()
        q = q.filter(cls.channels.any(channel))
        q = q.filter(cls._event.in_(Events.names()))
        q = q.order_by(cls.created_at.desc())
        q = q.order_by(cls.id.desc())
        return q
class EntitySetItem(db.Model, SoftDeleteModel):
    """Membership of an entity in an entity set, carrying a judgement."""

    __tablename__ = "entityset_item"

    id = db.Column(db.Integer, primary_key=True)
    entityset_id = db.Column(db.String(ENTITY_ID_LEN),
                             db.ForeignKey("entityset.id"), index=True)
    entity_id = db.Column(db.String(ENTITY_ID_LEN), index=True)
    collection_id = db.Column(db.Integer,
                              db.ForeignKey("collection.id"), index=True)
    # Entity this one was compared against when the judgement was made.
    compared_to_entity_id = db.Column(db.String(ENTITY_ID_LEN))
    added_by_id = db.Column(db.Integer, db.ForeignKey("role.id"))
    judgement = db.Column(db.Enum(Judgement))

    entityset = db.relationship(EntitySet)
    collection = db.relationship(Collection)
    added_by = db.relationship(Role)

    @classmethod
    def by_entity_id(cls, entityset, entity_id):
        # Most recently created item for this entity in the given set.
        q = cls.all()
        q = q.filter(cls.entityset_id == entityset.id)
        q = q.filter(cls.entity_id == entity_id)
        q = q.order_by(cls.created_at.desc())
        return q.first()

    @classmethod
    def save(cls, entityset, entity_id, judgement=None,
             collection_id=None, **data):
        """Store a judgement for an entity in a set.

        Defaults to a POSITIVE judgement. An existing item with the same
        judgement is returned unchanged; otherwise it is soft-deleted
        and replaced. NO_JUDGEMENT only removes the existing item.
        """
        if judgement is None:
            judgement = Judgement.POSITIVE
        else:
            # Coerce raw values (e.g. strings) into the enum.
            judgement = Judgement(judgement)
        existing = cls.by_entity_id(entityset, entity_id)
        if existing is not None:
            if existing.judgement == judgement:
                return existing
            existing.delete()
        if judgement == Judgement.NO_JUDGEMENT:
            return
        item = cls(
            entityset_id=entityset.id,
            entity_id=entity_id,
            judgement=judgement,
            compared_to_entity_id=data.get("compared_to_entity_id"),
            collection_id=collection_id or entityset.collection_id,
            added_by_id=data.get("added_by_id"),
        )
        db.session.add(item)
        return item

    @classmethod
    def delete_by_collection(cls, collection_id):
        """Hard-delete all items tied to a collection."""
        # Items whose own collection_id matches.
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)
        # Items belonging to entity sets owned by the collection.
        # NOTE(review): this is a bulk delete with a correlated filter
        # against EntitySet — confirm the backend emits the intended
        # subquery rather than an invalid multi-table DELETE.
        pq = db.session.query(cls)
        pq = pq.filter(EntitySet.collection_id == collection_id)
        pq = pq.filter(EntitySet.id == cls.entityset_id)
        pq.delete(synchronize_session=False)

    @classmethod
    def delete_by_entity(cls, entity_id):
        # Hard-delete every item referencing the entity, across all sets.
        pq = db.session.query(cls)
        pq = pq.filter(cls.entity_id == entity_id)
        pq.delete(synchronize_session=False)

    def to_dict(self):
        """Serialize for the API; judgement is emitted as its raw value."""
        data = self.to_dict_dates()
        data.update({
            "entityset_id": self.entityset_id,
            "entity_id": self.entity_id,
            "collection_id": self.collection_id,
            "added_by_id": self.added_by_id,
            "compared_to_entity_id": self.compared_to_entity_id,
        })
        if self.judgement:
            data["judgement"] = self.judgement.value
        return data

    def __repr__(self):
        return "<EntitySetItem(%r, %r)>" % (self.entityset_id, self.entity_id)
class Role(db.Model, IdModel, SoftDeleteModel):
    """A user, group or other access control subject."""
    __tablename__ = 'role'

    # Role types. NOTE(review): the literal values of USER and SYSTEM_USER
    # read as '******' — they look masked by a secrets scrubber; confirm
    # the intended constants against the canonical source.
    USER = '******'
    GROUP = 'group'
    SYSTEM = 'system'
    TYPES = [USER, GROUP, SYSTEM]

    # Foreign IDs of the built-in system roles.
    SYSTEM_GUEST = 'guest'
    SYSTEM_USER = '******'

    #: Generates URL-safe signatures for invitations.
    SIGNATURE = URLSafeTimedSerializer(settings.SECRET_KEY)

    #: Signature maximum age, defaults to 1 day
    SIGNATURE_MAX_AGE = 60 * 60 * 24

    #: Password minimum length
    PASSWORD_MIN_LENGTH = 6

    foreign_id = db.Column(db.Unicode(2048), nullable=False, unique=True)
    name = db.Column(db.Unicode, nullable=False)
    email = db.Column(db.Unicode, nullable=True)
    api_key = db.Column(db.Unicode, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    type = db.Column(db.Enum(*TYPES, name='role_type'), nullable=False)
    password_digest = db.Column(db.Unicode, nullable=True)
    # Transient plain-text password holder; never persisted.
    password = None
    reset_token = db.Column(db.Unicode, nullable=True)

    permissions = db.relationship('Permission', backref='role')

    @property
    def has_password(self):
        # True when a password hash is stored for this role.
        return self.password_digest is not None

    def update(self, data):
        """Apply editable profile fields from a data dict."""
        self.name = data.get('name', self.name)
        if data.get('password'):
            self.set_password(data.get('password'))

    def clear_roles(self):
        """Removes any existing roles from group membership."""
        self.roles = []
        db.session.add(self)

    def add_role(self, role):
        """Adds an existing role as a membership of a group."""
        self.roles.append(role)
        db.session.add(role)
        db.session.add(self)

    @classmethod
    def notifiable(cls):
        # IDs of all roles that have an email address on file.
        return cls.all_ids().filter(cls.email != None)  # noqa

    @classmethod
    def by_foreign_id(cls, foreign_id):
        # Returns None when foreign_id is None or there is no match.
        if foreign_id is not None:
            return cls.all().filter_by(foreign_id=foreign_id).first()

    @classmethod
    def by_email(cls, email):
        # NOTE(review): unlike by_foreign_id/by_api_key this returns the
        # query object rather than .first() — confirm callers expect that.
        if email:
            return cls.all().filter_by(email=email)

    @classmethod
    def by_api_key(cls, api_key):
        # Look up a role by its API key; None when no key is given.
        if api_key is not None:
            return cls.all().filter_by(api_key=api_key).first()

    @classmethod
    def load_or_create(cls, foreign_id, type, name, email=None,
                       is_admin=None):
        """Fetch the role for ``foreign_id``, creating it if needed.

        Ensures an API key exists, refreshes the email, and applies
        admin status (explicitly or via the configured auto-admin list).
        """
        role = cls.by_foreign_id(foreign_id)
        if role is None:
            role = cls()
            role.foreign_id = foreign_id
            role.name = name
            role.type = type
            role.is_admin = False
        if role.api_key is None:
            role.api_key = make_textid()
        role.email = email
        if is_admin is not None:
            role.is_admin = is_admin

        # see: https://github.com/alephdata/aleph/issues/111
        auto_admins = [a.lower() for a in settings.ADMINS]
        if email is not None and email.lower() in auto_admins:
            role.is_admin = True

        db.session.add(role)
        db.session.flush()
        return role

    @classmethod
    def load_id(cls, foreign_id, type=None, name=None):
        """Load a role and return the ID.

        If type is given and no role is found, a new role will be
        created. Results are cached per application instance.
        """
        if not hasattr(current_app, '_authz_roles'):
            current_app._authz_roles = {}
        if foreign_id not in current_app._authz_roles:
            role = cls.by_foreign_id(foreign_id)
            if role is None:
                if type is None:
                    return
                name = name or foreign_id
                role = cls.load_or_create(foreign_id, type, name)
            current_app._authz_roles[foreign_id] = role.id
        return current_app._authz_roles[foreign_id]

    @classmethod
    def public_roles(cls):
        """Roles which make a collection to be considered public."""
        return set([
            cls.load_id(cls.SYSTEM_USER),
            cls.load_id(cls.SYSTEM_GUEST),
        ])

    @classmethod
    def by_prefix(cls, prefix):
        """Load a list of user roles matching a name, email address, or
        foreign_id.

        :param str prefix: Pattern to match (substring, case-insensitive).
        """
        q = cls.all()
        q = q.filter(Role.type == Role.USER)
        q = q.filter(
            or_(cls.foreign_id.ilike('%' + prefix + '%'),
                cls.email.ilike('%' + prefix + '%'),
                cls.name.ilike('%' + prefix + '%')))
        return q

    @classmethod
    def all_groups(cls):
        # Every role that is not an individual user (groups and system).
        return cls.all().filter(Role.type != Role.USER)

    def set_password(self, secret):
        """Hashes and sets the role password.

        :param str secret: The password to be set.
        """
        self.password_digest = generate_password_hash(secret)

    def check_password(self, secret):
        """Checks the password if it matches the role password hash.

        :param str secret: The password to be checked.
        :rtype: bool
        """
        # An unset digest compares against '' so the check simply fails.
        return check_password_hash(self.password_digest or '', secret)

    def __repr__(self):
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)
class Alert(db.Model, SoftDeleteModel):
    """A subscription to notifications on a given query."""
    __tablename__ = 'alert'

    id = db.Column(db.Integer, primary_key=True)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    custom_label = db.Column(db.Unicode, nullable=True)
    query_text = db.Column(db.Unicode, nullable=True)
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'), nullable=True)  # noqa
    entity = db.relationship(Entity, backref=db.backref('alerts', lazy='dynamic'))  # noqa
    notified_at = db.Column(db.DateTime, nullable=True)

    @property
    def label(self):
        """Display label: custom label, else entity name, else query text."""
        if self.custom_label is not None:
            return self.custom_label
        return self.entity.name if self.entity else self.query_text

    def delete(self, deleted_at=None):
        """Soft-delete the alert and flush immediately."""
        self.deleted_at = deleted_at or datetime.utcnow()
        db.session.add(self)
        db.session.flush()

    def update(self):
        """Record that the owner has just been notified."""
        self.notified_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()

    def is_same(self, other):
        """True when ``other`` watches the same query for the same role."""
        return (other.role_id == self.role_id
                and other.entity_id == self.entity_id
                and other.query_text == self.query_text)

    @classmethod
    def by_id(cls, id, role=None):
        """Fetch an alert by ID, optionally scoped to a role."""
        query = cls.all().filter_by(id=id)
        if role is not None:
            query = query.filter(cls.role_id == role.id)
        return query.first()

    @classmethod
    def by_role(cls, role):
        """All alerts belonging to the given role."""
        return cls.all().filter(cls.role_id == role.id)

    @classmethod
    def create(cls, data, role):
        """Validate submitted data and store a new alert for ``role``."""
        validate(data, 'alert.json#')
        alert = cls()
        alert.role_id = role.id
        text = data.get('query_text')
        if text is not None:
            text = text.strip()
        # Normalize blank queries to NULL.
        alert.query_text = text or None
        alert.entity_id = data.get('entity_id') or None
        alert.custom_label = data.get('label')
        alert.update()
        return alert

    @classmethod
    def exists(cls, query, role):
        """Return the ID of an identical existing alert, if any."""
        ids = cls.all_ids().filter(cls.role_id == role.id)
        text = query.get('q')
        if text is not None:
            text = text.strip() or None
        ids = ids.filter(cls.query_text == text)
        entities = query.getlist('entity')
        if len(entities) == 1:
            ids = ids.filter(cls.entity_id == entities[0])
        else:
            ids = ids.filter(cls.entity_id == None)  # noqa
        return ids.limit(1).scalar()

    @classmethod
    def dedupe(cls, entity_id):
        """Soft-delete duplicate alerts watching the same entity.

        For each duplicate pair, the alert with the lower ID is removed.
        """
        alerts = cls.all().filter_by(entity_id=entity_id).all()
        for candidate in alerts:
            for reference in alerts:
                if candidate.id >= reference.id:
                    continue
                if candidate.is_same(reference):
                    candidate.delete()

    def __repr__(self):
        return '<Alert(%r, %r)>' % (self.id, self.label)

    def to_query(self):
        """Reconstruct the search query this alert watches."""
        return MultiDict({
            'q': self.query_text or '',
            'entity': self.entity_id
        })

    def to_dict(self):
        """Serialize for the API."""
        return {
            'id': self.id,
            'label': self.label,
            'role_id': self.role_id,
            'query_text': self.query_text,
            'entity_id': self.entity_id,
            'created_at': self.created_at,
            'notified_at': self.notified_at,
            'updated_at': self.updated_at
        }
class List(db.Model):
    """A named, optionally public list of entities shared between users."""

    id = db.Column(db.Integer(), primary_key=True)
    label = db.Column(db.Unicode)
    public = db.Column(db.Boolean, default=False)
    creator_id = db.Column(db.Integer(), db.ForeignKey('user.id'),
                           nullable=True)
    creator = db.relationship(User)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)
    users = db.relationship(User, secondary=list_user_table,
                            backref='lists')

    def to_dict(self):
        """Serialize for the API."""
        return {
            'id': self.id,
            'api_url': url_for('lists.view', id=self.id),
            'entities_api_url': url_for('entities.index', list=self.id),
            'label': self.label,
            'public': self.public,
            'creator_id': self.creator_id,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        }

    @classmethod
    def create(cls, data, user):
        """Make a new list owned by ``user`` from submitted data."""
        instance = cls()
        instance.update(data, user)
        instance.creator = user
        db.session.add(instance)
        return instance

    def update(self, data, user):
        """Apply form data; ``user`` is always kept as a member."""
        data = ListForm().deserialize(data)
        self.label = data.get('label')
        if data.get('public') is not None:
            self.public = data.get('public')
        members = set(data.get('users', []))
        if user is not None:
            members.add(user)
        self.users = list(members)

    def delete(self):
        # NOTE: deleting contained entities is commented out — removing a
        # list currently leaves its entities in place.
        # for entity in self.entities:
        #     entity.delete()
        db.session.delete(self)

    @classmethod
    def by_label(cls, label):
        """First list with the given label, or None."""
        return db.session.query(cls).filter_by(label=label).first()

    @classmethod
    def by_id(cls, id):
        """List with the given primary key, or None."""
        return db.session.query(cls).filter_by(id=id).first()

    @classmethod
    def user_list_ids(cls, user=None, include_public=True):
        """IDs of lists visible to ``user``; admins see everything."""
        logged_in = user is not None and user.is_authenticated()
        query = db.session.query(cls.id)
        conditions = []
        if include_public:
            conditions.append(cls.public == True)  # noqa
        if logged_in:
            conditions.append(cls.users.any(User.id == user.id))
        if not conditions:
            return []
        # Admins skip the visibility filter entirely.
        if not (logged_in and user.is_admin):
            query = query.filter(or_(*conditions))
        return [row.id for row in query.all()]

    @classmethod
    def all_by_user(cls, user):
        """All lists visible to the given user, newest first."""
        query = db.session.query(cls)
        query = query.filter(cls.id.in_(cls.user_list_ids(user)))
        query = query.order_by(cls.id.desc())
        return query

    @property
    def terms(self):
        """Distinct normalized selector strings across the list's entities."""
        from aleph.model.entity import Entity
        from aleph.model.selector import Selector
        query = db.session.query(Selector.normalized)
        query = query.join(Entity, Entity.id == Selector.entity_id)
        query = query.filter(Entity.list_id == self.id)
        query = query.distinct()
        return set([row[0] for row in query])

    def __repr__(self):
        return '<List(%r, %r)>' % (self.id, self.label)

    def __unicode__(self):
        return self.label
class Mapping(db.Model, DatedModel):
    """A mapping to load entities from a table"""

    __tablename__ = "mapping"

    # Outcome labels for the most recent run.
    FAILED = "failed"
    SUCCESS = "success"
    PENDING = "pending"
    STATUS = {
        SUCCESS: lazy_gettext("success"),
        FAILED: lazy_gettext("failed"),
        PENDING: lazy_gettext("pending"),
    }

    id = db.Column(db.Integer, primary_key=True)
    query = db.Column("query", JSONB)
    role_id = db.Column(db.Integer, db.ForeignKey("role.id"), index=True)
    role = db.relationship(Role, backref=db.backref("mappings", lazy="dynamic"))  # noqa
    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"),
                              index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref("mappings",
                                                    lazy="dynamic"))
    table_id = db.Column(db.String(ENTITY_ID_LEN), index=True)
    disabled = db.Column(db.Boolean, nullable=True)
    last_run_status = db.Column(db.Unicode, nullable=True)
    last_run_err_msg = db.Column(db.Unicode, nullable=True)

    def get_proxy_context(self):
        """Metadata to be added to each generated entity."""
        return {
            "created_at": iso_text(self.created_at),
            "updated_at": iso_text(self.updated_at),
            "role_id": self.role_id,
            "mutable": True,
        }

    def update(self, query=None, table_id=None):
        """Store new query/table settings and bump the timestamp."""
        self.updated_at = datetime.utcnow()
        if query:
            self.query = query
        if table_id:
            self.table_id = table_id
        db.session.add(self)

    def set_status(self, status, error=None):
        """Record the outcome (and error, if any) of the latest run."""
        self.last_run_status = status
        self.last_run_err_msg = error
        db.session.add(self)

    def to_dict(self):
        """Serialize for the API; status is emitted as a translated label."""
        payload = self.to_dict_dates()
        payload.update({
            "id": stringify(self.id),
            "query": dict(self.query),
            "role_id": stringify(self.role_id),
            "collection_id": stringify(self.collection_id),
            "table_id": self.table_id,
            "last_run_status": self.STATUS.get(self.last_run_status),
            "last_run_err_msg": self.last_run_err_msg,
        })
        return payload

    @classmethod
    def by_collection(cls, collection_id, table_id=None):
        """Mappings in a collection, optionally limited to one table."""
        query = cls.all().filter(cls.collection_id == collection_id)
        if table_id is not None:
            query = query.filter(cls.table_id == table_id)
        return query

    @classmethod
    def delete_by_collection(cls, collection_id):
        """Hard-delete every mapping in the given collection."""
        query = db.session.query(cls)
        query = query.filter(cls.collection_id == collection_id)
        query.delete(synchronize_session=False)

    @classmethod
    def delete_by_table(cls, entity_id):
        """Hard-delete every mapping reading from the given table entity."""
        query = db.session.query(cls)
        query = query.filter(cls.table_id == entity_id)
        query.delete(synchronize_session=False)

    @classmethod
    def create(cls, query, table_id, collection, role_id):
        """Create and stage a mapping for a table within a collection."""
        mapping = cls()
        mapping.role_id = role_id
        mapping.query = query
        mapping.collection_id = collection.id
        mapping.table_id = table_id
        mapping.update()
        return mapping

    def __repr__(self):
        return "<Mapping(%r, %r)>" % (self.id, self.table_id)
class Document(db.Model, DatedModel):
    """A source document with JSON metadata and extracted content."""

    _schema = 'document.json#'

    TYPE_TEXT = 'text'
    TYPE_TABULAR = 'tabular'
    TYPE_OTHER = 'other'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=False, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    type = db.Column(db.Unicode(10), nullable=False, index=True)
    _meta = db.Column('meta', JSONB)

    collections = db.relationship(
        Collection,
        secondary=collection_document_table,  # noqa
        backref=db.backref('documents', lazy='dynamic'))  # noqa
    source_collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), nullable=True)  # noqa
    source_collection = db.relationship(Collection)

    @property
    def title(self):
        return self.meta.title

    @hybrid_property
    def meta(self):
        # Inject the canonical content hash / foreign ID into the stored
        # metadata dict before wrapping it in a Metadata object.
        self._meta = self._meta or {}
        self._meta['content_hash'] = self.content_hash
        self._meta['foreign_id'] = self.foreign_id
        return Metadata.from_data(self._meta or {})

    @meta.setter
    def meta(self, meta):
        if isinstance(meta, Metadata):
            self.content_hash = meta.content_hash
            self.foreign_id = meta.foreign_id
            meta = meta.to_attr_dict()
        self._meta = meta
        # In-place JSONB mutations aren't tracked; mark the column dirty.
        flag_modified(self, '_meta')

    def update(self, data, writeable):
        """Validate and apply metadata, then sync collection membership.

        ``writeable`` is the set of collection IDs the caller may modify.
        """
        validate(data, self._schema)
        collection_id = data.pop('collection_id', [])
        self.update_collections(collection_id, writeable)
        meta = self.meta
        meta.update(data, safe=True)
        self.meta = meta
        db.session.add(self)

    def update_collections(self, collection_id, writeable):
        """Make membership match ``collection_id``, touching only writeable
        collections; the source collection is never removed."""
        for coll in self.collections:
            if coll.id == self.source_collection_id:
                continue
            if coll.id not in collection_id and coll.id in writeable:
                self.collections.remove(coll)
        for coll_id in collection_id:
            if coll_id in writeable:
                coll = Collection.by_id(coll_id)
                if coll not in self.collections:
                    self.collections.append(coll)
        db.session.add(self)

    def delete_pages(self):
        """Bulk-delete this document's pages."""
        pq = db.session.query(DocumentPage)
        pq = pq.filter(DocumentPage.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_records(self):
        """Bulk-delete this document's tabular records."""
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_references(self, origin=None):
        """Bulk-delete references, optionally only those from ``origin``."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.document_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Remove the document and all dependent rows."""
        self.delete_references()
        self.delete_records()
        self.delete_pages()
        db.session.delete(self)

    def insert_records(self, sheet, iterable, chunk_size=1000):
        """Bulk-insert row records for ``sheet`` in batches of
        ``chunk_size``."""
        chunk = []
        for i, data in enumerate(iterable):
            chunk.append({
                'document_id': self.id,
                'row_id': i,
                'sheet': sheet,
                'data': data
            })
            if len(chunk) >= chunk_size:
                db.session.bulk_insert_mappings(DocumentRecord, chunk)
                chunk = []
        if len(chunk):
            db.session.bulk_insert_mappings(DocumentRecord, chunk)

    def text_parts(self):
        """Yield all extracted text fragments (pages or records)."""
        if self.type == self.TYPE_TEXT:
            for page in self.pages:
                for text in page.text_parts():
                    yield text
        elif self.type == self.TYPE_TABULAR:
            for record in self.records:
                for text in record.text_parts():
                    yield text

    @classmethod
    def get_max_id(cls):
        """The largest document ID currently stored."""
        q = db.session.query(func.max(cls.id))
        return q.scalar()

    def __repr__(self):
        return '<Document(%r,%r,%r)>' % (self.id, self.type, self.meta.title)

    @property
    def collection_ids(self):
        """IDs of all collections, including the source collection."""
        collection_ids = [c.id for c in self.collections]
        if self.source_collection_id not in collection_ids:
            if self.source_collection_id is not None:
                collection_ids.append(self.source_collection_id)
        return collection_ids

    def _add_to_dict(self, data):
        # Augment a metadata dict with identifiers and collection info.
        collection_ids = self.collection_ids
        try:
            from aleph.authz import collections_public
            data['public'] = collections_public(collection_ids)
        except Exception:
            # FIX: was a bare `except:`, which also swallows SystemExit and
            # KeyboardInterrupt. Best-effort: leave 'public' unset on error.
            pass
        data.update({
            'id': self.id,
            'type': self.type,
            'source_collection_id': self.source_collection_id,
            'collection_id': collection_ids,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        })
        return data

    def to_dict(self):
        """Serialize for the API."""
        data = self.meta.to_dict()
        return self._add_to_dict(data)

    def to_index_dict(self):
        """Serialize for the search index."""
        data = self.meta.to_index_dict()
        return self._add_to_dict(data)
class Entity(db.Model, UuidModel, SoftDeleteModel, SchemaModel):
    """A JSON-schema-backed entity subject to merging, soft deletion and
    cross-referencing against documents."""

    _schema = '/entity/entity.json#'
    # Recurse into nested schema objects/arrays when merging.
    _schema_recurse = True

    STATE_ACTIVE = 'active'
    STATE_PENDING = 'pending'
    STATE_DELETED = 'deleted'

    name = db.Column(db.Unicode)
    type = db.Column('type', db.String(255), index=True)
    state = db.Column(db.String(128), nullable=True, default=STATE_ACTIVE)
    summary = db.Column(db.Unicode, nullable=True)
    description = db.Column(db.Unicode, nullable=True)
    jurisdiction_code = db.Column(db.Unicode, nullable=True)
    register_name = db.Column(db.Unicode, nullable=True)
    register_url = db.Column(db.Unicode, nullable=True)

    # Polymorphic mapping: `type` stores the concrete schema identity.
    __mapper_args__ = {'polymorphic_on': type,
                       'polymorphic_identity': _schema}

    collections = db.relationship(
        Collection,
        secondary=collection_entity_table,  # noqa
        backref=db.backref('entities', lazy='dynamic'))  # noqa

    def delete_references(self, origin=None):
        """Bulk-delete references, optionally only those from ``origin``."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.entity_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Soft-delete the entity and its alerts; drop its references."""
        self.delete_references()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        self.state = self.STATE_DELETED
        super(Entity, self).delete(deleted_at=deleted_at)

    def update(self, data, merge=False):
        """Apply schema-driven data to this entity."""
        self.schema_update(data, merge=merge)

    def merge(self, other):
        """Fold ``other`` into this entity; ``other`` is deleted at the end
        (via schema_merge)."""
        if self.id == other.id:
            return

        # De-dupe todo:
        # 1. merge identifiers
        # 2. merge properties
        # 3. merge names, make merged names into a.k.a's
        # 4. merge collections
        # 5. update references
        # 6. update alerts
        # 7. delete source entities
        # 8. update source entities
        # 9. update target entity

        # Union of both entities' collections.
        collections = list(self.collections)
        for collection in other.collections:
            if collection not in collections:
                self.collections.append(collection)

        # Keep the other name as an alias when it differs.
        if self.name.lower() != other.name.lower():
            aka = EntityOtherName()
            aka.update({'name': other.name})
            aka.entity = self
            db.session.add(aka)

        # Re-point alerts and references from the other entity to this one.
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({'entity_id': self.id})

        from aleph.model.reference import Reference
        q = db.session.query(Reference).filter(Reference.entity_id == other.id)
        q.update({'entity_id': self.id})

        db.session.commit()
        db.session.refresh(other)
        self.schema_merge(other)

    def schema_merge(self, other):
        """Attempt to merge other onto self via JSON schema."""
        # TODO: figure out if we want to change schema
        for prop in self.schema_visitor.properties:
            if prop.name == 'id':
                continue
            self_value = getattr(self, prop.name) if \
                hasattr(self, prop.name) else None
            other_value = getattr(other, prop.name) if \
                hasattr(other, prop.name) else None
            if self_value is None and other_value is None:
                continue
            if prop.is_value and self_value is None:
                # update local properties
                setattr(self, prop.name, other_value)
            elif prop.is_object and self._schema_recurse:
                # update associated objects which are not set on the
                # existing object.
                rel = self._get_relationship(prop.name, 'MANYTOONE')
                if self_value is not None or other_value is None:
                    continue
                data = other_value.to_dict()
                obj = type(other_value)()
                obj.update(data)
                for local, remote in self._get_associations(obj, rel):
                    other_id = getattr(obj, remote)
                    setattr(self, local, other_id)
            elif prop.is_array and self._schema_recurse \
                    and other_value is not None:
                # merge array associations
                rel = self._get_relationship(prop.name, 'ONETOMANY')
                full_list = list(self_value)
                for new_item in other_value:
                    data = new_item.to_dict()
                    # Skip items already present by merge comparison.
                    existing = [o for o in full_list
                                if o.merge_compare(data)]
                    if len(existing):
                        continue
                    obj = type(new_item)()
                    obj.update(data)
                    for local, remote in self._get_associations(obj, rel):
                        setattr(obj, remote, getattr(self, local))
                    db.session.add(obj)
                    full_list.append(obj)
        # Keep the earliest creation time; the other entity goes away.
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()
        other.delete()
        db.session.flush()

    @classmethod
    def save(cls, data, collections, merge=False):
        """Create or update an entity, resolving it by ID or identifier.

        Raises AttributeError when no collection ends up associated.
        """
        ent = cls.by_id(data.get('id'))
        if 'state' not in data:
            data['state'] = cls.STATE_ACTIVE

        # Fall back to identifier-based lookup when no ID match was found.
        for identifier in data.get('identifiers', []):
            if ent is None:
                ent = cls.by_identifier(identifier.get('scheme'),
                                        identifier.get('identifier'),
                                        collections=collections)

        if ent is None:
            # Instantiate the schema-specific subclass for new entities.
            schema = data.get('$schema', cls._schema)
            cls = cls.get_schema_class(schema)
            ent = cls()
            ent.id = make_textid()

        if merge:
            # Retain collections the entity already belongs to.
            for collection in ent.collections:
                if collection.id not in [c.id for c in collections]:
                    collections.append(collection)

        if not len(collections):
            raise AttributeError("No collection specified.")

        ent.collections = collections
        ent.update(data, merge=merge)
        return ent

    @classmethod
    def filter_collections(cls, q, collections=None):
        """Constrain ``q`` to entities in the given collections (IDs or
        Collection instances)."""
        if collections is None:
            return q
        collection_ids = []
        for collection in collections:
            if isinstance(collection, Collection):
                collection = collection.id
            collection_ids.append(collection)
        coll = aliased(Collection)
        q = q.join(coll, Entity.collections)
        q = q.filter(coll.id.in_(collection_ids))
        q = q.filter(coll.deleted_at == None)  # noqa
        return q

    @classmethod
    def by_identifier(cls, scheme, identifier, collections=None):
        """First live entity carrying the given identifier."""
        q = db.session.query(Entity)
        q = q.filter(Entity.deleted_at == None)  # noqa
        q = cls.filter_collections(q, collections=collections)
        ident = aliased(EntityIdentifier)
        q = q.join(ident, Entity.identifiers)
        q = q.filter(ident.deleted_at == None)  # noqa
        q = q.filter(ident.scheme == scheme)
        q = q.filter(ident.identifier == identifier)
        return q.first()

    @classmethod
    def by_id_set(cls, ids, collections=None):
        """Dict of id -> entity for the given IDs, eagerly loading
        collections."""
        if not len(ids):
            return {}
        q = cls.all()
        q = cls.filter_collections(q, collections=collections)
        q = q.options(joinedload('collections'))
        q = q.filter(cls.id.in_(ids))
        entities = {}
        for ent in q:
            entities[ent.id] = ent
        return entities

    @classmethod
    def latest(cls):
        """Most recent update timestamp among active entities."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.state == cls.STATE_ACTIVE)
        return q.scalar()

    @classmethod
    def all_by_document(cls, document_id):
        """Distinct active entities referenced by the given document."""
        from aleph.model.reference import Reference
        q = cls.all()
        q = q.options(joinedload('collections'))
        q = q.filter(cls.state == cls.STATE_ACTIVE)
        q = q.join(Reference)
        q = q.filter(Reference.document_id == document_id)
        return q.distinct()

    @property
    def fingerprint(self):
        # Normalized fingerprint of the entity name.
        return make_fingerprint(self.name)

    @property
    def terms(self):
        """Non-empty search terms: the name plus all other-name terms."""
        terms = set([self.name])
        for other_name in self.other_names:
            terms.update(other_name.terms)
        return [t for t in terms if t is not None and len(t)]

    @property
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, and entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = [' %s ' % normalize_strong(t) for t in self.terms]
        regex_terms = set()
        for term in terms:
            # Skip terms that are too short or absurdly long to match on.
            if len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term.strip())
        return regex_terms

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)

    def __unicode__(self):
        return self.name

    def to_dict(self):
        """Serialize, adding the IDs of all containing collections."""
        data = super(Entity, self).to_dict()
        data['collection_id'] = [c.id for c in self.collections]
        return data

    def to_ref(self):
        """Compact reference representation for embedding elsewhere."""
        return {
            'id': self.id,
            'name': self.name,
            '$schema': self.type,
            'collection_id': [c.id for c in self.collections]
        }
class User(db.Model):
    """A user account.

    Combines flask-user style password authentication with OAuth
    identities (Twitter, Facebook) and a generated API key.
    """
    id = db.Column(db.Integer, primary_key=True)
    email = db.Column(
        db.Unicode,
        # following attributes are for flask-user
        nullable=False,
        unique=True)
    display_name = db.Column(db.Unicode, nullable=True)
    active = db.Column(db.Boolean, nullable=False, default=True)

    # Aleph-specific columns
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    twitter_id = db.Column(db.Unicode)
    facebook_id = db.Column(db.Unicode)
    api_key = db.Column(db.Unicode, default=make_token)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    # Columns required for flask-user
    confirmed_at = db.Column(db.DateTime())
    password = db.Column(db.String(255), nullable=False, server_default='')
    reset_password_token = db.Column(db.String(100), nullable=False,
                                     server_default='')
    # 'active' already defined above
    # omitting first and last name

    # Relationships
    roles = db.relationship('Role', secondary='roles_users',
                            backref=db.backref('user', lazy='dynamic'))

    def is_active(self):
        # Flask-Login style predicate; deactivated accounts cannot log in.
        return self.active

    def is_authenticated(self):
        return True

    def is_anonymous(self):
        return False

    def get_id(self):
        # NOTE: `unicode` builtin implies this module targets Python 2.
        return unicode(self.id)

    def __repr__(self):
        return '<User(%r,%r)>' % (self.id, self.email)

    def __unicode__(self):
        return self.display_name

    def to_dict(self):
        """Return a JSON-serializable summary of this account."""
        return {
            'id': self.id,
            'api_url': url_for('users.view', id=self.id),
            'email': self.email,
            'display_name': self.display_name
        }

    def update(self, data):
        """Apply form-validated profile data to the account."""
        data = UserForm().deserialize(data)
        self.display_name = data.get('display_name')
        self.email = data.get('email')

    @classmethod
    def load(cls, data):
        """Find or create a user from an OAuth profile dict.

        Looks up an existing account by Twitter or Facebook ID and
        creates a fresh one when no match is found. Display name and
        email are only filled in when not already set.
        """
        user = None
        if 'twitter_id' in data:
            user = cls.by_twitter_id(data.get('twitter_id'))
        elif 'facebook_id' in data:
            user = cls.by_facebook_id(data.get('facebook_id'))
        if user is None:
            user = cls()
        # NOTE(review): both provider IDs are written unconditionally, so
        # a login via one provider clears a previously-linked other
        # provider when absent from `data` -- confirm this is intended.
        user.twitter_id = data.get('twitter_id')
        user.facebook_id = data.get('facebook_id')
        if not user.display_name:
            user.display_name = data.get('display_name')
        if not user.email:
            user.email = data.get('email')
        db.session.add(user)
        return user

    def check_pw(self, pw):
        # NOTE(review): plain `==` on the HMAC digest; a constant-time
        # comparison would be preferable -- confirm the threat model.
        return self.password == get_hmac(pw)

    @classmethod
    def all(cls):
        # Only active accounts are returned.
        q = db.session.query(cls).filter_by(active=True)
        return q

    @classmethod
    def by_id(cls, id):
        q = db.session.query(cls).filter_by(id=int(id))
        return q.first()

    @classmethod
    def by_api_key(cls, api_key):
        q = db.session.query(cls).filter_by(api_key=api_key)
        return q.first()

    @classmethod
    def by_twitter_id(cls, twitter_id):
        q = db.session.query(cls).filter_by(twitter_id=str(twitter_id))
        return q.first()

    @classmethod
    def by_facebook_id(cls, facebook_id):
        q = db.session.query(cls).filter_by(facebook_id=str(facebook_id))
        return q.first()

    @classmethod
    def by_email(cls, email):
        q = db.session.query(cls).filter_by(email=email)
        return q.first()

    @classmethod
    def create_by_email(cls, email, pw):
        """Create and commit a password-based account."""
        src = cls(email=email, password=get_hmac(pw))
        db.session.add(src)
        db.session.commit()
        return src
class Document(db.Model, DatedModel, Metadata):
    """An ingested document inside a collection.

    The raw metadata lives in the JSONB ``meta`` column (surfaced via the
    ``Metadata`` mixin); ``to_proxy`` converts the row into a
    followthemoney entity proxy for indexing.
    """
    # Cap on how many extracted tags are copied onto the proxy.
    MAX_TAGS = 10000

    SCHEMA = 'Document'
    SCHEMA_FOLDER = 'Folder'
    SCHEMA_PACKAGE = 'Package'
    SCHEMA_WORKBOOK = 'Workbook'
    SCHEMA_TEXT = 'PlainText'
    SCHEMA_HTML = 'HyperText'
    SCHEMA_PDF = 'Pages'
    SCHEMA_IMAGE = 'Image'
    SCHEMA_AUDIO = 'Audio'
    SCHEMA_VIDEO = 'Video'
    SCHEMA_TABLE = 'Table'
    SCHEMA_EMAIL = 'Email'

    # Ingest outcome states.
    STATUS_PENDING = 'pending'
    STATUS_SUCCESS = 'success'
    STATUS_FAIL = 'fail'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=True, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True,
                           index=True)
    schema = db.Column(db.String(255), nullable=False)
    status = db.Column(db.Unicode(10), nullable=True)
    meta = db.Column(JSONB, default={})
    error_message = db.Column(db.Unicode(), nullable=True)
    body_text = db.Column(db.Unicode(), nullable=True)
    body_raw = db.Column(db.Unicode(), nullable=True)
    uploader_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)  # noqa
    parent_id = db.Column(db.BigInteger, db.ForeignKey('document.id'), nullable=True, index=True)  # noqa
    children = db.relationship('Document', lazy='dynamic', backref=db.backref('parent', uselist=False, remote_side=[id]))  # noqa
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), nullable=False, index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('documents', lazy='dynamic'))  # noqa

    def __init__(self, **kw):
        # Ensure each instance gets its own meta dict before column
        # defaults are applied.
        self.meta = {}
        super(Document, self).__init__(**kw)

    @property
    def model(self):
        # followthemoney schema object for this document's schema name.
        return model.get(self.schema)

    @property
    def name(self):
        """Best available display name; None when nothing is set."""
        if self.title is not None:
            return self.title
        if self.file_name is not None:
            return self.file_name
        if self.source_url is not None:
            return self.source_url

    @property
    def supports_records(self):
        # Slightly unintuitive naming: this just checks the document type,
        # not if there actually are any records.
        return self.schema in [self.SCHEMA_PDF, self.SCHEMA_TABLE]

    @property
    def supports_pages(self):
        return self.schema == self.SCHEMA_PDF

    @property
    def supports_nlp(self):
        # Structural (container/media) types carry no extractable text.
        structural = [
            Document.SCHEMA,
            Document.SCHEMA_PACKAGE,
            Document.SCHEMA_FOLDER,
            Document.SCHEMA_WORKBOOK,
            Document.SCHEMA_VIDEO,
            Document.SCHEMA_AUDIO,
        ]
        return self.schema not in structural

    @property
    def ancestors(self):
        """IDs of all parent documents, computed recursively and cached."""
        if self.parent_id is None:
            return []
        key = cache.key('ancestors', self.id)
        ancestors = cache.get_list(key)
        if ancestors is not None:
            return ancestors
        ancestors = self.parent.ancestors
        ancestors.append(self.parent_id)
        cache.set_list(key, ancestors)
        return ancestors

    def update(self, data):
        """Copy known metadata properties from `data`, keeping existing
        values where `data` has no entry for a property."""
        props = ('title', 'summary', 'author', 'crawler', 'source_url',
                 'file_name', 'mime_type', 'headers', 'date', 'authored_at',
                 'modified_at', 'published_at', 'retrieved_at', 'languages',
                 'countries', 'keywords')
        for prop in props:
            value = data.get(prop, self.meta.get(prop))
            setattr(self, prop, value)
        db.session.add(self)

    def update_meta(self):
        # Tell SQLAlchemy the (mutable) JSONB column changed in place.
        flag_modified(self, 'meta')

    def delete_records(self):
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete()
        db.session.flush()

    def delete_tags(self):
        pq = db.session.query(DocumentTag)
        pq = pq.filter(DocumentTag.document_id == self.id)
        pq.delete()
        db.session.flush()

    def delete(self, deleted_at=None):
        """Hard-delete the document and its dependent records/tags.

        `deleted_at` is accepted for interface parity but unused: this
        model is not soft-deleted.
        """
        self.delete_records()
        self.delete_tags()
        db.session.delete(self)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Bulk-delete all documents of a collection, dependents first."""
        documents = db.session.query(cls.id)
        documents = documents.filter(cls.collection_id == collection_id)
        documents = documents.subquery()

        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id.in_(documents))
        pq.delete(synchronize_session=False)

        pq = db.session.query(DocumentTag)
        pq = pq.filter(DocumentTag.document_id.in_(documents))
        pq.delete(synchronize_session=False)

        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    def raw_texts(self):
        """Yield all raw text snippets; body/records only on success."""
        yield self.title
        yield self.file_name
        yield self.source_url
        yield self.summary
        yield self.author
        if self.status != self.STATUS_SUCCESS:
            return
        yield self.body_text
        if self.supports_records:
            # iterate over all the associated records.
            pq = db.session.query(DocumentRecord)
            pq = pq.filter(DocumentRecord.document_id == self.id)
            pq = pq.order_by(DocumentRecord.index.asc())
            for record in pq.yield_per(10000):
                yield from record.raw_texts()

    @property
    def texts(self):
        yield from filter_texts(self.raw_texts())

    @classmethod
    def by_keys(cls, parent_id=None, collection_id=None, foreign_id=None,
                content_hash=None):
        """Try and find a document by various criteria."""
        q = cls.all()
        q = q.filter(Document.collection_id == collection_id)

        if parent_id is not None:
            q = q.filter(Document.parent_id == parent_id)

        if foreign_id is not None:
            q = q.filter(Document.foreign_id == foreign_id)
        elif content_hash is not None:
            q = q.filter(Document.content_hash == content_hash)
        else:
            raise ValueError("No unique criterion for document.")

        document = q.first()
        if document is None:
            document = cls()
            document.schema = cls.SCHEMA
            document.collection_id = collection_id
        if parent_id is not None:
            document.parent_id = parent_id
        if foreign_id is not None:
            document.foreign_id = foreign_id
        if content_hash is not None:
            document.content_hash = content_hash
        db.session.add(document)
        return document

    @classmethod
    def by_id(cls, id, collection_id=None):
        if id is None:
            return
        q = cls.all()
        q = q.filter(cls.id == id)
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        return q.first()

    @classmethod
    def by_collection(cls, collection_id=None):
        q = cls.all()
        q = q.filter(cls.collection_id == collection_id)
        return q

    @classmethod
    def find_ids(cls, collection_id=None, failed_only=False):
        """ID-only query, optionally restricted to unprocessed documents."""
        q = cls.all_ids()
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        if failed_only:
            q = q.filter(cls.status != cls.STATUS_SUCCESS)
        q = q.order_by(cls.id.asc())
        return q

    def to_proxy(self):
        """Build a followthemoney entity proxy from the row + metadata."""
        meta = dict(self.meta)
        headers = meta.pop('headers', {})
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': meta
        })
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('fileSize', meta.get('file_size'))
        proxy.set('fileName', meta.get('file_name'))
        if not proxy.has('fileName'):
            # Fall back to the HTTP Content-Disposition filename.
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('messageId', meta.get('message_id'), quiet=True)
        proxy.set('inReplyTo', meta.get('in_reply_to'), quiet=True)
        proxy.set('bodyText', self.body_text, quiet=True)
        proxy.set('bodyHtml', self.body_raw, quiet=True)
        columns = meta.get('columns')
        proxy.set('columns', registry.json.pack(columns), quiet=True)
        proxy.set('headers', registry.json.pack(headers), quiet=True)

        pdf = 'application/pdf'
        if meta.get('extension') == 'pdf' or proxy.first('mimeType') == pdf:
            proxy.set('pdfHash', self.content_hash, quiet=True)
        proxy.add('pdfHash', meta.get('pdf_version'), quiet=True)

        # Copy the highest-weighted extracted tags onto mapped properties.
        q = db.session.query(DocumentTag)
        q = q.filter(DocumentTag.document_id == self.id)
        q = q.filter(DocumentTag.type.in_(DocumentTag.MAPPING.keys()))
        q = q.order_by(DocumentTag.weight.desc())
        q = q.limit(Document.MAX_TAGS)
        for tag in q.all():
            prop = DocumentTag.MAPPING.get(tag.type)
            if prop is not None:
                proxy.add(prop, tag.text)
        return proxy

    def to_dict(self):
        """API serialization: proxy data plus row-level bookkeeping."""
        proxy = self.to_proxy()
        data = proxy.to_full_dict()
        data.update(self.to_dict_dates())
        data.update({
            'name': self.name,
            'status': self.status,
            'foreign_id': self.foreign_id,
            'document_id': self.id,
            'collection_id': self.collection_id,
            'error_message': self.error_message,
            'uploader_id': self.uploader_id,
            'bulk': False,
        })
        return data

    def __repr__(self):
        return '<Document(%r,%r,%r)>' % (self.id, self.schema, self.title)
class Export(db.Model, IdModel, DatedModel):
    """A data export run in the background.

    The data is stored in a cloud storage bucket and the user is given
    a link to download the data. The link expires after a fixed
    duration and the exported data is deleted.
    """
    MAX_FILE_SIZE = 10 * 1024 * 1024 * 1024  # 10 GB

    STATUS_PENDING = "pending"
    STATUS_SUCCESSFUL = "successful"
    STATUS_FAILED = "failed"
    # Translatable labels for each status value.
    EXPORT_STATUS = {
        STATUS_PENDING: lazy_gettext("pending"),
        STATUS_SUCCESSFUL: lazy_gettext("successful"),
        STATUS_FAILED: lazy_gettext("failed"),
    }
    DEFAULT_STATUS = STATUS_PENDING
    DEFAULT_EXPIRATION = timedelta(days=30)  # After 30 days

    label = db.Column(db.Unicode)
    operation = db.Column(db.Unicode)
    creator_id = db.Column(db.Integer, db.ForeignKey("role.id"))
    creator = db.relationship(Role,
                              backref=db.backref("exports", lazy="dynamic"))
    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"),
                              index=True, nullable=True)
    collection = db.relationship(Collection,
                                 backref=db.backref("exports", lazy="dynamic"))
    expires_at = db.Column(db.DateTime, default=None, nullable=True)
    deleted = db.Column(db.Boolean, default=False)
    export_status = db.Column(db.Unicode, default=DEFAULT_STATUS)
    content_hash = db.Column(db.Unicode(65), index=True, nullable=True)
    file_size = db.Column(db.BigInteger, nullable=True)  # In bytes
    file_name = db.Column(db.Unicode, nullable=True)
    mime_type = db.Column(db.Unicode)
    meta = db.Column(JSONB, default={})

    def to_dict(self):
        """Serialize the export for the API; dates from DatedModel.

        Note: a previous version first assigned the localized
        EXPORT_STATUS label, but it was unconditionally overwritten by
        the dict below — that dead branch has been removed; the raw
        status value is what callers actually received.
        """
        data = self.to_dict_dates()
        data.update({
            "id": stringify(self.id),
            "label": self.label,
            "operation": self.operation,
            "creator_id": stringify(self.creator_id),
            "collection_id": self.collection_id,
            "expires_at": self.expires_at,
            "deleted": self.deleted,
            "export_status": self.export_status,
            "content_hash": self.content_hash,
            "file_size": self.file_size,
            "file_name": self.file_name,
            "meta": self.meta,
        })
        return data

    @classmethod
    def create(cls, operation, role_id, label, file_path=None,
               expires_after=None, collection=None, mime_type=None):
        """Create (but not commit) a new export stub.

        :param file_path: optional local file to stage immediately.
        :param expires_after: timedelta; defaults to DEFAULT_EXPIRATION.
        """
        export = cls()
        export.creator_id = role_id
        export.operation = operation
        export.label = label
        if file_path is not None:
            export.set_filepath(file_path)
        if collection is not None:
            export.collection_id = collection.id
        export.mime_type = mime_type
        expiration = expires_after or cls.DEFAULT_EXPIRATION
        export.expires_at = datetime.utcnow() + expiration
        db.session.add(export)
        return export

    @property
    def namespace(self):
        # Archive namespace under which exports for this role are published.
        return make_key("role", self.creator_id)

    def publish(self):
        """Move the staged file into the archive; record success/failure.

        Raises RuntimeError when no file was staged via set_filepath().
        """
        if not self._file_path:
            # BUG FIX: the message was previously passed logging-style
            # ("…%r", self), so the placeholder was never interpolated.
            raise RuntimeError("file path not present for export: %r" % self)
        # Use contenthash as filename to make to ensure uniqueness
        path = Path(self._file_path.parent, self.content_hash)
        self._file_path.rename(path)
        try:
            archive.publish(self.namespace, path, self.mime_type)
            self.set_status(status=Export.STATUS_SUCCESSFUL)
        except Exception as ex:
            self.set_status(status=Export.STATUS_FAILED)
            raise ex

    def set_filepath(self, file_path):
        """Stage a local file: record its name, size and content hash."""
        file_path = ensure_path(file_path)
        file_name = safe_filename(file_path)
        file_size = file_path.stat().st_size
        self.file_name = file_name
        self.file_size = file_size
        self._file_path = file_path
        self.content_hash = checksum(file_path)

    def set_status(self, status):
        # Unknown status values are silently ignored.
        if status in self.EXPORT_STATUS:
            self.export_status = status
            db.session.add(self)

    def delete_publication(self):
        """Mark deleted; remove the archived file only if unreferenced."""
        if self._should_delete_publication():
            archive.delete_publication(self.namespace, self.content_hash)
        self.deleted = True
        db.session.add(self)

    def _should_delete_publication(self):
        """Check whether the published export should be deleted from the
        archive. Since we store exports by contenthash, there may be other
        non-expired exports that point to the same file in the archive."""
        q = (Export.all().filter(
            Export.content_hash == self.content_hash).filter(
                Export.deleted.isnot(True)).filter(Export.id != self.id))
        return q.first() is None

    @classmethod
    def get_expired(cls, deleted=False):
        """Exports past their expiry; pass deleted=None to include all."""
        now = datetime.utcnow()
        q = cls.all().filter(
            cls.expires_at.isnot(None)).filter(cls.expires_at <= now)
        if deleted is not None:
            q = q.filter(cls.deleted == deleted)
        return q

    @classmethod
    def by_id(cls, id, role_id=None, deleted=False):
        q = cls.all().filter_by(id=id)
        if role_id is not None:
            q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q.first()

    @classmethod
    def by_role_id(cls, role_id, deleted=False):
        q = cls.all()
        q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        q = q.order_by(cls.created_at.desc())
        return q

    def __repr__(self):
        return "<Export(%r, %r)>" % (self.id, self.creator_id)
class Role(db.Model, IdModel, SoftDeleteModel):
    """A user, group or other access control subject."""
    _schema = 'role.json#'
    __tablename__ = 'role'

    # Role types stored in the `role_type` DB enum.
    # NOTE(review): USER and SYSTEM_USER were redacted ('******') in the
    # source under review, apparently by a secret-scrubbing tool; the
    # values below are the upstream aleph values -- confirm against the
    # `role_type` enum in the database migrations.
    USER = 'user'
    GROUP = 'group'
    SYSTEM = 'system'
    TYPES = [USER, GROUP, SYSTEM]

    # Foreign IDs of the built-in system roles.
    SYSTEM_GUEST = 'guest'
    SYSTEM_USER = 'user'

    foreign_id = db.Column(db.Unicode(2048), nullable=False, unique=True)
    name = db.Column(db.Unicode, nullable=False)
    email = db.Column(db.Unicode, nullable=True)
    api_key = db.Column(db.Unicode, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    type = db.Column(db.Enum(*TYPES, name='role_type'), nullable=False)
    permissions = db.relationship("Permission", backref="role")

    def update(self, data):
        """Apply schema-validated updates to name and email."""
        validate(data, self._schema)
        self.name = data.get('name', self.name)
        self.email = data.get('email', self.email)

    def clear_roles(self):
        """Remove this role from all of its groups."""
        self.roles = []
        db.session.add(self)

    def add_role(self, role):
        """Add this role to the given group role."""
        self.roles.append(role)
        db.session.add(role)
        db.session.add(self)

    @classmethod
    def notifiable(cls):
        # IDs of all roles that have an email address on file.
        return cls.all_ids().filter(cls.email != None)  # noqa

    @classmethod
    def by_foreign_id(cls, foreign_id):
        if foreign_id is not None:
            return cls.all().filter_by(foreign_id=foreign_id).first()

    @classmethod
    def by_api_key(cls, api_key):
        if api_key is not None:
            return cls.all().filter_by(api_key=api_key).first()

    @classmethod
    def load_or_create(cls, foreign_id, type, name, email=None,
                       is_admin=None):
        """Fetch the role identified by `foreign_id`, creating it if
        missing; refreshes the API key, email and admin flag."""
        role = cls.by_foreign_id(foreign_id)
        if role is None:
            role = cls()
            role.foreign_id = foreign_id
            role.name = name
            role.type = type
            role.is_admin = False
        if role.api_key is None:
            role.api_key = uuid4().hex
        role.email = email
        if is_admin is not None:
            role.is_admin = is_admin

        # see: https://github.com/pudo/aleph/issues/111
        auto_admins = get_config('AUTHZ_ADMINS') or ''
        auto_admins = [a.lower() for a in auto_admins.split(',')]
        if email is not None and email.lower() in auto_admins:
            role.is_admin = True

        db.session.add(role)
        db.session.flush()
        return role

    @classmethod
    def load_id(cls, foreign_id, type=None, name=None):
        """Load a role and return the ID.

        If type is given and no role is found, a new role will
        be created.
        """
        # Per-application in-memory cache of foreign_id -> role id.
        if not hasattr(current_app, '_authz_roles'):
            current_app._authz_roles = {}
        if foreign_id not in current_app._authz_roles:
            role = cls.by_foreign_id(foreign_id)
            if role is None:
                if type is None:
                    return
                name = name or foreign_id
                role = cls.load_or_create(foreign_id, type, name)
            current_app._authz_roles[foreign_id] = role.id
        return current_app._authz_roles[foreign_id]

    def __repr__(self):
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)

    def __unicode__(self):
        return self.name

    def to_dict(self):
        data = super(Role, self).to_dict()
        data.update({
            'api_url': url_for('roles_api.view', id=self.id),
            'foreign_id': self.foreign_id,
            'is_admin': self.is_admin,
            'email': self.email,
            'name': self.name,
            'type': self.type
        })
        return data
class Export(db.Model, IdModel, DatedModel):
    """A data export run in the background.

    The data is stored in a cloud storage bucket and the user is given
    a link to download the data. The link expires after a fixed
    duration and the exported data is deleted."""
    DEFAULT_EXPIRATION = timedelta(days=30)  # After 30 days

    label = db.Column(db.Unicode)
    operation = db.Column(db.Unicode)
    creator_id = db.Column(db.Integer, db.ForeignKey("role.id"))
    creator = db.relationship(Role,
                              backref=db.backref("exports", lazy="dynamic"))
    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"),
                              index=True, nullable=True)
    collection = db.relationship(Collection,
                                 backref=db.backref("exports", lazy="dynamic"))
    expires_at = db.Column(db.DateTime, default=None, nullable=True)
    deleted = db.Column(db.Boolean, default=False)
    # Python attribute `status` maps onto the legacy DB column name
    # `export_status`.
    status = db.Column("export_status", db.Unicode, default=Status.DEFAULT)
    content_hash = db.Column(db.Unicode(65), index=True, nullable=True)
    file_size = db.Column(db.BigInteger, nullable=True)  # In bytes
    file_name = db.Column(db.Unicode, nullable=True)
    mime_type = db.Column(db.Unicode)
    meta = db.Column(JSONB, default={})

    def to_dict(self):
        """Serialize for the API; `status` is the human-readable label."""
        data = self.to_dict_dates()
        data.update({
            "id": stringify(self.id),
            "label": self.label,
            "operation": self.operation,
            "creator_id": stringify(self.creator_id),
            "collection_id": self.collection_id,
            "expires_at": self.expires_at,
            "deleted": self.deleted,
            "status": Status.LABEL.get(self.status),
            "content_hash": self.content_hash,
            "file_size": self.file_size,
            "file_name": self.file_name,
            "mime_type": self.mime_type,
            "meta": self.meta,
        })
        return data

    @classmethod
    def create(cls, operation, role_id, label, collection=None,
               mime_type=None, meta=None):
        """Create (but not commit) a new export stub with default expiry."""
        export = cls()
        export.creator_id = role_id
        export.operation = operation
        export.label = label
        if collection is not None:
            export.collection_id = collection.id
        export.mime_type = mime_type
        export.expires_at = datetime.utcnow() + cls.DEFAULT_EXPIRATION
        export.meta = meta or {}
        db.session.add(export)
        return export

    @property
    def namespace(self):
        # Archive namespace under which exports for this role live.
        return make_key("role", self.creator_id)

    def set_status(self, status):
        self.status = status
        db.session.add(self)

    def should_delete_publication(self):
        """Check whether the published export should be deleted from the
        archive. Since we store exports by contenthash, there may be other
        non-expired exports that point to the same file in the archive."""
        q = (Export.all().filter(
            Export.content_hash == self.content_hash).filter(
                Export.deleted.isnot(True)).filter(Export.id != self.id))
        return q.first() is None

    @classmethod
    def get_expired(cls, deleted=False):
        """Exports past their expiry; rows with NULL expires_at never
        match the <= comparison in SQL and are thus excluded."""
        now = datetime.utcnow()
        q = cls.all()
        q = q.filter(cls.expires_at <= now)
        if not deleted:
            q = q.filter(cls.deleted == deleted)
        return q

    @classmethod
    def get_pending(cls):
        q = cls.all()
        q = q.filter(cls.status == Status.PENDING)
        q = q.filter(cls.deleted == False)  # noqa
        return q

    @classmethod
    def by_id(cls, id, role_id=None, deleted=False):
        q = cls.all().filter_by(id=id)
        if role_id is not None:
            q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q.first()

    @classmethod
    def by_role_id(cls, role_id, deleted=False):
        q = cls.all()
        q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        q = q.order_by(cls.created_at.desc())
        return q

    @classmethod
    def by_content_hash(cls, content_hash, deleted=False):
        q = cls.all()
        q = q.filter(cls.content_hash == content_hash)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q

    def __repr__(self):
        return "<Export(%r, %r, %r)>" % (self.id, self.creator_id, self.label)
class Document(db.Model, DatedModel):
    """A crawled or uploaded document within a collection.

    Metadata lives in the JSONB ``meta`` column; ``to_proxy`` turns
    the row into a followthemoney entity proxy.
    """
    SCHEMA = 'Document'
    SCHEMA_FOLDER = 'Folder'
    SCHEMA_TABLE = 'Table'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=True, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True,
                           index=True)
    schema = db.Column(db.String(255), nullable=False)
    meta = db.Column(JSONB, default={})
    uploader_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)  # noqa
    parent_id = db.Column(db.BigInteger, db.ForeignKey('document.id'), nullable=True, index=True)  # noqa
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), nullable=False, index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('documents', lazy='dynamic'))  # noqa

    def __init__(self, **kw):
        # Ensure each instance has its own meta dict.
        self.meta = {}
        super(Document, self).__init__(**kw)

    @property
    def model(self):
        # followthemoney schema object for this document.
        return model.get(self.schema)

    @property
    def ancestors(self):
        """IDs of all parent documents; only folders cache the result."""
        if self.parent_id is None:
            return []
        key = cache.key('ancestors', self.id)
        ancestors = cache.get_list(key)
        if len(ancestors):
            return ancestors
        # Try to extend the parent's cached chain before recursing.
        parent_key = cache.key('ancestors', self.parent_id)
        ancestors = cache.get_list(parent_key)
        if not len(ancestors):
            ancestors = []
            parent = Document.by_id(self.parent_id)
            if parent is not None:
                ancestors = parent.ancestors
        ancestors.append(self.parent_id)
        if self.model.is_a(model.get(self.SCHEMA_FOLDER)):
            cache.set_list(key, ancestors, expire=cache.EXPIRE)
        return ancestors

    def update(self, data):
        """Copy the known metadata properties from `data` into `meta`,
        keeping existing values where `data` has no entry."""
        props = ('title', 'summary', 'author', 'crawler', 'source_url',
                 'file_name', 'mime_type', 'headers', 'date', 'authored_at',
                 'modified_at', 'published_at', 'retrieved_at', 'languages',
                 'countries', 'keywords')
        for prop in props:
            self.meta[prop] = data.get(prop, self.meta.get(prop))
        # JSONB mutated in place; tell SQLAlchemy about it.
        flag_modified(self, 'meta')

    def delete(self, deleted_at=None):
        # Hard delete; `deleted_at` accepted for interface parity only.
        db.session.delete(self)

    @classmethod
    def delete_by_collection(cls, collection_id):
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    @classmethod
    def save(cls, collection, parent=None, foreign_id=None,
             content_hash=None, meta=None, uploader_id=None):
        """Try and find a document by various criteria."""
        q = cls.all()
        q = q.filter(Document.collection_id == collection.id)
        if parent is not None:
            q = q.filter(Document.parent_id == parent.id)
        if foreign_id is not None:
            q = q.filter(Document.foreign_id == foreign_id)
        elif content_hash is not None:
            q = q.filter(Document.content_hash == content_hash)
        else:
            raise ValueError("No unique criterion for document.")

        document = q.first()
        if document is None:
            document = cls()
            document.schema = cls.SCHEMA
            document.collection_id = collection.id
            document.uploader_id = uploader_id
        if parent is not None:
            document.parent_id = parent.id
        if foreign_id is not None:
            document.foreign_id = foreign_id
        document.content_hash = content_hash
        if content_hash is None:
            # Documents without file content act as folders.
            document.schema = cls.SCHEMA_FOLDER
        if meta is not None:
            document.update(meta)
        db.session.add(document)
        return document

    @classmethod
    def by_id(cls, id, collection_id=None):
        try:
            id = int(id)
        except Exception:
            # Non-numeric IDs can never match a BigInteger key.
            return
        q = cls.all()
        q = q.filter(cls.id == id)
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        return q.first()

    @classmethod
    def by_collection(cls, collection_id=None):
        q = cls.all()
        q = q.filter(cls.collection_id == collection_id)
        return q

    @classmethod
    def cleanup_deleted(cls):
        """Remove documents belonging to soft-deleted collections."""
        q = db.session.query(Collection.id)
        q = q.filter(Collection.deleted_at != None)  # noqa
        collection_ids = [c for (c, ) in q.all()]
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id.in_(collection_ids))
        pq.delete(synchronize_session=False)

    def to_proxy(self):
        """Build a followthemoney entity proxy from the row metadata."""
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': {}
        })
        meta = dict(self.meta)
        headers = meta.pop('headers', {}) or {}
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('crawler', meta.get('crawler'))
        # BUG FIX: sourceUrl was set twice with the same value (once
        # here, once at the end of this method); the redundant trailing
        # call has been removed.
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('title', meta.get('title'))
        proxy.set('fileName', meta.get('file_name'))
        if not proxy.has('fileName'):
            # Fall back to the HTTP Content-Disposition filename.
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('keywords', meta.get('keywords'))
        proxy.set('headers', registry.json.pack(headers), quiet=True)
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('indexUpdatedAt', self.created_at)
        return proxy

    def __repr__(self):
        return '<Document(%r,%r)>' % (self.id, self.schema)
class Entity(db.Model, UuidModel, SoftDeleteModel):
    """A manually-curated entity (person, company, ...) in a collection.

    Entities carry a lifecycle state; pending entities are created by
    bulk loads and must be activated before use.
    """
    STATE_ACTIVE = 'active'
    STATE_PENDING = 'pending'
    STATE_DELETED = 'deleted'

    name = db.Column(db.Unicode)
    type = db.Column(db.String(255), index=True)
    state = db.Column(db.String(128), nullable=True, default=STATE_ACTIVE, index=True)  # noqa
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)

    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('entities', lazy='dynamic'))  # noqa

    def delete_references(self, origin=None):
        """Delete document references to this entity, optionally only
        those from a given origin."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.entity_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_identities(self):
        pq = db.session.query(EntityIdentity)
        pq = pq.filter(EntityIdentity.entity_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Soft-delete the entity along with its references, identities
        and alerts."""
        self.delete_references()
        self.delete_identities()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        self.state = self.STATE_DELETED
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_dangling(cls, collection_id):
        """Delete dangling entities.

        Entities can dangle in pending state while they have no
        references pointing to them, thus making it impossible to
        enable them. This is a routine cleanup function.
        """
        q = db.session.query(cls)
        q = q.filter(cls.collection_id == collection_id)
        q = q.filter(cls.state == cls.STATE_PENDING)
        q = q.outerjoin(Reference)
        q = q.group_by(cls)
        q = q.having(func.count(Reference.id) == 0)
        for entity in q.all():
            entity.delete()

    def merge(self, other):
        """Fold `other` into this entity, re-point its alerts and
        references, then delete it. Commits the session."""
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError("Cannot merge entities from different collections.")  # noqa
        data = merge_data(self.data, other.data)
        if self.name.lower() != other.name.lower():
            # Preserve the other name as an alias.
            data = merge_data(data, {'alias': [other.name]})
        self.data = data
        self.state = self.STATE_ACTIVE
        self.foreign_ids = self.foreign_ids or []
        self.foreign_ids += other.foreign_ids or []
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()

        # update alerts
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({'entity_id': self.id})

        # update document references
        from aleph.model.reference import Reference
        q = db.session.query(Reference).filter(Reference.entity_id == other.id)
        q.update({'entity_id': self.id})

        # delete source entities
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        """Apply a schema-validated update from an API payload dict.

        NOTE(review): pops 'state' off the caller's dict, mutating the
        input -- confirm callers do not reuse the payload.
        """
        data = entity.get('data') or {}
        data['name'] = entity.get('name')
        self.data = self.schema.validate(data)
        self.name = self.data.pop('name')
        fid = [string_value(f) for f in entity.get('foreign_ids') or []]
        self.foreign_ids = list(set([f for f in fid if f is not None]))
        self.state = entity.pop('state', self.STATE_ACTIVE)
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def save(cls, data, collection, merge=False):
        """Create or update an entity from an API payload."""
        ent = cls.by_id(data.get('id'))
        if ent is None:
            ent = cls()
            ent.type = data.pop('schema', None)
            if ent.type is None:
                raise ValueError("No schema provided.")
            ent.id = make_textid()
        if merge:
            data = merge_data(data, ent.to_dict())
        if collection is None:
            raise ValueError("No collection specified.")
        ent.collection = collection
        ent.update(data)
        return ent

    @classmethod
    def filter_collections(cls, q, collections=None):
        """Restrict query `q` to the given collections (objects or IDs)."""
        if collections is None:
            return q
        collection_ids = []
        for collection in collections:
            if isinstance(collection, Collection):
                collection = collection.id
            collection_ids.append(collection)
        q = q.filter(Entity.collection_id.in_(collection_ids))
        return q

    @classmethod
    def by_id_set(cls, ids, collections=None):
        """Map entity ID -> entity for the given IDs."""
        if not len(ids):
            return {}
        q = cls.all()
        q = cls.filter_collections(q, collections=collections)
        q = q.options(joinedload('collection'))
        q = q.filter(cls.id.in_(ids))
        entities = {}
        for ent in q:
            entities[ent.id] = ent
        return entities

    @classmethod
    def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
        foreign_id = string_value(foreign_id)
        if foreign_id is None:
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        # Array containment test against the foreign_ids column.
        foreign_id = func.cast([foreign_id], ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        # Prefer live rows over soft-deleted ones.
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def latest(cls):
        # Most recent update time across all active entities.
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.state == cls.STATE_ACTIVE)
        return q.scalar()

    @property
    def schema(self):
        return schemata.get(self.type)

    @property
    def terms(self):
        """The entity name plus all non-empty aliases."""
        terms = set([self.name])
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, and entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([normalize_strong(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def to_dict(self):
        data = super(Entity, self).to_dict()
        data.update({
            'schema': self.type,
            'name': self.name,
            'state': self.state,
            'data': self.data,
            'foreign_ids': self.foreign_ids or [],
            'collection_id': self.collection_id
        })
        return data

    def to_index(self):
        """Serialization destined for the search index."""
        entity = self.to_dict()
        entity['properties'] = {'name': [self.name]}
        for k, v in self.data.items():
            v = ensure_list(v)
            if len(v):
                entity['properties'][k] = v
        return entity

    def to_ref(self):
        # Minimal reference representation for embedding elsewhere.
        return {
            'id': self.id,
            'label': self.name,
            'schema': self.type,
            'collection_id': self.collection_id
        }

    def __unicode__(self):
        return self.name

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)
class Document(db.Model, DatedModel):
    """A crawled document (text, tabular or other) stored in a collection.

    Metadata lives in the JSONB ``meta`` column, with a handful of fields
    (content hash, foreign ID, crawler info) denormalised onto real columns
    for indexing; the ``meta`` hybrid property keeps both in sync.
    """

    _schema = 'document.json#'
    SCHEMA = 'Document'

    # Document types.
    TYPE_TEXT = 'text'
    TYPE_TABULAR = 'tabular'
    TYPE_OTHER = 'other'

    # Processing status values.
    STATUS_PENDING = 'pending'
    STATUS_SUCCESS = 'success'
    STATUS_FAIL = 'fail'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=False, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    type = db.Column(db.Unicode(10), nullable=False, index=True)
    status = db.Column(db.Unicode(10), nullable=True, index=True)
    _meta = db.Column('meta', JSONB)
    crawler = db.Column(db.Unicode(), index=True)
    crawler_run = db.Column(db.Unicode())
    error_type = db.Column(db.Unicode(), nullable=True)
    error_message = db.Column(db.Unicode(), nullable=True)
    error_details = db.Column(db.Unicode(), nullable=True)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), nullable=False, index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('documents', lazy='dynamic'))  # noqa

    @property
    def title(self):
        return self.meta.title

    @hybrid_property
    def meta(self):
        # Mirror the column-backed fields into the JSON blob so the
        # returned Metadata object always reflects current DB state.
        self._meta = self._meta or {}
        self._meta['content_hash'] = self.content_hash
        self._meta['foreign_id'] = self.foreign_id
        self._meta['crawler'] = self.crawler
        self._meta['crawler_run'] = self.crawler_run
        return Metadata.from_data(self._meta or {})

    @meta.setter
    def meta(self, meta):
        if isinstance(meta, Metadata):
            # Copy the denormalised fields back onto their columns.
            self.content_hash = meta.content_hash
            self.foreign_id = meta.foreign_id
            self.crawler = meta.crawler
            self.crawler_run = meta.crawler_run
            meta = meta.to_attr_dict()
        self._meta = meta
        # In-place JSONB mutation is not tracked by SQLAlchemy; flag it.
        flag_modified(self, '_meta')

    def update(self, data):
        """Validate ``data`` against the document JSON schema and merge it
        into the stored metadata."""
        validate(data, self._schema)
        meta = self.meta
        meta.update(data, safe=True)
        self.meta = meta
        db.session.add(self)

    def delete_pages(self):
        """Hard-delete all pages belonging to this document."""
        pq = db.session.query(DocumentPage)
        pq = pq.filter(DocumentPage.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_records(self):
        """Hard-delete all tabular records belonging to this document."""
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_references(self, origin=None):
        """Hard-delete entity references for this document, optionally only
        those created by the given ``origin``."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.document_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Hard-delete the document and all dependent rows.

        ``deleted_at`` is accepted for API symmetry with soft-delete models
        but is not used: documents are removed outright.
        """
        self.delete_references()
        self.delete_records()
        self.delete_pages()
        db.session.delete(self)

    def insert_records(self, sheet, iterable, chunk_size=1000):
        """Bulk-insert tabular rows from ``iterable`` into the given sheet,
        flushing in chunks of ``chunk_size`` to bound memory use."""
        chunk = []
        for i, data in enumerate(iterable):
            chunk.append({
                'document_id': self.id,
                'row_id': i,
                'sheet': sheet,
                'data': data
            })
            if len(chunk) >= chunk_size:
                db.session.bulk_insert_mappings(DocumentRecord, chunk)
                chunk = []
        if len(chunk):
            db.session.bulk_insert_mappings(DocumentRecord, chunk)

    def text_parts(self):
        """Yield all indexable text snippets, from pages or records
        depending on the document type."""
        if self.type == self.TYPE_TEXT:
            for page in self.pages:
                for text in page.text_parts():
                    yield text
        elif self.type == self.TYPE_TABULAR:
            for record in self.records:
                for text in record.text_parts():
                    yield text

    @classmethod
    def crawler_last_run(cls, crawler_id):
        """Return the most recent update timestamp for the given crawler."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.crawler == crawler_id)
        return q.scalar()

    @classmethod
    def is_crawler_active(cls, crawler_id):
        # TODO: add a function to see if a particular crawl is still running
        # this should be defined as having "pending" documents.
        last_run_time = cls.crawler_last_run(crawler_id)
        if last_run_time is None:
            return False
        # Heuristic: a crawler that touched a document within the last hour
        # is considered active.
        return last_run_time > (datetime.utcnow() - timedelta(hours=1))

    @classmethod
    def crawler_stats(cls, crawler_id):
        """Return per-status document counts plus last-run/active info for
        the given crawler."""
        # Check if the crawler was active very recently, if so, don't
        # allow the user to execute a new run right now.
        stats = {
            'updated': cls.crawler_last_run(crawler_id),
            'running': cls.is_crawler_active(crawler_id)
        }
        q = db.session.query(cls.status, func.count(cls.id))
        q = q.filter(cls.crawler == crawler_id)
        q = q.group_by(cls.status)
        for (status, count) in q.all():
            stats[status] = count
        return stats

    def _add_to_dict(self, data):
        """Attach shared serialisation fields (ids, status, errors) to
        ``data`` and return it."""
        try:
            from flask import request
            source_id = self.collection_id
            data['public'] = request.authz.collection_public(source_id)
        # Was a bare ``except:``, which also swallowed SystemExit and
        # KeyboardInterrupt. Outside a request context (or without authz)
        # we simply cannot tell whether the document is public.
        except Exception:
            data['public'] = None
        data.update({
            'id': self.id,
            'type': self.type,
            'status': self.status,
            'error_type': self.error_type,
            'error_message': self.error_message,
            'error_details': self.error_details,
            'collection_id': self.collection_id,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        })
        return data

    def to_dict(self):
        data = self.meta.to_dict()
        return self._add_to_dict(data)

    def to_index_dict(self):
        """Serialise the document for the search index, including full text
        and latinised sort/search fields."""
        data = self.meta.to_index_dict()
        data['text'] = index_form(self.text_parts())
        data['schema'] = self.SCHEMA
        data['schemata'] = [self.SCHEMA]
        data['name_sort'] = ascii_text(data.get('title'))
        data['title_latin'] = ascii_text(data.get('title'))
        data['summary_latin'] = ascii_text(data.get('summary'))
        return self._add_to_dict(data)

    def __repr__(self):
        return '<Document(%r,%r,%r)>' % (self.id, self.type, self.title)
class Alert(db.Model, SoftDeleteModel):
    """A subscription to notifications on a given query."""
    __tablename__ = 'alert'

    id = db.Column(db.Integer, primary_key=True)
    query = db.Column(db.Unicode, nullable=True)
    notified_at = db.Column(db.DateTime, nullable=True)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    role = db.relationship(Role, backref=db.backref('alerts', lazy='dynamic'))  # noqa

    @property
    def normalized(self):
        # Canonical form of the query text, used for de-duplication.
        return normalize(self.query)

    def delete(self, deleted_at=None):
        """Soft-delete this alert, defaulting the timestamp to now."""
        if deleted_at is None:
            deleted_at = datetime.utcnow()
        self.deleted_at = deleted_at
        db.session.add(self)
        db.session.flush()

    def update(self):
        """Record that the owner has just been notified for this alert."""
        self.notified_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()

    def is_same(self, other):
        """Check whether ``other`` duplicates this alert: same role and
        same normalised query."""
        return (other.role_id == self.role_id and
                other.normalized == self.normalized)

    @classmethod
    def by_id(cls, id, role_id=None):
        """Load an alert by primary key, optionally scoped to a role."""
        query = cls.all().filter_by(id=id)
        if role_id is not None:
            query = query.filter(cls.role_id == role_id)
        return query.first()

    @classmethod
    def by_role_id(cls, role_id):
        """All alerts belonging to a role, newest first."""
        return (cls.all()
                .filter(cls.role_id == role_id)
                .order_by(cls.created_at.desc())
                .order_by(cls.id.desc()))

    @classmethod
    def create(cls, data, role_id):
        """Create and persist an alert for the given role."""
        alert = cls()
        alert.query = stringify(data.get('query'))
        alert.role_id = role_id
        alert.update()
        return alert

    @classmethod
    def dedupe(cls):
        """Soft-delete the lower-ID member of every duplicate alert pair."""
        for left, right in permutations(cls.all(), 2):
            if left.id < right.id and left.is_same(right):
                left.delete()

    def __repr__(self):
        return '<Alert(%r, %r)>' % (self.id, self.query)
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access
    control is enforced."""

    # Category schema for collections.
    # TODO: add extra weight info.
    # TODO: should this be configurable?
    CATEGORIES = {
        'news': 'News archives',
        'leak': 'Leaks',
        'land': 'Land registry',
        'gazette': 'Gazettes',
        'court': 'Court archives',
        'company': 'Company registries',
        'watchlist': 'Watchlists',
        'investigation': 'Personal collections',
        'sanctions': 'Sanctions lists',
        'scrape': 'Scrapes',
        'procurement': 'Procurement',
        'grey': 'Grey literature',
        'license': 'Licenses and concessions',
        'regulatory': 'Regulatory filings',
        'other': 'Other material'
    }

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    countries = db.Column(ARRAY(db.Unicode()), default=[])
    languages = db.Column(ARRAY(db.Unicode()), default=[])
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)

    # Managed collections are generated by API crawlers and thus UI users
    # shouldn't be allowed to add entities or documents to them. They also
    # don't use advanced entity extraction features for performance reasons.
    managed = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def update(self, data, creator=None):
        """Apply user-supplied metadata from ``data`` and, if a creator can
        be resolved, grant them read/write permission."""
        self.label = data.get('label', self.label)
        self.summary = data.get('summary', self.summary)
        self.category = data.get('category', self.category)
        # NOTE(review): unlike the fields above, ``managed`` and
        # ``countries`` are reset to their defaults when absent from
        # ``data`` rather than preserved — confirm this is intentional.
        self.managed = data.get('managed', False)
        self.countries = data.get('countries', [])
        if creator is None:
            creator = Role.by_id(data.get('creator_id'))
        self.creator = creator
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()
        if creator is not None:
            Permission.grant(self, creator, True, True)

    @property
    def roles(self):
        """IDs of roles holding a live (non-deleted) read permission;
        computed once and cached on the instance."""
        if not hasattr(self, '_roles'):
            q = db.session.query(Permission.role_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.collection_id == self.id)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            self._roles = [e.role_id for e in q.all()]
        return self._roles

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Look up a collection by its unique foreign ID."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Load the given collections, restricted to those readable under
        ``authz`` (admins see everything)."""
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def create(cls, data, role=None):
        """Create a collection, or revive/update an existing (possibly
        soft-deleted) one with the same foreign ID."""
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.foreign_id = foreign_id
        collection.update(data, creator=role)
        # Revive the collection if it had been soft-deleted.
        collection.deleted_at = None
        return collection

    def __repr__(self):
        return '<Collection(%r, %r, %r)>' % \
            (self.id, self.foreign_id, self.label)
class Entity(db.Model, DatedModel):
    """A FollowTheMoney entity stored inside a collection, with its
    properties held as JSON."""

    THING = "Thing"
    LEGAL_ENTITY = "LegalEntity"

    id = db.Column(
        db.String(ENTITY_ID_LEN),
        primary_key=True,
        default=make_textid,
        nullable=False,
        unique=False,
    )
    schema = db.Column(db.String(255), index=True)
    data = db.Column("data", JSONB)
    role_id = db.Column(db.Integer, db.ForeignKey("role.id"), nullable=True)  # noqa
    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"), index=True)
    collection = db.relationship(Collection, backref=db.backref("entities", lazy="dynamic"))

    @property
    def model(self):
        # The FtM schema object for this entity's schema name.
        return model.get(self.schema)

    def update(self, data, collection):
        """Overwrite this entity from user-supplied ``data``, signing its ID
        into the collection namespace."""
        proxy = model.get_proxy(data, cleaned=False)
        proxy = collection.ns.apply(proxy)
        self.id = collection.ns.sign(self.id)
        self.schema = proxy.schema.name
        self.updated_at = datetime.utcnow()
        previous = self.to_proxy()
        for prop in proxy.schema.properties.values():
            # Do not allow the user to overwrite hashes because this could
            # lead to a user accessing random objects.
            if prop.type == registry.checksum:
                prev = previous.get(prop)
                proxy.set(prop, prev, cleaned=True, quiet=True)
        self.data = proxy.properties
        db.session.add(self)

    def to_proxy(self):
        """Build an FtM entity proxy from the stored row."""
        data = {
            "id": self.id,
            "schema": self.schema,
            "properties": self.data,
            "created_at": iso_text(self.created_at),
            "updated_at": iso_text(self.updated_at),
            "role_id": self.role_id,
            "mutable": True,
        }
        return model.get_proxy(data, cleaned=False)

    @classmethod
    def create(cls, data, collection, role_id=None):
        """Create a new entity in ``collection`` from ``data``.

        Raises InvalidData if the supplied entity ID is not valid.
        """
        entity = cls()
        entity_id = data.get("id") or make_textid()
        if not registry.entity.validate(entity_id):
            raise InvalidData(gettext("Invalid entity ID"))
        entity.id = collection.ns.sign(entity_id)
        entity.collection_id = collection.id
        entity.role_id = role_id
        entity.update(data, collection)
        return entity

    @classmethod
    def by_id(cls, entity_id, collection=None):
        """Load an entity by ID, optionally restricted to a collection."""
        q = cls.all().filter(cls.id == entity_id)
        if collection is not None:
            q = q.filter(cls.collection_id == collection.id)
        return q.first()

    @classmethod
    def by_collection(cls, collection_id):
        """Stream all entities of a collection in batches of 5000 rows."""
        q = cls.all()
        q = q.filter(Entity.collection_id == collection_id)
        q = q.yield_per(5000)
        return q

    @classmethod
    def delete_by_collection(cls, collection_id):
        """Hard-delete all entities of the given collection."""
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    def __repr__(self):
        return "<Entity(%r, %r)>" % (self.id, self.schema)
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access
    control is enforced."""

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    countries = db.Column(ARRAY(db.Unicode()), default=[])
    languages = db.Column(ARRAY(db.Unicode()), default=[])
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)

    # Managed collections are generated by API crawlers and thus UI users
    # shouldn't be allowed to add entities or documents to them. They also
    # don't use advanced entity extraction features for performance reasons.
    managed = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def update(self, data):
        """Apply user-supplied metadata from ``data`` and refresh the
        creator assignment."""
        self.label = data.get('label', self.label)
        self.summary = data.get('summary', self.summary)
        self.category = data.get('category', self.category)
        # ``managed`` and ``countries`` reset to defaults when absent.
        self.managed = data.get('managed', False)
        self.countries = data.get('countries', [])
        creator = data.get('creator') or {}
        self.update_creator(creator.get('id'))
        self.touch()

    def update_creator(self, role):
        """Set the creator (and admin) of a collection."""
        if not isinstance(role, Role):
            role = Role.by_id(role)
        # Only individual users (not groups/system roles) may be creators.
        if role is None or role.type != Role.USER:
            return
        self.creator = role
        db.session.add(self)
        db.session.flush()
        Permission.grant(self, role, True, True)

    def touch(self):
        """Bump the modification timestamp."""
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    def delete_matches(self):
        """Hard-delete all cross-reference matches involving this
        collection, on either side."""
        pq = db.session.query(Match)
        pq = pq.filter(
            or_(Match.collection_id == self.id,
                Match.match_collection_id == self.id))
        pq.delete(synchronize_session=False)

    def delete_permissions(self, deleted_at):
        """Soft-delete all permissions on this collection."""
        pq = db.session.query(Permission)
        pq = pq.filter(Permission.collection_id == self.id)
        pq.update({Permission.deleted_at: deleted_at},
                  synchronize_session=False)

    def delete(self, deleted_at=None):
        """Soft-delete the collection, its permissions and matches."""
        # Fix: default the timestamp before passing it down. Previously a
        # None deleted_at would set Permission.deleted_at to NULL, leaving
        # the permissions effectively active.
        deleted_at = deleted_at or datetime.utcnow()
        self.delete_matches()
        self.delete_permissions(deleted_at=deleted_at)
        super(Collection, self).delete(deleted_at=deleted_at)

    @property
    def roles(self):
        """IDs of roles holding a live (non-deleted) read permission;
        computed once and cached on the instance."""
        if not hasattr(self, '_roles'):
            q = db.session.query(Permission.role_id)
            # Fix: exclude soft-deleted permissions, consistent with the
            # other access checks — otherwise revoked roles kept read access.
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.collection_id == self.id)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            self._roles = [e.role_id for e in q.all()]
        return self._roles

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Look up a collection by its unique foreign ID."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Load the given collections, restricted to those readable under
        ``authz`` (admins see everything)."""
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def create(cls, data, role=None):
        """Create a collection, or revive/update an existing (possibly
        soft-deleted) one with the same foreign ID."""
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.foreign_id = foreign_id
        collection.update(data)
        collection.update_creator(role)
        # Revive the collection if it had been soft-deleted.
        collection.deleted_at = None
        return collection

    def __repr__(self):
        return '<Collection(%r, %r, %r)>' % \
            (self.id, self.foreign_id, self.label)
class EntitySet(db.Model, SoftDeleteModel):
    """A user-curated grouping of entities: a list, diagram, timeline or
    cross-reference profile."""

    __tablename__ = "entityset"

    # set types
    LIST = "list"
    DIAGRAM = "diagram"
    TIMELINE = "timeline"
    PROFILE = "profile"
    TYPES = frozenset([LIST, DIAGRAM, TIMELINE, PROFILE])

    id = db.Column(db.String(ENTITY_ID_LEN), primary_key=True)
    label = db.Column(db.Unicode)
    type = db.Column(db.String(10), index=True, default=LIST)
    summary = db.Column(db.Unicode, nullable=True)
    layout = db.Column("layout", JSONB, nullable=True)

    role_id = db.Column(db.Integer, db.ForeignKey("role.id"), index=True)
    role = db.relationship(Role)
    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"), index=True)
    collection = db.relationship(Collection)
    parent_id = db.Column(db.String(ENTITY_ID_LEN), db.ForeignKey("entityset.id"))
    parent = db.relationship("EntitySet", backref="children", remote_side=[id])

    @property
    def entities(self):
        """IDs of entities positively and actively judged into this set."""
        q = db.session.query(EntitySetItem.entity_id)
        q = q.filter(EntitySetItem.entityset_id == self.id)
        q = q.filter(EntitySetItem.judgement == Judgement.POSITIVE)
        q = q.filter(EntitySetItem.deleted_at == None)  # noqa
        return [entity_id for entity_id, in q.all()]

    @classmethod
    def create(cls, data, collection, authz):
        """Create a new entity set in ``collection`` owned by the current
        role."""
        entityset = cls()
        entityset.id = make_textid()
        entityset.layout = {}
        entityset.role_id = authz.id
        entityset.collection_id = collection.id
        entityset.update(data)
        return entityset

    @classmethod
    def by_authz(cls, authz, types=None, prefix=None):
        """Query entity sets in collections readable by ``authz``,
        optionally filtered by type and a label prefix."""
        ids = authz.collections(authz.READ)
        q = cls.by_type(types)
        q = q.filter(cls.collection_id.in_(ids))
        if prefix is not None:
            q = q.filter(query_like(cls.label, prefix))
        return q

    @classmethod
    def by_type(cls, types):
        """Returns EntitySets of a particular type"""
        q = EntitySet.all()
        types = ensure_list(types)
        # No filter needed when every known type is requested.
        if len(types) and types != cls.TYPES:
            q = q.filter(EntitySet.type.in_(types))
        return q

    @classmethod
    def by_collection_id(cls, collection_id, types=None):
        """Returns EntitySets within a given collection_id"""
        q = cls.by_type(types)
        q = q.filter(EntitySet.collection_id == collection_id)
        return q

    @classmethod
    def by_entity_id(cls, entity_id, collection_ids=None, judgements=None,
                     types=None, labels=None):
        """Returns EntitySets that include EntitySetItems with the provided
        entity_id.

        NOTE: This only considers EntitySetItems who haven't been deleted
        """
        q = cls.by_type(types)
        if labels is not None:
            q = q.filter(EntitySet.label.in_(ensure_list(labels)))
        q = q.join(EntitySetItem)
        q = q.filter(EntitySetItem.deleted_at == None)  # NOQA
        q = q.filter(EntitySetItem.entity_id == entity_id)
        if collection_ids:
            q = q.filter(EntitySet.collection_id.in_(collection_ids))
        if judgements is not None:
            q = q.filter(EntitySetItem.judgement.in_(ensure_list(judgements)))
        return q

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at):
        """Soft-delete all entity sets (and their items) in a collection."""
        EntitySetItem.delete_by_collection(collection_id)
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    def items(self, authz=None, deleted=False):
        """Items of this set, oldest first, optionally restricted to
        collections readable by ``authz``."""
        q = EntitySetItem.all(deleted=deleted)
        if authz is not None:
            ids = authz.collections(authz.READ)
            q = q.filter(EntitySetItem.collection_id.in_(ids))
        q = q.filter(EntitySetItem.entityset_id == self.id)
        q = q.order_by(EntitySetItem.created_at.asc())
        return q

    def profile(self, judgements=None, deleted=False):
        """Items of this set filtered to the given judgements."""
        q = self.items(deleted=deleted)
        if judgements is not None:
            q = q.filter(EntitySetItem.judgement.in_(judgements))
        return q

    def merge(self, other, merged_by_id):
        """Merge two entity_sets into each other. The older one is retained.

        This tries to retain a state where there is only one judgement
        between a set and an entity.
        """
        if other.id == self.id:
            return self
        if other.created_at > self.created_at:
            # Always merge the newer set into the older one.
            return other.merge(self, merged_by_id)
        local_items = {i.entity_id: i for i in self.items()}
        for remote in other.items():
            local = local_items.get(remote.entity_id)
            if local is None:
                # Entity only in the other set: move its item over.
                remote.entityset_id = self.id
                remote.updated_at = datetime.utcnow()
                db.session.add(remote)
                continue
            # Fix: was ``local.judgment`` (typo), which raised
            # AttributeError whenever both sets judged the same entity.
            judgement = local.judgement + remote.judgement
            if judgement == local.judgement:
                # Combination adds nothing; keep the local item.
                remote.delete()
                continue
            # Replace both items with a single combined judgement.
            origin = local.compared_to_entity_id or remote.compared_to_entity_id
            combined = EntitySetItem(
                entityset_id=self.id,
                entity_id=local.entity_id,
                collection_id=local.collection_id,
                added_by_id=merged_by_id,
                judgement=judgement,
                compared_to_entity_id=origin,
            )
            db.session.add(combined)
            local.delete()
            remote.delete()
        other.delete()
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()
        return self

    def update(self, data):
        """Apply user-supplied metadata and revive the set if it had been
        soft-deleted."""
        self.label = data.get("label", self.label)
        self.type = data.get("type", self.type)
        self.summary = data.get("summary", self.summary)
        self.layout = data.get("layout", self.layout)
        self.updated_at = datetime.utcnow()
        self.deleted_at = None
        db.session.add(self)

    def delete(self, deleted_at=None):
        """Soft-delete the set and its live items; detach any mappings."""
        pq = db.session.query(EntitySetItem)
        pq = pq.filter(EntitySetItem.entityset_id == self.id)
        pq = pq.filter(EntitySetItem.deleted_at == None)  # noqa
        pq.update({EntitySetItem.deleted_at: deleted_at},
                  synchronize_session=False)
        for mapping in self.mappings:
            mapping.entityset_id = None
            db.session.add(mapping)
        self.deleted_at = deleted_at or datetime.utcnow()
        db.session.add(self)

    def to_dict(self):
        data = self.to_dict_dates()
        data.update({
            "id": stringify(self.id),
            "type": self.type,
            "label": self.label,
            "summary": self.summary,
            "layout": self.layout,
            "role_id": stringify(self.role_id),
            "collection_id": stringify(self.collection_id),
        })
        return data

    def __repr__(self):
        return "<EntitySet(%r, %r)>" % (self.id, self.collection_id)
class Notification(db.Model, IdModel, DatedModel):
    """An event notification, fanned out to one or more channels."""

    GLOBAL = 'Global'

    _event = db.Column('event', db.String(255), nullable=False)
    channels = db.Column(ARRAY(db.String(255)), index=True)
    params = db.Column(JSONB)
    actor_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    actor = db.relationship(Role)

    @hybrid_property
    def event(self):
        # Resolve the stored event name to its Event definition (or None
        # if the event type no longer exists).
        return Events.get(self._event)

    @event.setter
    def event(self, event):
        self._event = event.name

    def iterparams(self):
        """Yield (name, class, value) triples for the actor and every
        non-empty event parameter."""
        if self.actor_id is not None:
            yield 'actor', Role, self.actor_id
        if self.event is None:
            return
        for name, clazz in self.event.params.items():
            value = self.params.get(name)
            if value is not None:
                yield name, clazz, value

    def to_dict(self):
        data = self.to_dict_dates()
        data.update({
            'id': self.id,
            'actor_id': self.actor_id,
            'event': self._event,
            'params': self.params
        })
        return data

    @classmethod
    def publish(cls, event, actor_id=None, channels=None, params=None):
        """Create and enqueue a notification for ``event``.

        ``channels`` and ``params`` default to empty. (Previously they were
        mutable default arguments ``[]``/``{}`` — a shared-state hazard.)
        """
        notf = cls()
        notf.event = event
        notf.actor_id = actor_id
        notf.params = params if params is not None else {}
        channels = channels if channels is not None else []
        # De-duplicate and drop empty channel names.
        notf.channels = list(set([c for c in channels if c is not None]))
        db.session.add(notf)
        return notf

    @classmethod
    def by_channels(cls, channels, role, since=None):
        """Query notifications on any of ``channels`` relevant to ``role``,
        excluding the role's own actions and anything already seen."""
        channels = cast(channels, ARRAY(db.String(255)))
        q = cls.all()
        q = q.filter(cls.channels.overlap(channels))
        # Skip notifications for event types that no longer exist.
        q = q.filter(cls._event.in_(Events.names()))
        q = q.filter(or_(
            cls.actor_id != role.id,
            cls.actor_id == None  # noqa
        ))
        # Use the later of the explicit cutoff and the role's last
        # notification time.
        since = since or role.notified_at
        if since is not None and role.notified_at is not None:
            since = max(since, role.notified_at)
        if since is not None:
            q = q.filter(cls.created_at >= since)
        q = q.order_by(cls.created_at.desc())
        return q

    @classmethod
    def delete_by_channel(cls, channel):
        """Hard-delete every notification addressed to ``channel``."""
        q = cls.all()
        q = q.filter(cls.channels.any(channel))
        q.delete(synchronize_session=False)
class Entity(db.Model, SoftDeleteModel):
    """A soft-deletable FollowTheMoney entity stored in a collection."""

    THING = 'Thing'
    LEGAL_ENTITY = 'LegalEntity'

    id = db.Column(db.String(ENTITY_ID_LEN), primary_key=True,
                   default=make_textid, nullable=False, unique=False)
    name = db.Column(db.Unicode)
    schema = db.Column(db.String(255), index=True)
    foreign_id = db.Column(db.Unicode)
    data = db.Column('data', JSONB)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('entities', lazy='dynamic'))  # noqa

    @property
    def model(self):
        # The FtM schema object for this entity's schema name.
        return model.get(self.schema)

    @property
    def signed_id(self):
        # The entity ID signed into the collection's namespace.
        return self.collection.ns.sign(self.id)

    def delete_matches(self):
        """Hard-delete cross-reference matches involving this entity, on
        either side of the match."""
        pq = db.session.query(Match)
        pq = pq.filter(
            or_(Match.entity_id == self.id,
                Match.match_id == self.id))
        pq.delete(synchronize_session=False)
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Soft-delete the entity after removing its matches."""
        self.delete_matches()
        deleted_at = deleted_at or datetime.utcnow()
        super(Entity, self).delete(deleted_at=deleted_at)

    def update(self, entity):
        """Overwrite this entity's schema and properties from user-supplied
        ``entity`` data (validated against its schema)."""
        proxy = model.get_proxy(entity)
        proxy.schema.validate(entity)
        self.schema = proxy.schema.name
        previous = self.to_proxy()
        for prop in proxy.iterprops():
            # Do not allow the user to overwrite hashes because this could
            # lead to a user accessing random objects.
            if prop.type == registry.checksum:
                proxy.set(prop, previous.get(prop), cleaned=True, quiet=True)
        self.data = proxy.properties
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    def to_proxy(self):
        """Build an FtM entity proxy, folding in the name column and the
        update timestamp."""
        proxy = model.get_proxy({
            'id': self.id,
            'schema': self.schema,
            'properties': self.data
        })
        proxy.add('name', self.name)
        proxy.set('indexUpdatedAt', self.updated_at)
        return proxy

    @classmethod
    def create(cls, data, collection):
        """Create an entity in ``collection``, reviving a soft-deleted one
        with the same foreign ID if present."""
        foreign_id = data.get('foreign_id')
        ent = cls.by_foreign_id(foreign_id, collection.id, deleted=True)
        if ent is None:
            ent = cls()
            ent.id = make_textid()
            ent.collection = collection
            ent.foreign_id = foreign_id
            ent.data = {}
        ent.deleted_at = None
        ent.update(data)
        return ent

    @classmethod
    def by_id(cls, entity_id, collection_id=None):
        """Load an entity by ID (namespace signature is stripped first).

        NOTE(review): the ``collection_id`` parameter is accepted but never
        applied as a filter — confirm whether callers rely on scoping here.
        """
        entity_id, _ = Namespace.parse(entity_id)
        q = cls.all()
        q = q.filter(cls.id == entity_id)
        return q.first()

    @classmethod
    def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
        """Load an entity by foreign ID within a collection, preferring a
        live row over soft-deleted ones."""
        if foreign_id is None:
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        q = q.filter(cls.foreign_id == foreign_id)
        # Rows with deleted_at NULL (live) sort first.
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def by_collection(cls, collection_id):
        """All live entities of a collection."""
        return cls.all().filter(Entity.collection_id == collection_id)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Soft-delete all entities of a collection and hard-delete any
        matches referencing them."""
        deleted_at = deleted_at or datetime.utcnow()
        entities = db.session.query(cls.id)
        entities = entities.filter(cls.collection_id == collection_id)
        entities = entities.subquery()
        pq = db.session.query(Match)
        pq = pq.filter(Match.entity_id.in_(entities))
        pq.delete(synchronize_session=False)
        pq = db.session.query(Match)
        pq = pq.filter(Match.match_id.in_(entities))
        pq.delete(synchronize_session=False)
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access
    control is enforced."""

    # Category schema for collections.
    # TODO: add extra weight info.
    # TODO: should this be configurable?
    CATEGORIES = {
        'news': lazy_gettext('News archives'),
        'leak': lazy_gettext('Leaks'),
        'land': lazy_gettext('Land registry'),
        'gazette': lazy_gettext('Gazettes'),
        'court': lazy_gettext('Court archives'),
        'company': lazy_gettext('Company registries'),
        'watchlist': lazy_gettext('Watchlists'),
        'investigation': lazy_gettext('Personal collections'),
        'sanctions': lazy_gettext('Sanctions lists'),
        'scrape': lazy_gettext('Scrapes'),
        'procurement': lazy_gettext('Procurement'),
        'grey': lazy_gettext('Grey literature'),
        'license': lazy_gettext('Licenses and concessions'),
        'regulatory': lazy_gettext('Regulatory filings'),
        'other': lazy_gettext('Other material')
    }
    DEFAULT = 'other'

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    countries = db.Column(ARRAY(db.Unicode()), default=[])
    languages = db.Column(ARRAY(db.Unicode()), default=[])
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)
    publisher = db.Column(db.Unicode, nullable=True)
    publisher_url = db.Column(db.Unicode, nullable=True)
    info_url = db.Column(db.Unicode, nullable=True)
    data_url = db.Column(db.Unicode, nullable=True)

    # A casefile is a type of collection which is used to manage the state
    # of an investigation. Unlike normal collections, cases do not serve
    # as source material, but as a mechanism of analysis.
    casefile = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def update(self, data, creator=None):
        """Apply user-supplied metadata from ``data`` and, if a creator can
        be resolved, grant them read/write permission."""
        self.label = data.get('label', self.label)
        # Fix: the summary assignment was duplicated in the original.
        self.summary = data.get('summary', self.summary)
        self.publisher = data.get('publisher', self.publisher)
        self.publisher_url = data.get('publisher_url', self.publisher_url)
        self.info_url = data.get('info_url', self.info_url)
        self.data_url = data.get('data_url', self.data_url)
        # Category falls back to the default rather than being preserved.
        self.category = data.get('category') or self.DEFAULT
        self.casefile = as_bool(data.get('casefile'), default=False)
        self.countries = data.get('countries', [])
        self.languages = data.get('languages', [])
        if creator is None:
            creator = Role.by_id(data.get('creator_id'))
        self.creator = creator
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()
        if creator is not None:
            Permission.grant(self, creator, True, True)

    @property
    def roles(self):
        """IDs of roles holding a live (non-deleted) read permission;
        computed once and cached on the instance."""
        if not hasattr(self, '_roles'):
            q = db.session.query(Permission.role_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.collection_id == self.id)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            self._roles = [e.role_id for e in q.all()]
        return self._roles

    @property
    def kind(self):
        # Collections are either investigation casefiles or source data.
        return 'casefile' if self.casefile else 'source'

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Look up a collection by its unique foreign ID."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Load the given collections, restricted to those readable under
        ``authz`` (admins see everything)."""
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def create(cls, data, role=None):
        """Create a collection, or revive/update an existing (possibly
        soft-deleted) one with the same foreign ID."""
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.foreign_id = foreign_id
        collection.update(data, creator=role)
        # Revive the collection if it had been soft-deleted.
        collection.deleted_at = None
        return collection

    def __repr__(self):
        return '<Collection(%r, %r, %r)>' % \
            (self.id, self.foreign_id, self.label)
class Role(db.Model, IdModel, SoftDeleteModel):
    """A user, group or other access control subject."""
    __tablename__ = 'role'

    USER = '******'
    GROUP = 'group'
    SYSTEM = 'system'
    TYPES = [USER, GROUP, SYSTEM]
    SYSTEM_GUEST = 'guest'
    SYSTEM_USER = '******'

    #: Generates URL-safe signatures for invitations.
    SIGNATURE = URLSafeTimedSerializer(settings.SECRET_KEY)

    #: Signature maximum age, defaults to 1 day
    SIGNATURE_MAX_AGE = 60 * 60 * 24

    foreign_id = db.Column(db.Unicode(2048), nullable=False, unique=True)
    name = db.Column(db.Unicode, nullable=False)
    email = db.Column(db.Unicode, nullable=True)
    type = db.Column(db.Enum(*TYPES, name='role_type'), nullable=False)
    api_key = db.Column(db.Unicode, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    is_muted = db.Column(db.Boolean, nullable=False, default=False)
    is_tester = db.Column(db.Boolean, nullable=False, default=False)
    is_blocked = db.Column(db.Boolean, nullable=False, default=False)
    password_digest = db.Column(db.Unicode, nullable=True)
    # Transient plain-text password holder; never persisted.
    password = None
    reset_token = db.Column(db.Unicode, nullable=True)
    locale = db.Column(db.Unicode, nullable=True)

    permissions = db.relationship('Permission', backref='role')

    @property
    def has_password(self):
        return self.password_digest is not None

    @property
    def is_public(self):
        return self.id in self.public_roles()

    @property
    def is_alertable(self):
        """True if notifications can be sent to this role."""
        if self.email is None:
            return False
        if self.is_muted is True:
            return False
        # TODO: ignore people that have not logged in for a certain time?
        return True

    @property
    def label(self):
        return anonymize_email(self.name, self.email)

    def update(self, data):
        """Apply user-editable attributes from a dict."""
        self.name = data.get('name', self.name)
        self.is_muted = data.get('is_muted', self.is_muted)
        self.is_tester = data.get('is_tester', self.is_tester)
        if data.get('password'):
            self.set_password(data.get('password'))
        self.locale = stringify(data.get('locale', self.locale))
        self.updated_at = datetime.utcnow()

    def clear_roles(self):
        """Removes any existing roles from group membership."""
        self.roles = []
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()

    def add_role(self, role):
        """Adds an existing role as a membership of a group."""
        self.roles.append(role)
        db.session.add(role)
        db.session.add(self)
        self.updated_at = datetime.utcnow()

    def to_dict(self):
        """Serialize the role for API responses."""
        data = self.to_dict_dates()
        data.update({
            'id': stringify(self.id),
            'type': self.type,
            'name': self.name,
            'label': self.label,
            'email': self.email,
            'locale': self.locale,
            'api_key': self.api_key,
            'is_admin': self.is_admin,
            'is_muted': self.is_muted,
            'is_tester': self.is_tester,
            'has_password': self.has_password,
            # 'notified_at': self.notified_at
        })
        return data

    @classmethod
    def by_foreign_id(cls, foreign_id):
        if foreign_id is not None:
            return cls.all().filter_by(foreign_id=foreign_id).first()

    @classmethod
    def by_email(cls, email):
        """Case-insensitive lookup by email address."""
        if email is None:
            return None
        q = cls.all()
        q = q.filter(func.lower(cls.email) == email.lower())
        return q.first()

    @classmethod
    def by_api_key(cls, api_key):
        if api_key is not None:
            return cls.all().filter_by(api_key=api_key).first()

    @classmethod
    def load_or_create(cls, foreign_id, type, name, email=None,
                       is_admin=None):
        """Fetch a role by foreign ID, creating it if needed, and sync
        email/admin attributes."""
        role = cls.by_foreign_id(foreign_id)
        if role is None:
            role = cls()
            role.foreign_id = foreign_id
            role.name = name or email
            role.type = type
            role.is_admin = False
            role.is_muted = False
            role.is_tester = False
            role.is_blocked = False
            role.notified_at = datetime.utcnow()
        if role.api_key is None:
            role.api_key = make_textid()
        if email is not None:
            role.email = email
        if is_admin is not None:
            role.is_admin = is_admin

        # see: https://github.com/alephdata/aleph/issues/111
        auto_admins = [a.lower() for a in settings.ADMINS]
        if email is not None and email.lower() in auto_admins:
            role.is_admin = True

        db.session.add(role)
        db.session.flush()
        return role

    @classmethod
    def load_cli_user(cls):
        return cls.load_or_create(foreign_id=settings.SYSTEM_USER,
                                  name='Aleph',
                                  type=cls.USER,
                                  is_admin=True)

    @classmethod
    def load_id(cls, foreign_id):
        """Load a role and return the ID."""
        # Cached process-wide on the settings object.
        if not hasattr(settings, '_roles'):
            settings._roles = {}
        if foreign_id not in settings._roles:
            role_id = cls.all_ids().filter_by(foreign_id=foreign_id).first()
            if role_id is not None:
                settings._roles[foreign_id] = role_id[0]
        return settings._roles.get(foreign_id)

    @classmethod
    def public_roles(cls):
        """Roles which make a collection to be considered public."""
        return set([
            cls.load_id(cls.SYSTEM_USER),
            cls.load_id(cls.SYSTEM_GUEST),
        ])

    @classmethod
    def by_prefix(cls, prefix, exclude=None):
        """Load a list of roles matching a name, email address, or
        foreign_id.

        :param str prefix: Pattern to match.
        :param exclude: Optional iterable of role IDs to skip.
        """
        # Fix: `exclude` previously defaulted to a shared mutable list.
        query = prefix.replace('%', ' ').replace('_', ' ')
        query = '%%%s%%' % query
        q = cls.all()
        q = q.filter(Role.type == Role.USER)
        if exclude:
            q = q.filter(not_(Role.id.in_(exclude)))
        q = q.filter(or_(
            func.lower(cls.email) == prefix.lower(),
            cls.name.ilike(query)
        ))
        q = q.order_by(Role.id.asc())
        return q

    @classmethod
    def all_groups(cls, authz):
        """All groups visible to the given authz context."""
        q = cls.all()
        q = q.filter(Role.type == Role.GROUP)
        q = q.order_by(Role.name.asc())
        q = q.order_by(Role.foreign_id.asc())
        if not authz.is_admin:
            q = q.filter(Role.id.in_(authz.roles))
        return q

    @classmethod
    def all_users(cls):
        return cls.all().filter(Role.type == Role.USER)

    @classmethod
    def all_system(cls):
        return cls.all().filter(Role.type == Role.SYSTEM)

    def set_password(self, secret):
        """Hashes and sets the role password.

        :param str secret: The password to be set.
        """
        self.password_digest = generate_password_hash(secret)

    def check_password(self, secret):
        """Checks the password if it matches the role password hash.

        :param str secret: The password to be checked.
        :rtype: bool
        """
        digest = self.password_digest or ''
        return check_password_hash(digest, secret)

    def __repr__(self):
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)
class Entity(db.Model, UuidModel, SoftDeleteModel):
    THING = 'Thing'

    name = db.Column(db.Unicode)
    schema = db.Column(db.String(255), index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('entities', lazy='dynamic'))  # noqa

    @property
    def model(self):
        return model.get(self.schema)

    @property
    def terms(self):
        """The entity name plus all non-empty aliases from its data."""
        terms = set([self.name])
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, and entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([match_form(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            # Skip terms too short or too long to be useful patterns.
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def delete_matches(self):
        """Remove all cross-reference matches pointing at this entity."""
        pq = db.session.query(Match)
        pq = pq.filter(or_(Match.entity_id == self.id,
                           Match.match_id == self.id))
        pq.delete(synchronize_session=False)
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Soft-delete the entity, its alerts and its matches."""
        self.delete_matches()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Soft-delete all entities in a collection, along with their
        alerts and matches."""
        from aleph.model import Alert
        deleted_at = deleted_at or datetime.utcnow()
        entities = db.session.query(cls.id)
        entities = entities.filter(cls.collection_id == collection_id)
        entities = entities.subquery()

        pq = db.session.query(Alert)
        pq = pq.filter(Alert.entity_id.in_(entities))
        pq.update({Alert.deleted_at: deleted_at},
                  synchronize_session=False)

        pq = db.session.query(Match)
        pq = pq.filter(Match.entity_id.in_(entities))
        pq.delete(synchronize_session=False)

        pq = db.session.query(Match)
        pq = pq.filter(Match.match_id.in_(entities))
        pq.delete(synchronize_session=False)

        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at},
                  synchronize_session=False)

    def merge(self, other):
        """Fold `other` into this entity, re-point its alerts, and
        delete it. Both entities must live in the same collection."""
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError("Cannot merge entities from different collections.")  # noqa

        self.schema = model.precise_schema(self.schema, other.schema)
        # Fix: previously merged `self.foreign_ids` with itself, silently
        # dropping the other entity's foreign IDs.
        self.foreign_ids = string_set(self.foreign_ids, other.foreign_ids)
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()

        data = merge_data(self.data, other.data)
        if self.name != other.name:
            data = merge_data(data, {'alias': [other.name]})
        self.data = data

        # update alerts
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({Alert.entity_id: self.id})

        # delete source entities
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        """Apply a dict of entity attributes (schema, name, properties)."""
        self.schema = entity.get('schema')
        data = entity.get('properties')
        if is_mapping(data):
            data['name'] = [entity.get('name')]
            self.data = self.model.validate(data)
        elif self.data is None:
            self.data = {}
        # The name is stored on its own column, not in the data blob.
        self.data.pop('name', None)
        self.name = entity.get('name')
        # TODO: should this be mutable?
        # self.foreign_ids = string_set(entity.get('foreign_ids'))
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def create(cls, data, collection):
        """Create an entity, reviving a soft-deleted one with the same
        foreign IDs if it exists."""
        foreign_ids = string_set(data.get('foreign_ids'))
        ent = cls.by_foreign_ids(foreign_ids, collection.id, deleted=True)
        if ent is None:
            ent = cls()
            ent.id = make_textid()
            ent.collection = collection
            ent.foreign_ids = foreign_ids
        ent.update(data)
        ent.deleted_at = None
        return ent

    @classmethod
    def by_foreign_ids(cls, foreign_ids, collection_id, deleted=False):
        """Find an entity in a collection containing all the given
        foreign IDs."""
        if not len(foreign_ids):
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        foreign_id = func.cast(foreign_ids, ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        # Prefer live entities over soft-deleted ones.
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def all_ids(cls, deleted=False, authz=None):
        """IDs of all entities readable under the given authz context."""
        q = super(Entity, cls).all_ids(deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission,
                       cls.collection_id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def latest(cls):
        """Most recent update timestamp of any live entity."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.deleted_at == None)  # noqa
        return q.scalar()

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)
class Mapping(db.Model, SoftDeleteModel):
    """A mapping to load entities from a table"""
    __tablename__ = 'mapping'

    FAILED = 'failed'
    SUCCESS = 'success'
    STATUS = {SUCCESS: lazy_gettext('success'),
              FAILED: lazy_gettext('failed')}

    id = db.Column(db.Integer, primary_key=True)
    query = db.Column('query', JSONB)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    role = db.relationship(Role, backref=db.backref('mappings', lazy='dynamic'))  # noqa
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('mappings', lazy='dynamic'))  # noqa
    table_id = db.Column(db.String(ENTITY_ID_LEN), index=True)
    last_run_status = db.Column(db.Unicode, nullable=True)
    last_run_err_msg = db.Column(db.Unicode, nullable=True)

    def update(self, query=None, table_id=None):
        """Store a new query and/or table reference and persist."""
        if query:
            self.query = query
        if table_id:
            self.table_id = table_id
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        db.session.commit()

    def set_status(self, status, error=None):
        """Record the outcome of the latest mapping run."""
        self.last_run_status = status
        self.last_run_err_msg = error
        db.session.add(self)
        db.session.commit()

    def delete(self, deleted_at=None):
        """Soft-delete this mapping immediately."""
        self.deleted_at = deleted_at or datetime.utcnow()
        db.session.add(self)
        db.session.commit()

    def to_dict(self):
        """Serialize the mapping for API responses."""
        data = self.to_dict_dates()
        data['id'] = stringify(self.id)
        data['query'] = dict(self.query)
        data['role_id'] = stringify(self.role_id)
        data['collection_id'] = stringify(self.collection_id)
        data['table_id'] = self.table_id
        data['last_run_status'] = self.STATUS.get(self.last_run_status)
        data['last_run_err_msg'] = self.last_run_err_msg
        return data

    @classmethod
    def by_collection(cls, collection_id, table_id=None):
        """All mappings of a collection, optionally limited to one table."""
        qry = cls.all().filter(cls.collection_id == collection_id)
        if table_id is not None:
            qry = qry.filter(cls.table_id == table_id)
        return qry

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Soft-delete every live mapping in the given collection."""
        if deleted_at is None:
            deleted_at = datetime.utcnow()
        qry = db.session.query(cls)
        qry = qry.filter(cls.collection_id == collection_id)
        qry = qry.filter(cls.deleted_at == None)  # noqa
        qry.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    @classmethod
    def create(cls, query, table_id, collection, role_id):
        """Create and persist a new mapping record."""
        mapping = cls()
        mapping.role_id = role_id
        mapping.collection_id = collection.id
        mapping.query = query
        mapping.table_id = table_id
        mapping.update()
        return mapping

    def __repr__(self):
        return '<Mapping(%r, %r)>' % (self.id, self.table_id)
    # NOTE(review): these methods appear to duplicate `all_users`,
    # `set_password`, `check_password` and `__repr__` defined earlier on
    # the Role class — confirm which set is intended to survive.
    def all_users(cls, has_email=False):
        # Query of all user-type roles; optionally only those that have
        # an email address set.
        q = cls.all().filter(Role.type == Role.USER)
        if has_email:
            q = q.filter(Role.email != None)  # noqa
        return q

    def set_password(self, secret):
        """Hashes and sets the role password.

        :param str secret: The password to be set.
        """
        self.password_digest = generate_password_hash(secret)

    def check_password(self, secret):
        """Checks the password if it matches the role password hash.

        :param str secret: The password to be checked.
        :rtype: bool
        """
        return check_password_hash(self.password_digest or '', secret)

    def __repr__(self):
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)


# Self-referential many-to-many: a group role contains member roles via
# the `membership` association table; members get a `roles` backref.
Role.members = db.relationship(Role,
                               secondary=membership,
                               primaryjoin=Role.id == membership.c.group_id,
                               secondaryjoin=Role.id == membership.c.member_id,
                               backref="roles")