Пример #1
0
class EventLog(db.Model, IdModel, DatedModel):
    """Record of an emitted event: action, path, source IP and payloads."""

    action = db.Column(db.Unicode(255), index=True)
    source_ip = db.Column(db.Unicode(255), nullable=True)
    path = db.Column(db.Unicode(), nullable=True)
    query = db.Column(JSONB)
    data = db.Column(JSONB)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)

    @classmethod
    def emit(cls, action, path, source_ip=None, query=None, data=None,
             role_id=None):
        """Create an event, add it to ``db.session`` and return it.

        The record is only staged in the session; this method neither
        flushes nor commits.
        """
        record = EventLog()
        record.action = action
        record.path = path
        record.source_ip = source_ip
        record.query = query
        record.data = data
        # Only touch the foreign key when a role was actually supplied.
        if role_id is not None:
            record.role_id = role_id
        db.session.add(record)
        return record

    def __repr__(self):
        return '<EventLog(%r, %r)>' % (self.id, self.action)
Пример #2
0
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""
    # Maximum length of the ``text`` column declared below.
    TEXT_LENGTH = 1024

    # Known tag type identifiers (keys of ``TYPES``).
    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'

    # Maps each tag type identifier to an ``exactitude`` type object.
    TYPES = {
        TYPE_PERSON: exactitude.names,
        TYPE_ORGANIZATION: exactitude.names,
        TYPE_EMAIL: exactitude.emails,
        TYPE_PHONE: exactitude.phones,
        TYPE_LOCATION: exactitude.addresses,
        TYPE_IP: exactitude.ips,
        TYPE_IBAN: exactitude.ibans,
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(db.Integer(),
                            db.ForeignKey('document.id'),
                            index=True)  # noqa
    # Tags are exposed on the document as ``tags`` and cascade with it.
    document = db.relationship(
        "Document", backref=db.backref('tags',
                                       cascade='all, delete-orphan'))  # noqa

    @property
    def field(self):
        """Return the second element of the pair matching this tag's type.

        Raises ``KeyError`` when ``self.type`` is not a key of
        ``self.TYPES``; returns ``None`` implicitly when no pair matches.
        """
        type_ = self.TYPES[self.type]
        # NOTE(review): the bare ``TYPES`` below is not the class attribute
        # (class scope is not visible from inside a method). Presumably a
        # module-level mapping whose values are (type, field) pairs is
        # imported elsewhere in the file -- confirm, otherwise this line
        # raises NameError.
        for (candidate, invert) in TYPES.values():
            if candidate == type_:
                return invert

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete the tags matching every given filter, then flush.

        At least one of the arguments must be truthy; this is enforced
        with an ``assert`` only. Arguments left as ``None`` are not used
        for filtering.
        """
        pq = db.session.query(cls)
        assert document_id or origin or type
        if document_id is not None:
            pq = pq.filter(cls.document_id == document_id)
        if origin is not None:
            pq = pq.filter(cls.origin == origin)
        if type is not None:
            pq = pq.filter(cls.type == type)
        pq.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.text)
Пример #3
0
class DocumentTag(db.Model, IdModel):
    """An entity or tag that was extracted from a document."""
    TEXT_LENGTH = 1024

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'
    TYPE_COUNTRY = 'country'
    TYPE_LANGUAGE = 'language'

    # Tag type identifier -> name of the property it is mapped to.
    MAPPING = {
        TYPE_PERSON: 'namesMentioned',
        TYPE_ORGANIZATION: 'namesMentioned',
        TYPE_EMAIL: 'emailMentioned',
        TYPE_PHONE: 'phoneMentioned',
        TYPE_LOCATION: 'locationMentioned',
        TYPE_IP: 'ipMentioned',
        TYPE_IBAN: 'ibanMentioned',
        TYPE_COUNTRY: 'country',
        TYPE_LANGUAGE: 'language'
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(db.Integer(),
                            db.ForeignKey('document.id'),
                            index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'))  # noqa

    @property
    def field(self):
        """Group of the registry entry for this tag's type, else ``None``."""
        registered = registry.get(self.type)
        if registered is None:
            return None
        return registered.group

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete tags matching all given filters and flush.

        At least one argument has to be truthy (checked by ``assert``);
        arguments that are ``None`` do not restrict the deletion.
        """
        doomed = db.session.query(cls)
        assert document_id or origin or type
        restrictions = (
            (cls.document_id, document_id),
            (cls.origin, origin),
            (cls.type, type),
        )
        for column, wanted in restrictions:
            if wanted is not None:
                doomed = doomed.filter(column == wanted)
        doomed.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.text)
Пример #4
0
class EntityAsset(Entity):
    """Entity subtype for assets, adding valuation columns."""
    # Schema identifier, reused as the polymorphic identity of this mapper.
    _schema = 'entity/asset.json#'
    __mapper_args__ = {'polymorphic_identity': _schema}

    # All valuation fields are optional.
    valuation = db.Column(db.Integer, nullable=True)
    valuation_currency = db.Column(db.Unicode(100), nullable=True)
    # Stored as text rather than as a date/time column.
    valuation_date = db.Column(db.Unicode, nullable=True)
Пример #5
0
class Link(db.Model, UuidModel, SoftDeleteModel):
    """A typed link between a source and a target id within a collection."""

    type = db.Column(db.String(255), index=True)
    source_id = db.Column(db.String(254), index=True)
    target_id = db.Column(db.String(254), index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)

    collection_id = db.Column(
        db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(
        Collection, backref=db.backref('links', lazy='dynamic'))  # noqa

    @property
    def schema(self):
        """Schema object registered in ``schemata`` for this link's type."""
        return schemata.get(self.type)

    def to_dict(self):
        """Serialise the link on top of the parent class representation."""
        serialized = super(Link, self).to_dict()
        extras = {
            'schema': self.type,
            'data': self.data,
            # A missing array is reported as an empty list.
            'foreign_ids': self.foreign_ids or [],
            'collection_id': self.collection_id
        }
        serialized.update(extras)
        return serialized

    def __repr__(self):
        return '<Link(%r, %r, %r)>' % (self.id, self.source_id, self.target_id)
Пример #6
0
class Selector(db.Model):
    """A text selector of an entity, stored raw and in normalized form."""

    id = db.Column(db.Integer, primary_key=True)
    # Raw text lives in the ``text`` column; access goes through ``text``.
    _text = db.Column('text', db.Unicode, index=True)
    normalized = db.Column(db.Unicode, index=True)

    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(
        db.DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    entity_id = db.Column(db.Unicode(50), db.ForeignKey('entity.id'))
    entity = db.relationship(
        'Entity',
        backref=db.backref('selectors',
                           lazy='dynamic',
                           cascade='all, delete-orphan'))  # noqa

    @hybrid_property
    def text(self):
        """The raw selector text."""
        return self._text

    @text.setter
    def text(self, value):
        # Keep the normalized column in sync with every assignment.
        self._text = value
        self.normalized = self.normalize(value)

    @classmethod
    def normalize(cls, text):
        """Delegate to the module-level ``normalize`` function."""
        return normalize(text)

    def __repr__(self):
        return '<Selector(%r, %r)>' % (self.entity_id, self.text)

    def __unicode__(self):
        return self.text
Пример #7
0
class DocumentPage(db.Model):
    """One numbered page of text belonging to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'))  # noqa

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)

    def text_parts(self):
        """Yield the page text, unless it is missing or empty."""
        content = self.text
        if content is None or not len(content):
            return
        yield content

    def to_dict(self):
        """Return the page columns as a plain dictionary."""
        page = {}
        page['id'] = self.id
        page['number'] = self.number
        page['text'] = self.text
        page['document_id'] = self.document_id
        return page
Пример #8
0
class DocumentPage(db.Model):
    """One numbered page of text belonging to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(),
                            db.ForeignKey('document.id'),
                            index=True)  # noqa
    document = db.relationship(
        Document, backref=db.backref('pages',
                                     cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Hex SHA-1 digest over the document id followed by the page id.

        Both ids are rendered with ``str`` and encoded to bytes before
        hashing: hashlib rejects text input on Python 3, while on Python 2
        the encoded value is the same byte string as before, so existing
        digests do not change.
        """
        tid = sha1(str(self.document_id).encode('utf-8'))
        tid.update(str(self.id).encode('utf-8'))
        return tid.hexdigest()

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)

    def text_parts(self):
        """Utility method to get all text snippets in a record."""
        # ``string_value`` is used only as a presence check; the stored
        # text itself (not the converted value) is what gets yielded.
        text = string_value(self.text)
        if text is not None:
            yield self.text

    def to_dict(self):
        """Return the page columns as a plain dictionary."""
        return {
            'id': self.id,
            'number': self.number,
            'text': self.text,
            'document_id': self.document_id
        }
Пример #9
0
class ProcessingLog(db.Model, DatedModel):
    """Report any events or errors during processing of documents."""

    id = db.Column(db.BigInteger, primary_key=True)
    operation = db.Column(db.Unicode, nullable=True, index=True)
    component = db.Column(db.Unicode, nullable=True, index=True)
    source_location = db.Column(db.Unicode, nullable=True, index=True)
    content_hash = db.Column(db.Unicode(65), nullable=True, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    source_id = db.Column(db.Integer, nullable=True)
    document_id = db.Column(db.BigInteger, nullable=True)
    meta = db.Column(JSONB)
    error_type = db.Column(db.Unicode, nullable=True)
    error_message = db.Column(db.Unicode, nullable=True)
    error_details = db.Column(db.Unicode, nullable=True)

    @classmethod
    def log(cls,
            operation,
            component=None,
            source_location=None,
            content_hash=None,
            foreign_id=None,
            source_id=None,
            document_id=None,
            meta=None,
            error_type=None,
            error_message=None,
            error_details=None):
        """Write one log record and commit it in a dedicated session.

        ``source_location``, ``content_hash`` and ``foreign_id`` fall back
        to values found in ``meta`` when not passed explicitly. The record
        is committed through a freshly created scoped session rather than
        ``db.session``. Exceptions raised while adding or committing
        propagate to the caller; the session is removed either way.
        """
        meta = meta or {}
        obj = ProcessingLog()
        obj.operation = operation
        obj.component = component
        obj.source_id = source_id
        obj.document_id = document_id
        obj.meta = meta

        # First non-empty candidate wins: explicit argument, then the
        # path, URL and file name entries of ``meta``.
        source_location = source_location or meta.get('source_path') or \
            meta.get('source_url') or meta.get('file_name')
        obj.source_location = source_location
        obj.content_hash = content_hash or meta.get('content_hash')
        obj.foreign_id = foreign_id or meta.get('foreign_id')

        obj.error_type = error_type
        obj.error_message = error_message
        obj.error_details = error_details

        session = db.create_scoped_session()
        try:
            session.add(obj)
            session.commit()
        finally:
            # Always release the session, even when the commit fails.
            session.remove()

    def __repr__(self):
        return '<ProcessingLog(%r,%r)>' % (self.id, self.content_hash)

    def __unicode__(self):
        # ``__unicode__`` has to return text; the id itself is an integer.
        return u'%s' % self.id
Пример #10
0
 def by_foreign_ids(cls, foreign_ids, collection_id, deleted=False):
     """Return the first entity in a collection holding all given ids.

     Returns ``None`` right away for an empty ``foreign_ids``. Rows with
     no ``deleted_at`` sort first, followed by the latest deletions.
     """
     if len(foreign_ids) == 0:
         return None
     candidates = cls.all(deleted=deleted).filter(
         Entity.collection_id == collection_id)
     wanted = func.cast(foreign_ids, ARRAY(db.Unicode()))
     candidates = candidates.filter(cls.foreign_ids.contains(wanted))
     ordering = Entity.deleted_at.desc().nullsfirst()
     return candidates.order_by(ordering).first()
Пример #11
0
 def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
     """Return the first entity in a collection carrying ``foreign_id``.

     The id is passed through ``string_value`` first; ``None`` is returned
     when that yields ``None``. Rows with no ``deleted_at`` sort first,
     followed by the latest deletions.
     """
     needle = string_value(foreign_id)
     if needle is None:
         return None
     candidates = cls.all(deleted=deleted).filter(
         Entity.collection_id == collection_id)
     # Wrap the single id in a one-element array for the containment test.
     wanted = func.cast([needle], ARRAY(db.Unicode()))
     candidates = candidates.filter(cls.foreign_ids.contains(wanted))
     ordering = Entity.deleted_at.desc().nullsfirst()
     return candidates.order_by(ordering).first()
Пример #12
0
class EntityTag(db.Model):
    """Ties an entity to a package id within a named collection."""

    id = db.Column(db.Integer(), primary_key=True)
    collection = db.Column(db.Unicode(100))
    package_id = db.Column(db.Unicode(100))

    entity_id = db.Column(db.Unicode(50), db.ForeignKey('entity.id'))
    entity = db.relationship(
        Entity, backref=db.backref('tags', lazy='dynamic'))

    created_at = db.Column(db.DateTime, default=datetime.utcnow)

    @classmethod
    def delete_set(cls, collection, package_id):
        """Bulk-delete every tag of the given collection and package."""
        doomed = db.session.query(cls)
        doomed = doomed.filter_by(collection=collection) \
            .filter_by(package_id=package_id)
        doomed.delete()

    @classmethod
    def by_package(cls, collection, package_id):
        """List the tagged entities of a package as plain dictionaries.

        Each dictionary carries the entity id (under both ``id`` and
        ``entity``), its label, category and list id.
        """
        tag = aliased(cls)
        entity = aliased(Entity)
        rows = db.session.query(tag.entity_id, entity.label,
                                entity.category, entity.list_id)
        rows = rows.join(entity, entity.id == tag.entity_id)
        rows = rows.filter(tag.collection == collection)
        rows = rows.filter(tag.package_id == package_id)
        return [{
            'id': entity_id,
            'entity': entity_id,
            'label': label,
            'category': category,
            'list': list_id
        } for entity_id, label, category, list_id in rows.all()]

    def __repr__(self):
        return '<EntityTag(%r, %r)>' % (self.package_id, self.entity_id)
Пример #13
0
class DocumentTag(db.Model, IdModel):
    """An entity or tag that was extracted from a document."""

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    key = db.Column(db.Unicode(1024), nullable=False, index=True)
    text = db.Column(db.Unicode(1024), nullable=True)

    document_id = db.Column(
        db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'))  # noqa

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete tags matching all given filters and flush.

        At least one argument has to be truthy (checked by ``assert``);
        arguments that are ``None`` do not restrict the deletion.
        """
        selection = db.session.query(cls)
        assert document_id or origin or type
        for column, expected in ((cls.document_id, document_id),
                                 (cls.origin, origin),
                                 (cls.type, type)):
            if expected is None:
                continue
            selection = selection.filter(column == expected)
        selection.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.key)
Пример #14
0
class DocumentPage(db.Model):
    """One numbered page of text belonging to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'))  # noqa

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)

    def to_dict(self):
        """Return the page columns as a plain dictionary."""
        return dict([
            ('id', self.id),
            ('number', self.number),
            ('text', self.text),
            ('document_id', self.document_id),
        ])
Пример #15
0
 def find(cls,
          label=None,
          category=[],
          countries=[],
          managed=None,
          collection_id=None):
     """Build a query over non-deleted rows narrowed by the given filters.

     ``label`` matches case-insensitively as a substring once stripped.
     ``collection_id`` is handed to ``cls.id.in_`` unconditionally, so it
     is expected to be a collection of ids; the ``None`` default is not
     special-cased. The query itself is returned, not its results.
     """
     query = db.session.query(cls).filter(cls.deleted_at == None)  # noqa

     needle = label.strip() if label else ''
     if len(needle):
         query = query.filter(cls.label.ilike('%%%s%%' % needle))

     query = query.filter(cls.id.in_(collection_id))

     if len(category):
         query = query.filter(cls.category.in_(category))

     if len(countries):
         wanted = cast(countries, ARRAY(db.Unicode()))
         query = query.filter(cls.countries.contains(wanted))

     if managed is None:
         return query
     return query.filter(cls.managed == managed)
Пример #16
0
class Role(db.Model, IdModel, SoftDeleteModel):
    """A user, group or other access control subject."""
    __tablename__ = 'role'

    # Allowed values of the ``type`` column.
    USER = '******'
    GROUP = 'group'
    SYSTEM = 'system'
    TYPES = [USER, GROUP, SYSTEM]

    # Foreign ids looked up by ``public_roles``.
    SYSTEM_GUEST = 'guest'
    SYSTEM_USER = '******'

    #: Generates URL-safe signatures for invitations.
    SIGNATURE = URLSafeTimedSerializer(settings.SECRET_KEY)

    #: Signature maximum age, defaults to 1 day
    SIGNATURE_MAX_AGE = 60 * 60 * 24

    #: Password minimum length
    PASSWORD_MIN_LENGTH = 6

    foreign_id = db.Column(db.Unicode(2048), nullable=False, unique=True)
    name = db.Column(db.Unicode, nullable=False)
    email = db.Column(db.Unicode, nullable=True)
    api_key = db.Column(db.Unicode, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    type = db.Column(db.Enum(*TYPES, name='role_type'), nullable=False)
    password_digest = db.Column(db.Unicode, nullable=True)
    # Plain class attribute, not a column; only the digest is persisted.
    password = None
    reset_token = db.Column(db.Unicode, nullable=True)
    permissions = db.relationship('Permission', backref='role')

    @property
    def has_password(self):
        """Whether a password digest has been stored for this role."""
        return self.password_digest is not None

    def update(self, data):
        """Apply ``name`` and, when given and truthy, ``password``."""
        self.name = data.get('name', self.name)
        if data.get('password'):
            self.set_password(data.get('password'))

    def clear_roles(self):
        """Removes any existing roles from group membership."""
        self.roles = []
        db.session.add(self)

    def add_role(self, role):
        """Adds an existing role as a membership of a group."""
        self.roles.append(role)
        db.session.add(role)
        db.session.add(self)

    @classmethod
    def notifiable(cls):
        """Narrow ``cls.all_ids()`` to roles that have an email set."""
        return cls.all_ids().filter(cls.email != None)  # noqa

    @classmethod
    def by_foreign_id(cls, foreign_id):
        """Return the first role with this foreign id, or ``None``."""
        if foreign_id is not None:
            return cls.all().filter_by(foreign_id=foreign_id).first()

    @classmethod
    def by_email(cls, email):
        """Return a query of roles with this email, or ``None`` if falsy.

        Unlike ``by_foreign_id`` and ``by_api_key`` this returns the
        filtered query itself rather than the first result.
        """
        if email:
            return cls.all().filter_by(email=email)

    @classmethod
    def by_api_key(cls, api_key):
        """Return the first role with this API key, or ``None``."""
        if api_key is not None:
            return cls.all().filter_by(api_key=api_key).first()

    @classmethod
    def load_or_create(cls, foreign_id, type, name, email=None, is_admin=None):
        """Fetch the role with ``foreign_id`` or create it, then flush.

        ``name`` and ``type`` are only applied to newly created roles.
        ``email`` is overwritten on every call, also for existing roles,
        and a missing API key is generated.
        """
        role = cls.by_foreign_id(foreign_id)

        if role is None:
            role = cls()
            role.foreign_id = foreign_id
            role.name = name
            role.type = type
            role.is_admin = False

        if role.api_key is None:
            role.api_key = make_textid()

        role.email = email
        if is_admin is not None:
            role.is_admin = is_admin

        # see: https://github.com/alephdata/aleph/issues/111
        # Emails listed in settings.ADMINS are promoted to admin,
        # compared case-insensitively.
        auto_admins = [a.lower() for a in settings.ADMINS]
        if email is not None and email.lower() in auto_admins:
            role.is_admin = True

        db.session.add(role)
        db.session.flush()

        return role

    @classmethod
    def load_id(cls, foreign_id, type=None, name=None):
        """Load a role and return the ID.

        If type is given and no role is found, a new role will be created.
        """
        # Resolved ids are cached on the application object.
        if not hasattr(current_app, '_authz_roles'):
            current_app._authz_roles = {}
        if foreign_id not in current_app._authz_roles:
            role = cls.by_foreign_id(foreign_id)
            if role is None:
                if type is None:
                    # Unknown role and nothing to create: returns None
                    # without caching the miss.
                    return
                name = name or foreign_id
                role = cls.load_or_create(foreign_id, type, name)
            current_app._authz_roles[foreign_id] = role.id
        return current_app._authz_roles[foreign_id]

    @classmethod
    def public_roles(cls):
        """Roles which make a collection to be considered public."""
        # ``load_id`` is called without a type here, so an entry is None
        # when the corresponding role does not exist.
        return set([
            cls.load_id(cls.SYSTEM_USER),
            cls.load_id(cls.SYSTEM_GUEST),
        ])

    @classmethod
    def by_prefix(cls, prefix):
        """Query user roles matching a name, email address, or foreign_id.

        Despite the name, the match is a case-insensitive substring match:
        ``prefix`` is wrapped in ``%`` on both sides. The query is
        returned, not a list of results.

        :param str prefix: Text to look for.
        """
        q = cls.all()
        q = q.filter(Role.type == Role.USER)
        q = q.filter(
            or_(cls.foreign_id.ilike('%' + prefix + '%'),
                cls.email.ilike('%' + prefix + '%'),
                cls.name.ilike('%' + prefix + '%')))
        return q

    @classmethod
    def all_groups(cls):
        """Query every role whose type is not ``USER``."""
        return cls.all().filter(Role.type != Role.USER)

    def set_password(self, secret):
        """Hashes and sets the role password.

        :param str secret: The password to be set.
        """
        self.password_digest = generate_password_hash(secret)

    def check_password(self, secret):
        """Checks the password if it matches the role password hash.

        :param str secret: The password to be checked.
        :rtype: bool
        """
        # A missing digest is checked as an empty string.
        return check_password_hash(self.password_digest or '', secret)

    def __repr__(self):
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)
Пример #17
0
class Entity(db.Model, UuidModel, SoftDeleteModel):
    """A named, typed entity that belongs to a collection."""
    # Values used for the ``state`` column.
    STATE_ACTIVE = 'active'
    STATE_PENDING = 'pending'
    STATE_DELETED = 'deleted'

    name = db.Column(db.Unicode)
    type = db.Column(db.String(255), index=True)
    state = db.Column(db.String(128),
                      nullable=True,
                      default=STATE_ACTIVE,
                      index=True)  # noqa
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)

    collection_id = db.Column(db.Integer,
                              db.ForeignKey('collection.id'),
                              index=True)  # noqa
    collection = db.relationship(Collection,
                                 backref=db.backref('entities',
                                                    lazy='dynamic'))  # noqa

    def delete_references(self, origin=None):
        """Bulk-delete references to this entity, optionally by origin."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.entity_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_identities(self):
        """Bulk-delete the identities attached to this entity."""
        pq = db.session.query(EntityIdentity)
        pq = pq.filter(EntityIdentity.entity_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Delete dependants, mark the entity deleted and soft-delete it.

        ``deleted_at`` defaults to the current UTC time and is also
        passed to every alert in ``self.alerts``.
        """
        self.delete_references()
        self.delete_identities()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        self.state = self.STATE_DELETED
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_dangling(cls, collection_id):
        """Delete dangling entities.

        Entities can dangle in pending state while they have no references
        pointing to them, thus making it impossible to enable them. This is
        a routine cleanup function.
        """
        q = db.session.query(cls)
        q = q.filter(cls.collection_id == collection_id)
        q = q.filter(cls.state == cls.STATE_PENDING)
        # Keep only entities with zero joined references.
        q = q.outerjoin(Reference)
        q = q.group_by(cls)
        q = q.having(func.count(Reference.id) == 0)
        for entity in q.all():
            entity.delete()

    def merge(self, other):
        """Fold ``other`` into this entity, delete ``other`` and commit.

        Raises ``ValueError`` when both are the same entity or belong to
        different collections. Alerts and references pointing at
        ``other`` are re-pointed to this entity.
        """
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError(
                "Cannot merge entities from different collections.")  # noqa

        data = merge_data(self.data, other.data)
        # Keep the other name as an alias when it differs ignoring case.
        if self.name.lower() != other.name.lower():
            data = merge_data(data, {'alias': [other.name]})

        self.data = data
        self.state = self.STATE_ACTIVE
        self.foreign_ids = self.foreign_ids or []
        # NOTE(review): ``+=`` extends the existing list object in place;
        # presumably the ORM only notices this if mutation tracking is set
        # up for the ARRAY column -- confirm the merged ids are persisted.
        self.foreign_ids += other.foreign_ids or []
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()

        # update alerts
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({'entity_id': self.id})

        # update document references
        from aleph.model.reference import Reference
        q = db.session.query(Reference).filter(Reference.entity_id == other.id)
        q.update({'entity_id': self.id})

        # delete source entities
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        """Apply a dict of submitted values and stage the entity.

        The name is validated together with ``data`` through the schema
        and then moved out into the ``name`` column. Foreign ids are
        converted with ``string_value`` and de-duplicated. Note that
        ``state`` is popped from, and so removed from, the passed dict.
        """
        data = entity.get('data') or {}
        data['name'] = entity.get('name')
        self.data = self.schema.validate(data)
        self.name = self.data.pop('name')
        fid = [string_value(f) for f in entity.get('foreign_ids') or []]
        self.foreign_ids = list(set([f for f in fid if f is not None]))
        self.state = entity.pop('state', self.STATE_ACTIVE)
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def save(cls, data, collection, merge=False):
        """Create or update an entity from ``data`` in ``collection``.

        An existing entity is looked up by ``data['id']``; otherwise a new
        one is created, which requires a ``schema`` key (popped from
        ``data``). Raises ``ValueError`` when the schema or the collection
        is missing. With ``merge`` set, ``data`` is merged with the
        entity's current dict form before being applied.
        """
        ent = cls.by_id(data.get('id'))
        if ent is None:
            ent = cls()
            ent.type = data.pop('schema', None)
            if ent.type is None:
                raise ValueError("No schema provided.")
            ent.id = make_textid()

        if merge:
            data = merge_data(data, ent.to_dict())

        if collection is None:
            raise ValueError("No collection specified.")

        ent.collection = collection
        ent.update(data)
        return ent

    @classmethod
    def filter_collections(cls, q, collections=None):
        """Restrict ``q`` to the given collections, if any are given.

        ``collections`` may mix ``Collection`` objects and raw ids;
        ``None`` leaves the query untouched.
        """
        if collections is None:
            return q
        collection_ids = []
        for collection in collections:
            if isinstance(collection, Collection):
                collection = collection.id
            collection_ids.append(collection)
        q = q.filter(Entity.collection_id.in_(collection_ids))
        return q

    @classmethod
    def by_id_set(cls, ids, collections=None):
        """Return a dict of id -> entity for the given ids.

        An empty ``ids`` short-circuits to an empty dict. The collection
        of each entity is loaded eagerly.
        """
        if not len(ids):
            return {}
        q = cls.all()
        q = cls.filter_collections(q, collections=collections)
        q = q.options(joinedload('collection'))
        q = q.filter(cls.id.in_(ids))
        entities = {}
        for ent in q:
            entities[ent.id] = ent
        return entities

    @classmethod
    def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
        """Return the first entity in a collection carrying ``foreign_id``.

        Returns ``None`` when ``string_value`` yields ``None`` for the id.
        Rows with no ``deleted_at`` sort first, then the latest deletions.
        """
        foreign_id = string_value(foreign_id)
        if foreign_id is None:
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        # Wrap the single id in a one-element array for the containment test.
        foreign_id = func.cast([foreign_id], ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def latest(cls):
        """Return the newest ``updated_at`` among active entities."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.state == cls.STATE_ACTIVE)
        return q.scalar()

    @property
    def schema(self):
        """Schema object registered in ``schemata`` for this entity type."""
        return schemata.get(self.type)

    @property
    def terms(self):
        """Set holding the name plus every non-empty alias from ``data``."""
        terms = set([self.name])
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        """Normalized terms that do not contain another, shorter term."""
        # This is to find the shortest possible regex for each entity.
        # If, for example, an entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([normalize_strong(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            # Skip missing terms and those shorter than 4 or longer than
            # 120 characters.
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def to_dict(self):
        """Serialise the entity on top of the parent class representation."""
        data = super(Entity, self).to_dict()
        data.update({
            'schema': self.type,
            'name': self.name,
            'state': self.state,
            'data': self.data,
            'foreign_ids': self.foreign_ids or [],
            'collection_id': self.collection_id
        })
        return data

    def to_index(self):
        """Return ``to_dict()`` plus a ``properties`` dict of value lists.

        Every entry of ``data`` is turned into a list via ``ensure_list``
        and empty lists are left out; the name is always included.
        """
        entity = self.to_dict()
        entity['properties'] = {'name': [self.name]}
        for k, v in self.data.items():
            v = ensure_list(v)
            if len(v):
                entity['properties'][k] = v
        return entity

    def to_ref(self):
        """Return a short reference dict: id, label, schema, collection."""
        return {
            'id': self.id,
            'label': self.name,
            'schema': self.type,
            'collection_id': self.collection_id
        }

    def __unicode__(self):
        return self.name

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)
Пример #18
0
class Role(db.Model, IdModel, SoftDeleteModel):
    """A user, group or other access control subject."""

    # Schema that ``update`` validates incoming data against.
    _schema = 'role.json#'
    __tablename__ = 'role'

    # Allowed values of the ``type`` column.
    USER = '******'
    GROUP = 'group'
    SYSTEM = 'system'
    TYPES = [USER, GROUP, SYSTEM]

    SYSTEM_GUEST = 'guest'
    SYSTEM_USER = '******'

    foreign_id = db.Column(db.Unicode(2048), nullable=False, unique=True)
    name = db.Column(db.Unicode, nullable=False)
    email = db.Column(db.Unicode, nullable=True)
    api_key = db.Column(db.Unicode, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    type = db.Column(db.Enum(*TYPES, name='role_type'), nullable=False)
    permissions = db.relationship("Permission", backref="role")

    def update(self, data):
        """Validate ``data`` and apply its ``name`` and ``email``."""
        validate(data, self._schema)
        self.name = data.get('name', self.name)
        self.email = data.get('email', self.email)

    def clear_roles(self):
        """Empty ``self.roles`` and stage this role in the session."""
        self.roles = []
        db.session.add(self)

    def add_role(self, role):
        """Append ``role`` to ``self.roles`` and stage both roles."""
        self.roles.append(role)
        db.session.add(role)
        db.session.add(self)

    @classmethod
    def notifiable(cls):
        """Narrow ``cls.all_ids()`` to roles that have an email set."""
        return cls.all_ids().filter(cls.email != None)  # noqa

    @classmethod
    def by_foreign_id(cls, foreign_id):
        """Return the first role with this foreign id, or ``None``."""
        if foreign_id is not None:
            return cls.all().filter_by(foreign_id=foreign_id).first()

    @classmethod
    def by_api_key(cls, api_key):
        """Return the first role with this API key, or ``None``."""
        if api_key is not None:
            return cls.all().filter_by(api_key=api_key).first()

    @classmethod
    def load_or_create(cls, foreign_id, type, name, email=None, is_admin=None):
        """Fetch the role with ``foreign_id`` or create it, then flush.

        ``name`` and ``type`` are only applied to newly created roles.
        ``email`` is overwritten on every call, also for existing roles,
        and a missing API key is generated. A role whose email is listed
        in the ``AUTHZ_ADMINS`` setting (comma-separated, compared
        case-insensitively) is promoted to admin.
        """
        role = cls.by_foreign_id(foreign_id)
        if role is None:
            role = cls()
            role.foreign_id = foreign_id
            role.name = name
            role.type = type
            role.is_admin = False

        if role.api_key is None:
            role.api_key = uuid4().hex

        role.email = email
        if is_admin is not None:
            role.is_admin = is_admin

        # see: https://github.com/pudo/aleph/issues/111
        # Strip each configured address and drop blank entries: splitting
        # an unset or empty setting yields [''], which would otherwise
        # promote any role with an empty email to admin.
        configured = get_config('AUTHZ_ADMINS') or ''
        auto_admins = set()
        for entry in configured.split(','):
            entry = entry.strip().lower()
            if entry:
                auto_admins.add(entry)
        if email is not None and email.lower() in auto_admins:
            role.is_admin = True

        db.session.add(role)
        db.session.flush()
        return role

    @classmethod
    def load_id(cls, foreign_id, type=None, name=None):
        """Load a role and return the ID.

        If type is given and no role is found, a new role will be created.
        """
        # Resolved ids are cached on the application object.
        if not hasattr(current_app, '_authz_roles'):
            current_app._authz_roles = {}
        if foreign_id not in current_app._authz_roles:
            role = cls.by_foreign_id(foreign_id)
            if role is None:
                if type is None:
                    # Unknown role and nothing to create: returns None
                    # without caching the miss.
                    return
                name = name or foreign_id
                role = cls.load_or_create(foreign_id, type, name)
            current_app._authz_roles[foreign_id] = role.id
        return current_app._authz_roles[foreign_id]

    def __repr__(self):
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)

    def __unicode__(self):
        return self.name

    def to_dict(self):
        """Serialise the role on top of the parent class representation."""
        data = super(Role, self).to_dict()
        data.update({
            'api_url': url_for('roles_api.view', id=self.id),
            'foreign_id': self.foreign_id,
            'is_admin': self.is_admin,
            'email': self.email,
            'name': self.name,
            'type': self.type
        })
        return data
Пример #19
0
class Export(db.Model, IdModel, DatedModel):
    """A data export run in the background. The data is stored in a cloud
    storage bucket and the user is given a link to download the data. The link
    expires after a fixed duration and the exported data is deleted."""

    DEFAULT_EXPIRATION = timedelta(days=30)  # After 30 days

    label = db.Column(db.Unicode)
    operation = db.Column(db.Unicode)
    creator_id = db.Column(db.Integer, db.ForeignKey("role.id"))
    creator = db.relationship(Role,
                              backref=db.backref("exports", lazy="dynamic"))
    collection_id = db.Column(db.Integer,
                              db.ForeignKey("collection.id"),
                              index=True,
                              nullable=True)
    collection = db.relationship(Collection,
                                 backref=db.backref("exports", lazy="dynamic"))

    expires_at = db.Column(db.DateTime, default=None, nullable=True)
    deleted = db.Column(db.Boolean, default=False)
    # The attribute is called ``status`` but is stored in "export_status".
    status = db.Column("export_status", db.Unicode, default=Status.DEFAULT)

    content_hash = db.Column(db.Unicode(65), index=True, nullable=True)
    file_size = db.Column(db.BigInteger, nullable=True)  # In bytes
    file_name = db.Column(db.Unicode, nullable=True)
    mime_type = db.Column(db.Unicode)
    # A callable default gives every row its own dict; a literal ``{}`` would
    # be one mutable object shared by all instances that rely on the default.
    meta = db.Column(JSONB, default=dict)

    def to_dict(self):
        """Serialise the export, starting from the date fields.

        ``status`` is emitted as the label looked up in ``Status.LABEL``
        (``None`` if the stored status has no label).
        """
        data = self.to_dict_dates()
        data.update({
            "id": stringify(self.id),
            "label": self.label,
            "operation": self.operation,
            "creator_id": stringify(self.creator_id),
            "collection_id": self.collection_id,
            "expires_at": self.expires_at,
            "deleted": self.deleted,
            "status": Status.LABEL.get(self.status),
            "content_hash": self.content_hash,
            "file_size": self.file_size,
            "file_name": self.file_name,
            "mime_type": self.mime_type,
            "meta": self.meta,
        })
        return data

    @classmethod
    def create(cls,
               operation,
               role_id,
               label,
               collection=None,
               mime_type=None,
               meta=None):
        """Create a new export and add it to the session (without flushing).

        The export expires ``DEFAULT_EXPIRATION`` after the current UTC time.
        A missing or empty ``meta`` is stored as an empty dict.
        """
        export = cls()
        export.creator_id = role_id
        export.operation = operation
        export.label = label
        if collection is not None:
            export.collection_id = collection.id
        export.mime_type = mime_type
        export.expires_at = datetime.utcnow() + cls.DEFAULT_EXPIRATION
        export.meta = meta or {}
        db.session.add(export)
        return export

    @property
    def namespace(self):
        """Key derived from the creator's role ID via ``make_key``."""
        return make_key("role", self.creator_id)

    def set_status(self, status):
        """Set the export status and add the export to the session."""
        self.status = status
        db.session.add(self)

    def should_delete_publication(self):
        """Check whether the published export should be deleted from the archive

        Since we store exports by contenthash, there may be other non-expired exports
        that point to the same file in the archive"""
        q = (Export.all().filter(
            Export.content_hash == self.content_hash).filter(
                Export.deleted.isnot(True)).filter(Export.id != self.id))
        return q.first() is None

    @classmethod
    def get_expired(cls, deleted=False):
        """Query exports whose ``expires_at`` is at or before the current UTC time.

        Exports flagged as deleted are excluded unless ``deleted`` is truthy.
        """
        now = datetime.utcnow()
        q = cls.all()
        q = q.filter(cls.expires_at <= now)
        if not deleted:
            # Compare against False explicitly (as the sibling finders do) so
            # that passing ``deleted=None`` does not turn into an IS NULL test.
            q = q.filter(cls.deleted == False)  # noqa
        return q

    @classmethod
    def get_pending(cls):
        """Query non-deleted exports whose status is ``Status.PENDING``."""
        q = cls.all()
        q = q.filter(cls.status == Status.PENDING)
        q = q.filter(cls.deleted == False)  # noqa
        return q

    @classmethod
    def by_id(cls, id, role_id=None, deleted=False):
        """Return the export with the given ID, or ``None``.

        Optionally restricted to exports created by ``role_id``. Deleted
        exports are excluded unless ``deleted`` is truthy.
        """
        q = cls.all().filter_by(id=id)
        if role_id is not None:
            q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q.first()

    @classmethod
    def by_role_id(cls, role_id, deleted=False):
        """Query the exports created by a role, newest first.

        Deleted exports are excluded unless ``deleted`` is truthy.
        """
        q = cls.all()
        q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        q = q.order_by(cls.created_at.desc())
        return q

    @classmethod
    def by_content_hash(cls, content_hash, deleted=False):
        """Query the exports that share the given content hash.

        Deleted exports are excluded unless ``deleted`` is truthy.
        """
        q = cls.all()
        q = q.filter(cls.content_hash == content_hash)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q

    def __repr__(self):
        return "<Export(%r, %r, %r)>" % (self.id, self.creator_id, self.label)
Пример #20
0
class Document(db.Model, DatedModel):
    """A document (file, folder or table) stored inside a collection.

    Descriptive metadata lives in the ``meta`` JSONB column; the entity
    schema name is kept in ``schema``.
    """
    SCHEMA = 'Document'
    SCHEMA_FOLDER = 'Folder'
    SCHEMA_TABLE = 'Table'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=True, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True, index=True)
    schema = db.Column(db.String(255), nullable=False)
    # A callable default gives every row its own dict; a literal ``{}`` would
    # be one mutable object shared by all instances that rely on the default.
    meta = db.Column(JSONB, default=dict)

    uploader_id = db.Column(db.Integer,
                            db.ForeignKey('role.id'),
                            nullable=True)  # noqa
    parent_id = db.Column(db.BigInteger,
                          db.ForeignKey('document.id'),
                          nullable=True,
                          index=True)  # noqa
    collection_id = db.Column(db.Integer,
                              db.ForeignKey('collection.id'),
                              nullable=False,
                              index=True)  # noqa
    collection = db.relationship(Collection,
                                 backref=db.backref('documents',
                                                    lazy='dynamic'))  # noqa

    def __init__(self, **kw):
        # Start every new instance with its own metadata dict before the
        # base constructor applies any keyword arguments.
        self.meta = {}
        super(Document, self).__init__(**kw)

    @property
    def model(self):
        """The schema object registered under this document's schema name."""
        return model.get(self.schema)

    @property
    def ancestors(self):
        """IDs of all parent documents, outermost first.

        Uses the cached list for this document if present; otherwise starts
        from the parent's cached list (or computes it recursively from the
        database) and appends the direct parent. The result is only written
        to the cache when this document is a folder.
        """
        if self.parent_id is None:
            return []
        key = cache.key('ancestors', self.id)
        ancestors = cache.get_list(key)
        if ancestors:
            return ancestors
        parent_key = cache.key('ancestors', self.parent_id)
        ancestors = cache.get_list(parent_key)
        if not ancestors:
            ancestors = []
            parent = Document.by_id(self.parent_id)
            if parent is not None:
                ancestors = parent.ancestors
        ancestors.append(self.parent_id)
        if self.model.is_a(model.get(self.SCHEMA_FOLDER)):
            cache.set_list(key, ancestors, expire=cache.EXPIRE)
        return ancestors

    def update(self, data):
        """Copy the known metadata properties from ``data`` into ``meta``.

        Properties absent from ``data`` keep their current value (or become
        ``None`` if they were never set).
        """
        props = ('title', 'summary', 'author', 'crawler', 'source_url',
                 'file_name', 'mime_type', 'headers', 'date', 'authored_at',
                 'modified_at', 'published_at', 'retrieved_at', 'languages',
                 'countries', 'keywords')
        for prop in props:
            self.meta[prop] = data.get(prop, self.meta.get(prop))
        # In-place dict changes are not tracked automatically, so flag the
        # column as modified explicitly.
        flag_modified(self, 'meta')

    def delete(self, deleted_at=None):
        """Delete the document from the session; ``deleted_at`` is unused."""
        db.session.delete(self)

    @classmethod
    def delete_by_collection(cls, collection_id):
        """Bulk-delete every document belonging to the given collection."""
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    @classmethod
    def save(cls,
             collection,
             parent=None,
             foreign_id=None,
             content_hash=None,
             meta=None,
             uploader_id=None):
        """Find a matching document in the collection or create a new one.

        The lookup is scoped to ``collection`` (and ``parent`` when given)
        and matches on ``foreign_id`` if provided, otherwise on
        ``content_hash``. The document's content hash is always overwritten
        with ``content_hash``; a document without a content hash is marked
        as a folder. The document is added to the session and returned.

        Raises:
            ValueError: if neither ``foreign_id`` nor ``content_hash`` is
                given.
        """
        q = cls.all()
        q = q.filter(Document.collection_id == collection.id)

        if parent is not None:
            q = q.filter(Document.parent_id == parent.id)
        if foreign_id is not None:
            q = q.filter(Document.foreign_id == foreign_id)
        elif content_hash is not None:
            q = q.filter(Document.content_hash == content_hash)
        else:
            raise ValueError("No unique criterion for document.")

        document = q.first()
        if document is None:
            document = cls()
            document.schema = cls.SCHEMA
            document.collection_id = collection.id
            document.uploader_id = uploader_id

        if parent is not None:
            document.parent_id = parent.id

        if foreign_id is not None:
            document.foreign_id = foreign_id

        document.content_hash = content_hash
        if content_hash is None:
            document.schema = cls.SCHEMA_FOLDER

        if meta is not None:
            document.update(meta)

        db.session.add(document)
        return document

    @classmethod
    def by_id(cls, id, collection_id=None):
        """Return the document with the given ID, or ``None``.

        An ID that cannot be converted to ``int`` yields ``None``. The
        lookup may be restricted to a single collection.
        """
        try:
            id = int(id)
        except Exception:
            return
        q = cls.all()
        q = q.filter(cls.id == id)
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        return q.first()

    @classmethod
    def by_collection(cls, collection_id=None):
        """Query all documents belonging to the given collection."""
        q = cls.all()
        q = q.filter(cls.collection_id == collection_id)
        return q

    @classmethod
    def cleanup_deleted(cls):
        """Bulk-delete documents whose collection has ``deleted_at`` set."""
        q = db.session.query(Collection.id)
        q = q.filter(Collection.deleted_at != None)  # noqa
        collection_ids = [c for (c, ) in q.all()]
        if not collection_ids:
            # Nothing to clean up; avoid issuing a DELETE with an empty IN ().
            return
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id.in_(collection_ids))
        pq.delete(synchronize_session=False)

    def to_proxy(self):
        """Build an entity proxy for this document from its metadata.

        File name and MIME type fall back to the ``Content-Disposition`` and
        ``Content-Type`` headers when the metadata does not provide them.
        """
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': {}
        })
        meta = dict(self.meta)
        headers = meta.pop('headers', {}) or {}
        # Normalise header names, e.g. "Content-Type" -> "content_type".
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('crawler', meta.get('crawler'))
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('title', meta.get('title'))
        proxy.set('fileName', meta.get('file_name'))
        if not proxy.has('fileName'):
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('keywords', meta.get('keywords'))
        proxy.set('headers', registry.json.pack(headers), quiet=True)
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('indexUpdatedAt', self.created_at)
        return proxy

    def __repr__(self):
        return '<Document(%r,%r)>' % (self.id, self.schema)
Пример #21
0
class Export(db.Model, IdModel, DatedModel):
    """A data export run in the background. The data is stored in a cloud
    storage bucket and the user is given a link to download the data. The link
    expires after a fixed duration and the exported data is deleted. """

    MAX_FILE_SIZE = 10 * 1024 * 1024 * 1024  # 10 GB
    STATUS_PENDING = "pending"
    STATUS_SUCCESSFUL = "successful"
    STATUS_FAILED = "failed"
    EXPORT_STATUS = {
        STATUS_PENDING: lazy_gettext("pending"),
        STATUS_SUCCESSFUL: lazy_gettext("successful"),
        STATUS_FAILED: lazy_gettext("failed"),
    }
    DEFAULT_STATUS = STATUS_PENDING
    DEFAULT_EXPIRATION = timedelta(days=30)  # After 30 days

    label = db.Column(db.Unicode)

    operation = db.Column(db.Unicode)

    creator_id = db.Column(db.Integer, db.ForeignKey("role.id"))
    creator = db.relationship(Role,
                              backref=db.backref("exports", lazy="dynamic"))
    collection_id = db.Column(db.Integer,
                              db.ForeignKey("collection.id"),
                              index=True,
                              nullable=True)
    collection = db.relationship(Collection,
                                 backref=db.backref("exports", lazy="dynamic"))

    expires_at = db.Column(db.DateTime, default=None, nullable=True)
    deleted = db.Column(db.Boolean, default=False)
    export_status = db.Column(db.Unicode, default=DEFAULT_STATUS)

    content_hash = db.Column(db.Unicode(65), index=True, nullable=True)
    file_size = db.Column(db.BigInteger, nullable=True)  # In bytes
    file_name = db.Column(db.Unicode, nullable=True)
    mime_type = db.Column(db.Unicode)
    # A callable default gives every row its own dict; a literal ``{}`` would
    # be one mutable object shared by all instances that rely on the default.
    meta = db.Column(JSONB, default=dict)

    def to_dict(self):
        """Serialise the export, starting from the date fields.

        ``export_status`` carries the raw stored status value (not its
        translated label), which is what this method has always emitted.
        """
        data = self.to_dict_dates()
        data.update({
            "id": stringify(self.id),
            "label": self.label,
            "operation": self.operation,
            "creator_id": stringify(self.creator_id),
            "collection_id": self.collection_id,
            "expires_at": self.expires_at,
            "deleted": self.deleted,
            "export_status": self.export_status,
            "content_hash": self.content_hash,
            "file_size": self.file_size,
            "file_name": self.file_name,
            "meta": self.meta,
        })
        return data

    @classmethod
    def create(
        cls,
        operation,
        role_id,
        label,
        file_path=None,
        expires_after=None,
        collection=None,
        mime_type=None,
    ):
        """Create a new export and add it to the session (without flushing).

        When ``file_path`` is given, file name, size and content hash are
        derived from it. The export expires ``expires_after`` (falling back
        to ``DEFAULT_EXPIRATION``) after the current UTC time.
        """
        export = cls()
        export.creator_id = role_id
        export.operation = operation
        export.label = label
        if file_path is not None:
            export.set_filepath(file_path)
        if collection is not None:
            export.collection_id = collection.id
        export.mime_type = mime_type
        export.expires_at = datetime.utcnow() + (expires_after
                                                 or cls.DEFAULT_EXPIRATION)
        db.session.add(export)
        return export

    @property
    def namespace(self):
        """Key derived from the creator's role ID via ``make_key``."""
        return make_key("role", self.creator_id)

    def publish(self):
        """Rename the local file to its content hash and publish it.

        The status is set to successful when publishing succeeds and to
        failed (re-raising the error) when it does not.

        Raises:
            RuntimeError: if no file path has been set on this instance.
        """
        # ``_file_path`` only exists after ``set_filepath`` has been called
        # on this instance, so look it up defensively.
        file_path = getattr(self, "_file_path", None)
        if not file_path:
            raise RuntimeError(
                "file path not present for export: %r" % (self,))
        # Use the content hash as the file name to ensure uniqueness
        path = Path(file_path.parent, self.content_hash)
        file_path.rename(path)
        try:
            archive.publish(self.namespace, path, self.mime_type)
            self.set_status(status=Export.STATUS_SUCCESSFUL)
        except Exception:
            self.set_status(status=Export.STATUS_FAILED)
            raise

    def set_filepath(self, file_path):
        """Record the file's path, safe name, size in bytes and checksum."""
        file_path = ensure_path(file_path)
        file_name = safe_filename(file_path)
        file_size = file_path.stat().st_size
        self.file_name = file_name
        self.file_size = file_size
        self._file_path = file_path
        self.content_hash = checksum(file_path)

    def set_status(self, status):
        """Set the export status; values not in EXPORT_STATUS are ignored."""
        if status in self.EXPORT_STATUS:
            self.export_status = status
            db.session.add(self)

    def delete_publication(self):
        """Mark the export as deleted, removing the archived file if unused."""
        if self._should_delete_publication():
            archive.delete_publication(self.namespace, self.content_hash)
        self.deleted = True
        db.session.add(self)

    def _should_delete_publication(self):
        """Check whether the published export should be deleted from the archive

        Since we store exports by contenthash, there may be other non-expired exports
        that point to the same file in the archive"""
        q = (Export.all().filter(
            Export.content_hash == self.content_hash).filter(
                Export.deleted.isnot(True)).filter(Export.id != self.id))
        return q.first() is None

    @classmethod
    def get_expired(cls, deleted=False):
        """Query exports whose ``expires_at`` is set and not after the current UTC time.

        ``deleted`` filters on the deleted flag; pass ``None`` to include
        both deleted and non-deleted exports.
        """
        now = datetime.utcnow()
        q = cls.all().filter(
            cls.expires_at.isnot(None)).filter(cls.expires_at <= now)
        if deleted is not None:
            q = q.filter(cls.deleted == deleted)
        return q

    @classmethod
    def by_id(cls, id, role_id=None, deleted=False):
        """Return the export with the given ID, or ``None``.

        Optionally restricted to exports created by ``role_id``. Deleted
        exports are excluded unless ``deleted`` is truthy.
        """
        q = cls.all().filter_by(id=id)
        if role_id is not None:
            q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q.first()

    @classmethod
    def by_role_id(cls, role_id, deleted=False):
        """Query the exports created by a role, newest first.

        Deleted exports are excluded unless ``deleted`` is truthy.
        """
        q = cls.all()
        q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        q = q.order_by(cls.created_at.desc())
        return q

    def __repr__(self):
        return "<Export(%r, %r)>" % (self.id, self.creator_id)
Пример #22
0
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access control is
    enforced."""

    # Category schema for collections.
    # TODO: add extra weight info.
    # TODO: should this be configurable?
    CATEGORIES = {
        'news': lazy_gettext('News archives'),
        'leak': lazy_gettext('Leaks'),
        'land': lazy_gettext('Land registry'),
        'gazette': lazy_gettext('Gazettes'),
        'court': lazy_gettext('Court archives'),
        'company': lazy_gettext('Company registries'),
        'watchlist': lazy_gettext('Watchlists'),
        'investigation': lazy_gettext('Personal collections'),
        'sanctions': lazy_gettext('Sanctions lists'),
        'scrape': lazy_gettext('Scrapes'),
        'procurement': lazy_gettext('Procurement'),
        'grey': lazy_gettext('Grey literature'),
        'license': lazy_gettext('Licenses and concessions'),
        'regulatory': lazy_gettext('Regulatory filings'),
        'other': lazy_gettext('Other material')
    }

    # Category used when none is supplied.
    DEFAULT = 'other'

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    # Callable defaults give every row its own list; a literal ``[]`` would be
    # one mutable object shared by all instances that rely on the default.
    countries = db.Column(ARRAY(db.Unicode()), default=list)
    languages = db.Column(ARRAY(db.Unicode()), default=list)
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)
    publisher = db.Column(db.Unicode, nullable=True)
    publisher_url = db.Column(db.Unicode, nullable=True)
    info_url = db.Column(db.Unicode, nullable=True)
    data_url = db.Column(db.Unicode, nullable=True)

    # A casefile is a type of collection which is used to manage the state
    # of an investigation. Unlike normal collections, cases do not serve
    # as source material, but as a mechanism of analysis.
    casefile = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def update(self, data, creator=None):
        """Apply ``data`` to the collection and flush it to the session.

        Label, summary, publisher and URL fields keep their current value
        when absent from ``data``. Category falls back to ``DEFAULT``,
        ``casefile`` to ``False`` and countries/languages to empty lists.
        When ``creator`` is not given it is looked up from
        ``data['creator_id']``; a resolved creator is granted a permission
        on the collection.
        """
        self.label = data.get('label', self.label)
        self.summary = data.get('summary', self.summary)
        self.publisher = data.get('publisher', self.publisher)
        self.publisher_url = data.get('publisher_url', self.publisher_url)
        self.info_url = data.get('info_url', self.info_url)
        self.data_url = data.get('data_url', self.data_url)
        self.category = data.get('category') or self.DEFAULT
        self.casefile = as_bool(data.get('casefile'), default=False)
        self.countries = data.get('countries', [])
        self.languages = data.get('languages', [])
        if creator is None:
            creator = Role.by_id(data.get('creator_id'))
        self.creator = creator
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        # Flush before granting, presumably so the collection has an ID.
        db.session.flush()
        if creator is not None:
            # NOTE(review): the two flags are presumably read and write;
            # confirm against Permission.grant.
            Permission.grant(self, creator, True, True)

    @property
    def roles(self):
        """IDs of roles holding a non-deleted read permission.

        The result is computed once and cached on the instance.
        """
        if not hasattr(self, '_roles'):
            q = db.session.query(Permission.role_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.collection_id == self.id)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            self._roles = [e.role_id for e in q.all()]
        return self._roles

    @property
    def kind(self):
        """Either ``'casefile'`` or ``'source'``, based on the casefile flag."""
        return 'casefile' if self.casefile else 'source'

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Return the collection with the given foreign ID, or ``None``."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Query collections by ID, optionally limited to readable ones.

        For a non-admin ``authz`` the query is restricted to collections
        for which one of ``authz.roles`` holds a non-deleted read permission.
        """
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def create(cls, data, role=None):
        """Create or update (and un-delete) a collection from ``data``.

        The collection is matched by ``data['foreign_id']``, including
        soft-deleted ones; a text ID is generated when none is given.
        """
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.foreign_id = foreign_id
        collection.update(data, creator=role)
        collection.deleted_at = None
        return collection

    def __repr__(self):
        return '<Collection(%r, %r, %r)>' % \
            (self.id, self.foreign_id, self.label)
Пример #23
0
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access control is
    enforced."""
    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    # Callable defaults give every row its own list; a literal ``[]`` would be
    # one mutable object shared by all instances that rely on the default.
    countries = db.Column(ARRAY(db.Unicode()), default=list)
    languages = db.Column(ARRAY(db.Unicode()), default=list)
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)

    # Managed collections are generated by API crawlers and thus UI users
    # shouldn't be allowed to add entities or documents to them. They also
    # don't use advanced entity extraction features for performance reasons.
    managed = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def update(self, data):
        """Apply ``data`` to the collection and touch its update time.

        Label, summary and category keep their current value when absent;
        ``managed`` falls back to ``False`` and countries to an empty list.
        The creator is taken from ``data['creator']['id']`` if present.
        """
        self.label = data.get('label', self.label)
        self.summary = data.get('summary', self.summary)
        self.category = data.get('category', self.category)
        self.managed = data.get('managed', False)
        self.countries = data.get('countries', [])
        creator = data.get('creator') or {}
        self.update_creator(creator.get('id'))
        self.touch()

    def update_creator(self, role):
        """Set the creator (and admin) of a collection.

        ``role`` may be a ``Role`` or a role ID. Nothing happens unless it
        resolves to a role of type ``Role.USER``.
        """
        if not isinstance(role, Role):
            role = Role.by_id(role)
        if role is None or role.type != Role.USER:
            return
        self.creator = role
        db.session.add(self)
        # Flush before granting, presumably so the collection has an ID.
        db.session.flush()
        # NOTE(review): the two flags are presumably read and write; confirm
        # against Permission.grant.
        Permission.grant(self, role, True, True)

    def touch(self):
        """Set ``updated_at`` to the current UTC time and add to the session."""
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    def delete_matches(self):
        """Bulk-delete matches that reference this collection on either side."""
        pq = db.session.query(Match)
        pq = pq.filter(
            or_(Match.collection_id == self.id,
                Match.match_collection_id == self.id))
        pq.delete(synchronize_session=False)

    def delete_permissions(self, deleted_at):
        """Soft-delete all permissions on this collection."""
        pq = db.session.query(Permission)
        pq = pq.filter(Permission.collection_id == self.id)
        pq.update({Permission.deleted_at: deleted_at},
                  synchronize_session=False)

    def delete(self, deleted_at=None):
        """Remove matches, soft-delete permissions, then delete the collection."""
        self.delete_matches()
        self.delete_permissions(deleted_at=deleted_at)
        super(Collection, self).delete(deleted_at=deleted_at)

    @property
    def roles(self):
        """IDs of roles holding a non-deleted read permission.

        The result is computed once and cached on the instance.
        """
        if not hasattr(self, '_roles'):
            q = db.session.query(Permission.role_id)
            # Permissions are soft-deleted (see delete_permissions), so
            # skip revoked ones, exactly as all_by_ids does.
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.collection_id == self.id)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            self._roles = [e.role_id for e in q.all()]
        return self._roles

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Return the collection with the given foreign ID, or ``None``."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Query collections by ID, optionally limited to readable ones.

        For a non-admin ``authz`` the query is restricted to collections
        for which one of ``authz.roles`` holds a non-deleted read permission.
        """
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def create(cls, data, role=None):
        """Return the collection for ``data['foreign_id']``, creating it if new.

        Soft-deleted collections are matched too and are un-deleted. Only a
        newly created collection is populated from ``data`` and assigned
        ``role`` as its creator; an existing one is returned unchanged apart
        from clearing ``deleted_at``.
        """
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.foreign_id = foreign_id
            collection.update(data)
            collection.update_creator(role)
        collection.deleted_at = None
        return collection

    def __repr__(self):
        return '<Collection(%r, %r, %r)>' % \
            (self.id, self.foreign_id, self.label)
Пример #24
0
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access control is
    enforced."""

    # Category schema for collections.
    # TODO: add extra weight info.
    # TODO: should this be configurable?
    CATEGORIES = {
        'news': 'News archives',
        'leak': 'Leaks',
        'land': 'Land registry',
        'gazette': 'Gazettes',
        'court': 'Court archives',
        'company': 'Company registries',
        'watchlist': 'Watchlists',
        'investigation': 'Personal collections',
        'sanctions': 'Sanctions lists',
        'scrape': 'Scrapes',
        'procurement': 'Procurement',
        'grey': 'Grey literature',
        'license': 'Licenses and concessions',
        'regulatory': 'Regulatory filings',
        'other': 'Other material'
    }

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    # Callable defaults give every row its own list; a literal ``[]`` would be
    # one mutable object shared by all instances that rely on the default.
    countries = db.Column(ARRAY(db.Unicode()), default=list)
    languages = db.Column(ARRAY(db.Unicode()), default=list)
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)

    # Managed collections are generated by API crawlers and thus UI users
    # shouldn't be allowed to add entities or documents to them. They also
    # don't use advanced entity extraction features for performance reasons.
    managed = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def update(self, data, creator=None):
        """Apply ``data`` to the collection and flush it to the session.

        Label, summary and category keep their current value when absent;
        ``managed`` falls back to ``False`` and countries to an empty list.
        When ``creator`` is not given it is looked up from
        ``data['creator_id']``; a resolved creator is granted a permission
        on the collection.
        """
        self.label = data.get('label', self.label)
        self.summary = data.get('summary', self.summary)
        self.category = data.get('category', self.category)
        self.managed = data.get('managed', False)
        self.countries = data.get('countries', [])
        if creator is None:
            creator = Role.by_id(data.get('creator_id'))
        self.creator = creator
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        # Flush before granting, presumably so the collection has an ID.
        db.session.flush()
        if creator is not None:
            # NOTE(review): the two flags are presumably read and write;
            # confirm against Permission.grant.
            Permission.grant(self, creator, True, True)

    @property
    def roles(self):
        """IDs of roles holding a non-deleted read permission.

        The result is computed once and cached on the instance.
        """
        if not hasattr(self, '_roles'):
            q = db.session.query(Permission.role_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.collection_id == self.id)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            self._roles = [e.role_id for e in q.all()]
        return self._roles

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Return the collection with the given foreign ID, or ``None``."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Query collections by ID, optionally limited to readable ones.

        For a non-admin ``authz`` the query is restricted to collections
        for which one of ``authz.roles`` holds a non-deleted read permission.
        """
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def create(cls, data, role=None):
        """Return the collection for ``data['foreign_id']``, creating it if new.

        Soft-deleted collections are matched too and are un-deleted. Only a
        newly created collection is populated from ``data`` (with ``role``
        as creator); an existing one is returned unchanged apart from
        clearing ``deleted_at``.
        """
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.foreign_id = foreign_id
            collection.update(data, creator=role)
        collection.deleted_at = None
        return collection

    def __repr__(self):
        return '<Collection(%r, %r, %r)>' % \
            (self.id, self.foreign_id, self.label)
Пример #25
0
class Document(db.Model, DatedModel, Metadata):
    """A document belonging to a collection, optionally nested under a parent.

    Descriptive properties are kept in the ``meta`` JSONB column. Attributes
    such as ``title``, ``file_name`` or ``source_url`` are not declared in
    this class and are presumably supplied by the ``Metadata`` mixin
    (confirm against its definition).
    """
    # Upper bound on the number of tags loaded into a proxy by to_proxy().
    MAX_TAGS = 10000

    # Schema names that can be stored in the ``schema`` column.
    SCHEMA = 'Document'
    SCHEMA_FOLDER = 'Folder'
    SCHEMA_PACKAGE = 'Package'
    SCHEMA_WORKBOOK = 'Workbook'
    SCHEMA_TEXT = 'PlainText'
    SCHEMA_HTML = 'HyperText'
    SCHEMA_PDF = 'Pages'
    SCHEMA_IMAGE = 'Image'
    SCHEMA_AUDIO = 'Audio'
    SCHEMA_VIDEO = 'Video'
    SCHEMA_TABLE = 'Table'
    SCHEMA_EMAIL = 'Email'

    # Values used for the ``status`` column.
    STATUS_PENDING = 'pending'
    STATUS_SUCCESS = 'success'
    STATUS_FAIL = 'fail'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=True, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True, index=True)
    schema = db.Column(db.String(255), nullable=False)
    status = db.Column(db.Unicode(10), nullable=True)
    # A callable default gives every row its own dict instead of sharing a
    # single mutable ``{}`` object between all instances.
    meta = db.Column(JSONB, default=dict)
    error_message = db.Column(db.Unicode(), nullable=True)
    body_text = db.Column(db.Unicode(), nullable=True)
    body_raw = db.Column(db.Unicode(), nullable=True)

    uploader_id = db.Column(db.Integer,
                            db.ForeignKey('role.id'),
                            nullable=True)  # noqa
    parent_id = db.Column(db.BigInteger,
                          db.ForeignKey('document.id'),
                          nullable=True,
                          index=True)  # noqa
    children = db.relationship('Document',
                               lazy='dynamic',
                               backref=db.backref('parent',
                                                  uselist=False,
                                                  remote_side=[id]))  # noqa
    collection_id = db.Column(db.Integer,
                              db.ForeignKey('collection.id'),
                              nullable=False,
                              index=True)  # noqa
    collection = db.relationship(Collection,
                                 backref=db.backref('documents',
                                                    lazy='dynamic'))  # noqa

    def __init__(self, **kw):
        """Create a document with an empty ``meta`` dict, then apply ``kw``."""
        # Set before calling the parent constructor so that keyword
        # arguments handled there can rely on ``meta`` being a dict.
        self.meta = {}
        super(Document, self).__init__(**kw)

    @property
    def model(self):
        """The schema object registered in ``model`` for ``self.schema``."""
        return model.get(self.schema)

    @property
    def name(self):
        """Best available display name.

        Returns the title, else the file name, else the source URL, or
        ``None`` when all three are unset.
        """
        if self.title is not None:
            return self.title
        if self.file_name is not None:
            return self.file_name
        if self.source_url is not None:
            return self.source_url

    @property
    def supports_records(self):
        """Whether the schema is one that can carry records (PDF or table)."""
        # Slightly unintuitive naming: this just checks the document type,
        # not if there actually are any records.
        return self.schema in [self.SCHEMA_PDF, self.SCHEMA_TABLE]

    @property
    def supports_pages(self):
        """Whether the document uses the paged (PDF) schema."""
        return self.schema == self.SCHEMA_PDF

    @property
    def supports_nlp(self):
        """Whether the schema is outside the structural/media exclusion list."""
        structural = [
            Document.SCHEMA,
            Document.SCHEMA_PACKAGE,
            Document.SCHEMA_FOLDER,
            Document.SCHEMA_WORKBOOK,
            Document.SCHEMA_VIDEO,
            Document.SCHEMA_AUDIO,
        ]
        return self.schema not in structural

    @property
    def ancestors(self):
        """IDs of all enclosing documents, outermost first.

        Returns an empty list for documents without a parent. Otherwise the
        list is read from the cache when present; if not, it is derived
        from the parent's ancestors and written back to the cache.
        """
        if self.parent_id is None:
            return []
        key = cache.key('ancestors', self.id)
        ancestors = cache.get_list(key)
        if ancestors is not None:
            return ancestors
        # Work on a copy so appending never mutates the list object that
        # the parent's property handed out.
        ancestors = list(self.parent.ancestors)
        ancestors.append(self.parent_id)
        cache.set_list(key, ancestors)
        return ancestors

    def update(self, data):
        """Assign the known metadata properties from ``data``.

        For every supported property the value from ``data`` is used; when
        ``data`` has no such key the value currently in ``meta`` is
        re-assigned instead. The document is added to the session.
        """
        props = ('title', 'summary', 'author', 'crawler', 'source_url',
                 'file_name', 'mime_type', 'headers', 'date', 'authored_at',
                 'modified_at', 'published_at', 'retrieved_at', 'languages',
                 'countries', 'keywords')
        for prop in props:
            value = data.get(prop, self.meta.get(prop))
            setattr(self, prop, value)
        db.session.add(self)

    def update_meta(self):
        """Mark the ``meta`` column as modified for the next flush."""
        flag_modified(self, 'meta')

    def delete_records(self):
        """Delete every ``DocumentRecord`` of this document and flush."""
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete()
        db.session.flush()

    def delete_tags(self):
        """Delete every ``DocumentTag`` of this document and flush."""
        pq = db.session.query(DocumentTag)
        pq = pq.filter(DocumentTag.document_id == self.id)
        pq.delete()
        db.session.flush()

    def delete(self, deleted_at=None):
        """Remove the document together with its records and tags.

        ``deleted_at`` is accepted but not used by this method.
        """
        self.delete_records()
        self.delete_tags()
        db.session.delete(self)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Bulk-delete all documents of a collection with their records/tags.

        ``deleted_at`` is accepted but not used by this method.
        """
        documents = db.session.query(cls.id)
        documents = documents.filter(cls.collection_id == collection_id)
        documents = documents.subquery()

        # Dependent rows go first, then the documents themselves.
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id.in_(documents))
        pq.delete(synchronize_session=False)

        pq = db.session.query(DocumentTag)
        pq = pq.filter(DocumentTag.document_id.in_(documents))
        pq.delete(synchronize_session=False)

        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    def raw_texts(self):
        """Yield the unfiltered text fragments of this document.

        The title, file name, source URL, summary and author are always
        yielded (possibly as ``None``). The body text and the texts of the
        associated records follow only when the status is ``success``.
        """
        yield self.title
        yield self.file_name
        yield self.source_url
        yield self.summary
        yield self.author

        if self.status != self.STATUS_SUCCESS:
            return

        yield self.body_text
        if self.supports_records:
            # iterate over all the associated records.
            pq = db.session.query(DocumentRecord)
            pq = pq.filter(DocumentRecord.document_id == self.id)
            pq = pq.order_by(DocumentRecord.index.asc())
            for record in pq.yield_per(10000):
                yield from record.raw_texts()

    @property
    def texts(self):
        """Yield ``raw_texts()`` passed through ``filter_texts``."""
        yield from filter_texts(self.raw_texts())

    @classmethod
    def by_keys(cls,
                parent_id=None,
                collection_id=None,
                foreign_id=None,
                content_hash=None):
        """Try and find a document by various criteria.

        The lookup is scoped to ``collection_id`` (and ``parent_id`` when
        given) and keyed on ``foreign_id`` or, failing that, on
        ``content_hash``. When nothing matches, a new document with the
        base schema is created. The given keys are assigned to the result,
        which is added to the session and returned.

        :raises ValueError: if neither ``foreign_id`` nor ``content_hash``
            is given.
        """
        q = cls.all()
        q = q.filter(Document.collection_id == collection_id)

        if parent_id is not None:
            q = q.filter(Document.parent_id == parent_id)

        if foreign_id is not None:
            q = q.filter(Document.foreign_id == foreign_id)
        elif content_hash is not None:
            q = q.filter(Document.content_hash == content_hash)
        else:
            raise ValueError("No unique criterion for document.")

        document = q.first()
        if document is None:
            document = cls()
            document.schema = cls.SCHEMA
            document.collection_id = collection_id

        if parent_id is not None:
            document.parent_id = parent_id

        if foreign_id is not None:
            document.foreign_id = foreign_id

        if content_hash is not None:
            document.content_hash = content_hash

        db.session.add(document)
        return document

    @classmethod
    def by_id(cls, id, collection_id=None):
        """Fetch a document by ID, optionally restricted to a collection.

        Returns ``None`` without querying when ``id`` is ``None``.
        """
        if id is None:
            return
        q = cls.all()
        q = q.filter(cls.id == id)
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        return q.first()

    @classmethod
    def by_collection(cls, collection_id=None):
        """Query for all documents with the given ``collection_id``."""
        q = cls.all()
        q = q.filter(cls.collection_id == collection_id)
        return q

    @classmethod
    def find_ids(cls, collection_id=None, failed_only=False):
        """Query document IDs in ascending order.

        :param collection_id: restrict to this collection when not ``None``.
        :param failed_only: restrict to documents whose status differs from
            ``STATUS_SUCCESS``.
        """
        q = cls.all_ids()
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        if failed_only:
            q = q.filter(cls.status != cls.STATUS_SUCCESS)
        q = q.order_by(cls.id.asc())
        return q

    def to_proxy(self):
        """Build an entity proxy from the document's columns, meta and tags.

        The file name and MIME type fall back to the ``Content-Disposition``
        and ``Content-Type`` headers when the metadata does not supply them.
        At most ``MAX_TAGS`` tags, heaviest first, are added to the proxy.
        """
        meta = dict(self.meta)
        # Fall back to an empty dict when the headers entry is missing or
        # null, so the comprehension below cannot fail on ``None``.
        headers = meta.pop('headers', None) or {}
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': meta
        })
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('fileSize', meta.get('file_size'))
        proxy.set('fileName', meta.get('file_name'))
        if not proxy.has('fileName'):
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('messageId', meta.get('message_id'), quiet=True)
        proxy.set('inReplyTo', meta.get('in_reply_to'), quiet=True)
        proxy.set('bodyText', self.body_text, quiet=True)
        proxy.set('bodyHtml', self.body_raw, quiet=True)
        columns = meta.get('columns')
        proxy.set('columns', registry.json.pack(columns), quiet=True)
        proxy.set('headers', registry.json.pack(headers), quiet=True)

        # A document that is itself a PDF uses its own content hash as the
        # PDF hash; a stored ``pdf_version`` value is added in any case.
        pdf = 'application/pdf'
        if meta.get('extension') == 'pdf' or proxy.first('mimeType') == pdf:
            proxy.set('pdfHash', self.content_hash, quiet=True)
        proxy.add('pdfHash', meta.get('pdf_version'), quiet=True)

        q = db.session.query(DocumentTag)
        q = q.filter(DocumentTag.document_id == self.id)
        q = q.filter(DocumentTag.type.in_(DocumentTag.MAPPING.keys()))
        q = q.order_by(DocumentTag.weight.desc())
        q = q.limit(Document.MAX_TAGS)
        for tag in q.all():
            prop = DocumentTag.MAPPING.get(tag.type)
            if prop is not None:
                proxy.add(prop, tag.text)
        return proxy

    def to_dict(self):
        """Serialise the document: proxy data, dates and bookkeeping fields."""
        proxy = self.to_proxy()
        data = proxy.to_full_dict()
        data.update(self.to_dict_dates())
        data.update({
            'name': self.name,
            'status': self.status,
            'foreign_id': self.foreign_id,
            'document_id': self.id,
            'collection_id': self.collection_id,
            'error_message': self.error_message,
            'uploader_id': self.uploader_id,
            'bulk': False,
        })
        return data

    def __repr__(self):
        """Debug representation showing the ID, schema and title."""
        return '<Document(%r,%r,%r)>' % (self.id, self.schema, self.title)
Пример #26
0
class Document(db.Model, DatedModel):
    """A document in a collection whose metadata is stored as JSON.

    ``content_hash``, ``foreign_id``, ``crawler`` and ``crawler_run`` are
    real columns; they are copied into the JSON metadata each time ``meta``
    is read and copied back from a ``Metadata`` object when it is assigned.
    """
    # Schema reference passed to ``validate`` in update().
    _schema = 'document.json#'

    SCHEMA = 'Document'

    # Values used for the ``type`` column.
    TYPE_TEXT = 'text'
    TYPE_TABULAR = 'tabular'
    TYPE_OTHER = 'other'

    # Values used for the ``status`` column.
    STATUS_PENDING = 'pending'
    STATUS_SUCCESS = 'success'
    STATUS_FAIL = 'fail'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=False, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    type = db.Column(db.Unicode(10), nullable=False, index=True)
    status = db.Column(db.Unicode(10), nullable=True, index=True)
    _meta = db.Column('meta', JSONB)

    crawler = db.Column(db.Unicode(), index=True)
    crawler_run = db.Column(db.Unicode())
    error_type = db.Column(db.Unicode(), nullable=True)
    error_message = db.Column(db.Unicode(), nullable=True)
    error_details = db.Column(db.Unicode(), nullable=True)

    collection_id = db.Column(db.Integer,
                              db.ForeignKey('collection.id'),
                              nullable=False,
                              index=True)  # noqa
    collection = db.relationship(Collection,
                                 backref=db.backref('documents',
                                                    lazy='dynamic'))  # noqa

    @property
    def title(self):
        """The title taken from the document metadata."""
        return self.meta.title

    @hybrid_property
    def meta(self):
        """The metadata as a ``Metadata`` object.

        Reading this property also writes the current column values of
        ``content_hash``, ``foreign_id``, ``crawler`` and ``crawler_run``
        into the stored JSON dict.
        """
        self._meta = self._meta or {}
        self._meta['content_hash'] = self.content_hash
        self._meta['foreign_id'] = self.foreign_id
        self._meta['crawler'] = self.crawler
        self._meta['crawler_run'] = self.crawler_run
        # ``_meta`` is guaranteed to be a dict at this point.
        return Metadata.from_data(self._meta)

    @meta.setter
    def meta(self, meta):
        """Store ``meta``; a ``Metadata`` object also updates the key columns."""
        if isinstance(meta, Metadata):
            self.content_hash = meta.content_hash
            self.foreign_id = meta.foreign_id
            self.crawler = meta.crawler
            self.crawler_run = meta.crawler_run
            meta = meta.to_attr_dict()
        self._meta = meta
        # JSON columns do not detect in-place changes on their own.
        flag_modified(self, '_meta')

    def update(self, data):
        """Validate ``data`` against the schema and merge it into the metadata."""
        validate(data, self._schema)
        meta = self.meta
        meta.update(data, safe=True)
        self.meta = meta
        db.session.add(self)

    def delete_pages(self):
        """Delete all ``DocumentPage`` rows of this document, then refresh."""
        pq = db.session.query(DocumentPage)
        pq = pq.filter(DocumentPage.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_records(self):
        """Delete all ``DocumentRecord`` rows of this document, then refresh."""
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_references(self, origin=None):
        """Delete this document's ``Reference`` rows, then refresh.

        :param origin: when not ``None``, only references with this origin
            are deleted.
        """
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.document_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Remove the document with its references, records and pages.

        ``deleted_at`` is accepted but not used by this method.
        """
        self.delete_references()
        self.delete_records()
        self.delete_pages()
        db.session.delete(self)

    def insert_records(self, sheet, iterable, chunk_size=1000):
        """Bulk-insert ``DocumentRecord`` rows for the items of ``iterable``.

        :param sheet: value stored in each record's ``sheet`` field.
        :param iterable: row payloads; the enumeration index is the row ID.
        :param chunk_size: number of rows sent per bulk insert.
        """
        chunk = []
        for i, data in enumerate(iterable):
            chunk.append({
                'document_id': self.id,
                'row_id': i,
                'sheet': sheet,
                'data': data
            })
            if len(chunk) >= chunk_size:
                db.session.bulk_insert_mappings(DocumentRecord, chunk)
                chunk = []

        # Flush whatever is left over after the last full chunk.
        if chunk:
            db.session.bulk_insert_mappings(DocumentRecord, chunk)

    def text_parts(self):
        """Yield the text parts of the pages (text) or records (tabular).

        Documents of any other type yield nothing.
        """
        if self.type == self.TYPE_TEXT:
            for page in self.pages:
                for text in page.text_parts():
                    yield text
        elif self.type == self.TYPE_TABULAR:
            for record in self.records:
                for text in record.text_parts():
                    yield text

    @classmethod
    def crawler_last_run(cls, crawler_id):
        """Latest ``updated_at`` among the crawler's documents, if any."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.crawler == crawler_id)
        return q.scalar()

    @classmethod
    def is_crawler_active(cls, crawler_id):
        """Whether the crawler updated a document within the last hour."""
        # TODO: add a function to see if a particular crawl is still running
        # this should be defined as having "pending" documents.
        last_run_time = cls.crawler_last_run(crawler_id)
        if last_run_time is None:
            return False
        return last_run_time > (datetime.utcnow() - timedelta(hours=1))

    @classmethod
    def crawler_stats(cls, crawler_id):
        """Summarise a crawler: last update, activity and per-status counts.

        The returned dict has the keys ``updated`` and ``running`` plus one
        key per distinct document status holding the number of documents.
        """
        # Check if the crawler was active very recently, if so, don't
        # allow the user to execute a new run right now.
        stats = {
            'updated': cls.crawler_last_run(crawler_id),
            'running': cls.is_crawler_active(crawler_id)
        }

        q = db.session.query(cls.status, func.count(cls.id))
        q = q.filter(cls.crawler == crawler_id)
        q = q.group_by(cls.status)
        for (status, count) in q.all():
            stats[status] = count
        return stats

    def _add_to_dict(self, data):
        """Add the column-backed fields and the ``public`` flag to ``data``.

        ``public`` is resolved through the current Flask request's
        ``authz``; when that fails for any reason it is set to ``None``.
        """
        try:
            from flask import request
            source_id = self.collection_id
            data['public'] = request.authz.collection_public(source_id)
        except Exception:
            # Best effort: any failure to resolve the flag yields ``None``.
            # ``Exception`` (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            data['public'] = None
        data.update({
            'id': self.id,
            'type': self.type,
            'status': self.status,
            'error_type': self.error_type,
            'error_message': self.error_message,
            'error_details': self.error_details,
            'collection_id': self.collection_id,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        })
        return data

    def to_dict(self):
        """Serialise the metadata plus the column-backed fields."""
        data = self.meta.to_dict()
        return self._add_to_dict(data)

    def to_index_dict(self):
        """Serialise the document for indexing.

        Extends the metadata's index form with the text, the schema fields
        and ASCII versions of the title and summary.
        """
        data = self.meta.to_index_dict()
        data['text'] = index_form(self.text_parts())
        data['schema'] = self.SCHEMA
        data['schemata'] = [self.SCHEMA]
        data['name_sort'] = ascii_text(data.get('title'))
        data['title_latin'] = ascii_text(data.get('title'))
        data['summary_latin'] = ascii_text(data.get('summary'))
        return self._add_to_dict(data)

    def __repr__(self):
        """Debug representation showing the ID, type and title."""
        return '<Document(%r,%r,%r)>' % (self.id, self.type, self.title)
Пример #27
0
class Role(db.Model, IdModel, SoftDeleteModel):
    """A user, group or other access control subject."""
    __tablename__ = 'role'

    # NOTE(review): the '******' literals below look like masked values and
    # USER equals SYSTEM_USER as written - confirm against the upstream source.
    USER = '******'
    GROUP = 'group'
    SYSTEM = 'system'
    TYPES = [USER, GROUP, SYSTEM]

    SYSTEM_GUEST = 'guest'
    SYSTEM_USER = '******'

    #: Generates URL-safe signatures for invitations.
    SIGNATURE = URLSafeTimedSerializer(settings.SECRET_KEY)

    #: Signature maximum age, defaults to 1 day
    SIGNATURE_MAX_AGE = 60 * 60 * 24

    foreign_id = db.Column(db.Unicode(2048), nullable=False, unique=True)
    name = db.Column(db.Unicode, nullable=False)
    email = db.Column(db.Unicode, nullable=True)
    type = db.Column(db.Enum(*TYPES, name='role_type'), nullable=False)
    api_key = db.Column(db.Unicode, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    is_muted = db.Column(db.Boolean, nullable=False, default=False)
    is_tester = db.Column(db.Boolean, nullable=False, default=False)
    is_blocked = db.Column(db.Boolean, nullable=False, default=False)
    password_digest = db.Column(db.Unicode, nullable=True)
    # Plain class attribute, not a column: only the digest is persisted.
    password = None
    reset_token = db.Column(db.Unicode, nullable=True)
    locale = db.Column(db.Unicode, nullable=True)

    permissions = db.relationship('Permission', backref='role')

    @property
    def has_password(self):
        """Whether a password digest is stored for this role."""
        return self.password_digest is not None

    @property
    def is_public(self):
        """Whether this role is one of the ``public_roles()``."""
        return self.id in self.public_roles()

    @property
    def is_alertable(self):
        """Whether the role has an email address and is not muted."""
        if self.email is None:
            return False
        if self.is_muted is True:
            return False
        # TODO: ignore people that have not logged in for a certain time?
        return True

    @property
    def label(self):
        """Display label built by ``anonymize_email`` from name and email."""
        return anonymize_email(self.name, self.email)

    def update(self, data):
        """Apply the editable fields from ``data``.

        ``name``, ``is_muted``, ``is_tester`` and ``locale`` keep their
        current value when absent from ``data``; the password is only
        changed when a non-empty ``password`` is given.
        """
        self.name = data.get('name', self.name)
        self.is_muted = data.get('is_muted', self.is_muted)
        self.is_tester = data.get('is_tester', self.is_tester)
        if data.get('password'):
            self.set_password(data.get('password'))
        self.locale = stringify(data.get('locale', self.locale))
        self.updated_at = datetime.utcnow()

    def clear_roles(self):
        """Removes any existing roles from group membership."""
        # NOTE(review): ``roles`` is not declared in this class; presumably
        # a relationship/backref defined elsewhere - confirm.
        self.roles = []
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()

    def add_role(self, role):
        """Adds an existing role as a membership of a group."""
        self.roles.append(role)
        db.session.add(role)
        db.session.add(self)
        self.updated_at = datetime.utcnow()

    def to_dict(self):
        """Serialise the role, including its dates, flags and API key."""
        data = self.to_dict_dates()
        data.update({
            'id': stringify(self.id),
            'type': self.type,
            'name': self.name,
            'label': self.label,
            'email': self.email,
            'locale': self.locale,
            'api_key': self.api_key,
            'is_admin': self.is_admin,
            'is_muted': self.is_muted,
            'is_tester': self.is_tester,
            'has_password': self.has_password,
            # 'notified_at': self.notified_at
        })
        return data

    @classmethod
    def by_foreign_id(cls, foreign_id):
        """Fetch the role with this foreign ID; ``None`` input yields ``None``."""
        if foreign_id is not None:
            return cls.all().filter_by(foreign_id=foreign_id).first()

    @classmethod
    def by_email(cls, email):
        """Fetch the first role whose email matches, ignoring case."""
        if email is None:
            return None
        q = cls.all()
        q = q.filter(func.lower(cls.email) == email.lower())
        return q.first()

    @classmethod
    def by_api_key(cls, api_key):
        """Fetch the role with this API key; ``None`` input yields ``None``."""
        if api_key is not None:
            return cls.all().filter_by(api_key=api_key).first()

    @classmethod
    def load_or_create(cls, foreign_id, type, name, email=None, is_admin=None):
        """Fetch the role with ``foreign_id``, creating it when missing.

        On an existing role only the email (when given), the admin flag
        (when given) and a missing API key are updated. A role whose email
        is listed in ``settings.ADMINS`` is always made an admin. The role
        is added to the session, which is then flushed.
        """
        role = cls.by_foreign_id(foreign_id)

        if role is None:
            role = cls()
            role.foreign_id = foreign_id
            role.name = name or email
            role.type = type
            role.is_admin = False
            role.is_muted = False
            role.is_tester = False
            role.is_blocked = False
            # NOTE(review): no ``notified_at`` column is declared in this
            # class - confirm it is provided elsewhere.
            role.notified_at = datetime.utcnow()

        if role.api_key is None:
            role.api_key = make_textid()

        if email is not None:
            role.email = email

        if is_admin is not None:
            role.is_admin = is_admin

        # see: https://github.com/alephdata/aleph/issues/111
        auto_admins = [a.lower() for a in settings.ADMINS]
        if email is not None and email.lower() in auto_admins:
            role.is_admin = True

        db.session.add(role)
        db.session.flush()
        return role

    @classmethod
    def load_cli_user(cls):
        """Fetch or create the admin role identified by ``settings.SYSTEM_USER``."""
        return cls.load_or_create(foreign_id=settings.SYSTEM_USER,
                                  name='Aleph',
                                  type=cls.USER,
                                  is_admin=True)

    @classmethod
    def load_id(cls, foreign_id):
        """Load a role and return the ID.

        Resolved IDs are memoised in ``settings._roles``; a foreign ID that
        cannot be resolved returns ``None`` and is looked up again next time.
        """
        if not hasattr(settings, '_roles'):
            settings._roles = {}
        if foreign_id not in settings._roles:
            role_id = cls.all_ids().filter_by(foreign_id=foreign_id).first()
            if role_id is not None:
                settings._roles[foreign_id] = role_id[0]
        return settings._roles.get(foreign_id)

    @classmethod
    def public_roles(cls):
        """Roles which make a collection to be considered public."""
        return {
            cls.load_id(cls.SYSTEM_USER),
            cls.load_id(cls.SYSTEM_GUEST),
        }

    @classmethod
    def by_prefix(cls, prefix, exclude=None):
        """Query user roles whose name contains, or email equals, ``prefix``.

        The email comparison is case-insensitive and exact; the name match
        is a case-insensitive substring match.

        :param str prefix: Text to match. ``%`` and ``_`` are replaced with
            spaces before it is used in the name pattern.
        :param exclude: Optional collection of role IDs to leave out.
        :return: A query ordered by ascending role ID.
        """
        query = prefix.replace('%', ' ').replace('_', ' ')
        query = '%%%s%%' % query
        q = cls.all()
        q = q.filter(Role.type == Role.USER)
        # ``exclude`` defaults to None instead of a shared mutable list.
        if exclude:
            q = q.filter(not_(Role.id.in_(exclude)))
        q = q.filter(
            or_(
                func.lower(cls.email) == prefix.lower(),
                cls.name.ilike(query)))
        q = q.order_by(Role.id.asc())
        return q

    @classmethod
    def all_groups(cls, authz):
        """Query group roles ordered by name, then foreign ID.

        Non-admin ``authz`` objects only see groups among ``authz.roles``.
        """
        q = cls.all()
        q = q.filter(Role.type == Role.GROUP)
        q = q.order_by(Role.name.asc())
        q = q.order_by(Role.foreign_id.asc())
        if not authz.is_admin:
            q = q.filter(Role.id.in_(authz.roles))
        return q

    @classmethod
    def all_users(cls):
        """Query all roles of the user type."""
        return cls.all().filter(Role.type == Role.USER)

    @classmethod
    def all_system(cls):
        """Query all roles of the system type."""
        return cls.all().filter(Role.type == Role.SYSTEM)

    def set_password(self, secret):
        """Hashes and sets the role password.

        :param str secret: The password to be set.
        """
        self.password_digest = generate_password_hash(secret)

    def check_password(self, secret):
        """Checks the password if it matches the role password hash.

        :param str secret: The password to be checked.
        :rtype: bool
        """
        # A role without a digest is checked against an empty string.
        digest = self.password_digest or ''
        return check_password_hash(digest, secret)

    def __repr__(self):
        """Debug representation showing the ID and foreign ID."""
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)
Пример #28
0
class Entity(db.Model, UuidModel, SoftDeleteModel):
    """A named, schema-typed entity that belongs to a collection.

    The entity's properties are kept in the ``data`` JSONB column, while
    the name is stored separately in the ``name`` column.
    """
    THING = 'Thing'

    name = db.Column(db.Unicode)
    schema = db.Column(db.String(255), index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)

    collection_id = db.Column(db.Integer,
                              db.ForeignKey('collection.id'),
                              index=True)  # noqa
    collection = db.relationship(Collection,
                                 backref=db.backref('entities',
                                                    lazy='dynamic'))  # noqa

    @property
    def model(self):
        """The schema object registered in ``model`` for ``self.schema``."""
        return model.get(self.schema)

    @property
    def terms(self):
        """Set of the entity's name plus all non-empty aliases."""
        terms = {self.name}
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        """Normalised terms that do not contain another, shorter term.

        Terms are normalised with ``match_form``; those that normalise to
        ``None`` or whose length is below 4 or above 120 are skipped.
        """
        # This is to find the shortest possible regex for each entity.
        # If, for example, and entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = {match_form(t) for t in self.terms}
        regex_terms = set()
        for term in terms:
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = any(
                other is not None and other != term and other in term
                for other in terms)
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def delete_matches(self):
        """Delete every ``Match`` this entity is part of, then refresh."""
        pq = db.session.query(Match)
        pq = pq.filter(
            or_(Match.entity_id == self.id, Match.match_id == self.id))
        pq.delete(synchronize_session=False)
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Delete the entity's matches, then its alerts and the entity itself.

        :param deleted_at: timestamp passed on to the alerts and the parent
            class ``delete``; defaults to the current UTC time.
        """
        self.delete_matches()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Bulk soft-delete all entities of a collection.

        The alerts on those entities are soft-deleted as well and every
        match that involves them is removed.

        :param deleted_at: timestamp to record; defaults to the current UTC
            time.
        """
        from aleph.model import Alert
        deleted_at = deleted_at or datetime.utcnow()

        entities = db.session.query(cls.id)
        entities = entities.filter(cls.collection_id == collection_id)
        entities = entities.subquery()

        pq = db.session.query(Alert)
        pq = pq.filter(Alert.entity_id.in_(entities))
        pq.update({Alert.deleted_at: deleted_at}, synchronize_session=False)

        pq = db.session.query(Match)
        pq = pq.filter(Match.entity_id.in_(entities))
        pq.delete(synchronize_session=False)

        pq = db.session.query(Match)
        pq = pq.filter(Match.match_id.in_(entities))
        pq.delete(synchronize_session=False)

        # Only touch entities that have not been deleted already.
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    def merge(self, other):
        """Merge ``other`` into this entity and delete ``other``.

        The schema, foreign IDs, creation date and data of both entities
        are combined, ``other``'s name becomes an alias when it differs,
        and alerts pointing at ``other`` are re-pointed at this entity.
        The session is committed.

        :raises ValueError: if ``other`` is this entity or belongs to a
            different collection.
        """
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError(
                "Cannot merge entities from different collections.")  # noqa

        self.schema = model.precise_schema(self.schema, other.schema)
        # Combine the foreign IDs of both entities so that those of
        # ``other`` survive the merge.
        self.foreign_ids = string_set(self.foreign_ids, other.foreign_ids)
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()

        data = merge_data(self.data, other.data)
        if self.name != other.name:
            data = merge_data(data, {'alias': [other.name]})
        self.data = data

        # update alerts
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({Alert.entity_id: self.id})

        # delete source entities
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        """Apply the schema, name and properties from an entity dict.

        When ``entity['properties']`` is a mapping it is validated against
        the schema and replaces ``data``; otherwise existing data is kept
        (or initialised to an empty dict). The name is stored in the
        ``name`` column rather than in ``data``.
        """
        self.schema = entity.get('schema')

        data = entity.get('properties')
        if is_mapping(data):
            # The name takes part in validation, then is taken out again.
            data['name'] = [entity.get('name')]
            self.data = self.model.validate(data)
        elif self.data is None:
            self.data = {}

        self.data.pop('name', None)
        self.name = entity.get('name')

        # TODO: should this be mutable?
        # self.foreign_ids = string_set(entity.get('foreign_ids'))
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def create(cls, data, collection):
        """Create an entity in ``collection``, or revive a matching one.

        An existing entity (including a soft-deleted one) that carries the
        given foreign IDs is re-used. In either case ``data`` is applied
        and the deletion marker is cleared.
        """
        foreign_ids = string_set(data.get('foreign_ids'))
        ent = cls.by_foreign_ids(foreign_ids, collection.id, deleted=True)
        if ent is None:
            ent = cls()
            ent.id = make_textid()
            ent.collection = collection
            ent.foreign_ids = foreign_ids
        ent.update(data)
        ent.deleted_at = None
        return ent

    @classmethod
    def by_foreign_ids(cls, foreign_ids, collection_id, deleted=False):
        """Fetch an entity in the collection whose foreign IDs contain all given.

        Returns ``None`` without querying when ``foreign_ids`` is empty.
        Entities that are not deleted are preferred over deleted ones.
        """
        if not len(foreign_ids):
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        foreign_id = func.cast(foreign_ids, ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def all_ids(cls, deleted=False, authz=None):
        """Query entity IDs, limited to what ``authz`` may read.

        When ``authz`` is supplied and is not an admin, only entities in
        collections for which one of ``authz.roles`` holds an active read
        permission are included.
        """
        q = super(Entity, cls).all_ids(deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission,
                       cls.collection_id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def latest(cls):
        """Latest ``updated_at`` among the entities that are not deleted."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.deleted_at == None)  # noqa
        return q.scalar()

    def __repr__(self):
        """Debug representation showing the ID and name."""
        return '<Entity(%r, %r)>' % (self.id, self.name)
Пример #29
0
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access control is
    enforced."""

    # Category schema for collections.
    # TODO: should this be configurable?
    CATEGORIES = {
        'news': lazy_gettext('News archives'),
        'leak': lazy_gettext('Leaks'),
        'land': lazy_gettext('Land registry'),
        'gazette': lazy_gettext('Gazettes'),
        'court': lazy_gettext('Court archives'),
        'company': lazy_gettext('Company registries'),
        'sanctions': lazy_gettext('Sanctions lists'),
        'procurement': lazy_gettext('Procurement'),
        'finance': lazy_gettext('Financial records'),
        'grey': lazy_gettext('Grey literature'),
        'library': lazy_gettext('Document libraries'),
        'license': lazy_gettext('Licenses and concessions'),
        'regulatory': lazy_gettext('Regulatory filings'),
        'poi': lazy_gettext('Persons of interest'),
        'customs': lazy_gettext('Customs declarations'),
        'census': lazy_gettext('Population census'),
        'transport': lazy_gettext('Air and maritime registers'),
        'other': lazy_gettext('Other material')
    }

    # Category assigned to new collections and reported by ``to_dict`` when
    # the stored category is not one of ``CATEGORIES``.
    DEFAULT = 'other'

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    countries = db.Column(ARRAY(db.Unicode()), default=[])
    languages = db.Column(ARRAY(db.Unicode()), default=[])
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)
    publisher = db.Column(db.Unicode, nullable=True)
    publisher_url = db.Column(db.Unicode, nullable=True)
    info_url = db.Column(db.Unicode, nullable=True)
    data_url = db.Column(db.Unicode, nullable=True)

    # A casefile is a type of collection which is used to manage the state
    # of an investigation. Unlike normal collections, cases do not serve
    # as source material, but as a mechanism of analysis.
    casefile = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def touch(self):
        """Set ``updated_at`` to the current UTC time and stage the row."""
        # https://www.youtube.com/watch?v=wv-34w8kGPM
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    def update(self, data, authz):
        """Apply the values in ``data`` to this collection.

        Keys missing from ``data`` leave the current value untouched.
        ``category``, ``casefile`` and ``creator_id`` are only honoured
        when ``authz.is_admin`` is true. After flushing the session, the
        creator (if there is one) is granted permissions on the collection
        via ``Permission.grant``.
        """
        self.label = data.get('label', self.label)
        self.summary = data.get('summary', self.summary)
        self.publisher = data.get('publisher', self.publisher)
        self.publisher_url = data.get('publisher_url', self.publisher_url)
        self.info_url = data.get('info_url', self.info_url)
        self.data_url = data.get('data_url', self.data_url)
        self.countries = ensure_list(data.get('countries', self.countries))
        self.languages = ensure_list(data.get('languages', self.languages))

        # Some fields are editable only by admins in order to have
        # a strict separation between source evidence and case
        # material.
        if authz.is_admin:
            self.category = data.get('category', self.category)
            self.casefile = as_bool(data.get('casefile'),
                                    default=self.casefile)
            creator = Role.by_id(data.get('creator_id'))
            if creator is not None:
                self.creator = creator

        self.touch()
        # Flush before granting so the permission is created against the
        # state that was just written.
        db.session.flush()
        if self.creator is not None:
            Permission.grant(self, self.creator, True, True)

    @property
    def team_id(self):
        """Stringified IDs of the roles that can read this collection.

        Only non-system, non-deleted roles holding a non-deleted read
        permission on this collection are included.
        """
        role = aliased(Role)
        perm = aliased(Permission)
        q = db.session.query(role.id)
        q = q.filter(role.type != Role.SYSTEM)
        q = q.filter(role.id == perm.role_id)
        q = q.filter(perm.collection_id == self.id)
        q = q.filter(perm.read == True)  # noqa
        q = q.filter(role.deleted_at == None)  # noqa
        q = q.filter(perm.deleted_at == None)  # noqa
        return [stringify(i) for (i, ) in q.all()]

    @property
    def secret(self):
        """True when no public role holds an active read permission."""
        q = db.session.query(Permission.id)
        q = q.filter(Permission.role_id.in_(Role.public_roles()))
        q = q.filter(Permission.collection_id == self.id)
        q = q.filter(Permission.read == True)  # noqa
        q = q.filter(Permission.deleted_at == None)  # noqa
        return q.count() < 1

    @property
    def ns(self):
        """``Namespace`` built from ``foreign_id``, created on first use
        and cached on the instance."""
        if not hasattr(self, '_ns'):
            self._ns = Namespace(self.foreign_id)
        return self._ns

    def to_dict(self):
        """Serialise the collection to a plain dict.

        ``category`` falls back to ``DEFAULT`` when the stored value is not
        a key of ``CATEGORIES``; ``kind`` is ``'casefile'`` or ``'source'``
        depending on the ``casefile`` flag.
        """
        data = self.to_dict_dates()
        data['category'] = self.DEFAULT
        if self.category in self.CATEGORIES:
            data['category'] = self.category
        data['kind'] = 'casefile' if self.casefile else 'source'
        data.update({
            'id': stringify(self.id),
            'collection_id': stringify(self.id),
            'foreign_id': self.foreign_id,
            'creator_id': stringify(self.creator_id),
            'team_id': self.team_id,
            'label': self.label,
            'summary': self.summary,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'info_url': self.info_url,
            'data_url': self.data_url,
            'casefile': self.casefile,
            'secret': self.secret
        })
        return data

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Return the collection with ``foreign_id``, or ``None``.

        A ``None`` foreign ID returns ``None`` without querying.
        """
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def _apply_authz(cls, q, authz):
        """Restrict ``q`` to collections readable by ``authz``.

        The query is returned unchanged when ``authz`` is ``None`` or
        belongs to an admin.
        """
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def all_authz(cls, authz, deleted=False):
        """Query all collections, filtered by ``authz`` read access."""
        q = super(Collection, cls).all(deleted=deleted)
        return cls._apply_authz(q, authz)

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Query collections by ID, filtered by ``authz`` read access."""
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        return cls._apply_authz(q, authz)

    @classmethod
    def create(cls, data, authz, created_at=None):
        """Create a collection, or re-use the one with the same foreign ID.

        When ``data`` carries no ``foreign_id`` a new text ID is generated.
        A newly created collection starts as a casefile in the ``DEFAULT``
        category with ``authz.id`` as its creator. In both cases the
        collection is then updated from ``data`` and its ``deleted_at``
        marker is cleared.
        """
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.created_at = created_at
            collection.foreign_id = foreign_id
            collection.category = cls.DEFAULT
            collection.casefile = True
            collection.creator_id = authz.id
        collection.update(data, authz)
        collection.deleted_at = None
        return collection

    def __repr__(self):
        """Debugging representation with ID, foreign ID and label."""
        fmt = '<Collection(%r, %r, %r)>'
        return fmt % (self.id, self.foreign_id, self.label)
Пример #30
0
class Document(db.Model, DatedModel):
    """A stored document together with its metadata and the collections it
    belongs to."""

    _schema = 'document.json#'

    TYPE_TEXT = 'text'
    TYPE_TABULAR = 'tabular'
    TYPE_OTHER = 'other'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=False, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    type = db.Column(db.Unicode(10), nullable=False, index=True)
    # Raw metadata as stored in the ``meta`` column; use the ``meta``
    # property for the ``Metadata`` wrapper.
    _meta = db.Column('meta', JSONB)

    collections = db.relationship(
        Collection,
        secondary=collection_document_table,  # noqa
        backref=db.backref('documents', lazy='dynamic'))  # noqa
    source_collection_id = db.Column(db.Integer,
                                     db.ForeignKey('collection.id'),
                                     nullable=True)  # noqa
    source_collection = db.relationship(Collection)

    @property
    def title(self):
        """Title as exposed by the document's ``Metadata``."""
        return self.meta.title

    @hybrid_property
    def meta(self):
        """``Metadata`` built from the stored dict.

        The stored dict is refreshed with the current ``content_hash`` and
        ``foreign_id`` column values before it is wrapped.
        """
        self._meta = self._meta or {}
        self._meta['content_hash'] = self.content_hash
        self._meta['foreign_id'] = self.foreign_id
        return Metadata.from_data(self._meta or {})

    @meta.setter
    def meta(self, meta):
        """Store ``meta``; a ``Metadata`` instance also updates the
        ``content_hash`` and ``foreign_id`` columns."""
        if isinstance(meta, Metadata):
            self.content_hash = meta.content_hash
            self.foreign_id = meta.foreign_id
            meta = meta.to_attr_dict()
        self._meta = meta
        # The JSONB value may have been changed in place, so mark the
        # attribute as modified explicitly.
        flag_modified(self, '_meta')

    def update(self, data, writeable):
        """Validate ``data`` and apply it to the document.

        ``collection_id`` is popped from ``data`` (the caller's dict is
        modified) and used to update collection membership; the remaining
        keys are merged into the metadata.
        """
        validate(data, self._schema)
        collection_id = data.pop('collection_id', [])
        self.update_collections(collection_id, writeable)
        meta = self.meta
        meta.update(data, safe=True)
        self.meta = meta
        db.session.add(self)

    def update_collections(self, collection_id, writeable):
        """Synchronise collection membership with ``collection_id``.

        Only collections whose ID is in ``writeable`` are added or removed,
        and the source collection is never removed.
        """
        # Iterate over a snapshot: removing from the relationship list
        # while looping over it directly skips the item after each removal.
        for coll in list(self.collections):
            if coll.id == self.source_collection_id:
                continue
            if coll.id not in collection_id and coll.id in writeable:
                self.collections.remove(coll)
        for coll_id in collection_id:
            if coll_id not in writeable:
                continue
            coll = Collection.by_id(coll_id)
            # Skip IDs that do not resolve to a collection instead of
            # appending ``None`` to the relationship.
            if coll is not None and coll not in self.collections:
                self.collections.append(coll)
        db.session.add(self)

    def delete_pages(self):
        """Delete all ``DocumentPage`` rows of this document."""
        pq = db.session.query(DocumentPage)
        pq = pq.filter(DocumentPage.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_records(self):
        """Delete all ``DocumentRecord`` rows of this document."""
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_references(self, origin=None):
        """Delete ``Reference`` rows of this document, optionally limited
        to those with the given ``origin``."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.document_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Delete the document and its references, records and pages.

        ``deleted_at`` is accepted but not used by this implementation.
        """
        self.delete_references()
        self.delete_records()
        self.delete_pages()
        db.session.delete(self)

    def insert_records(self, sheet, iterable, chunk_size=1000):
        """Bulk-insert ``DocumentRecord`` rows, ``chunk_size`` at a time.

        Each item of ``iterable`` becomes the ``data`` of one record; its
        position in the iterable is used as ``row_id``.
        """
        chunk = []
        for i, data in enumerate(iterable):
            chunk.append({
                'document_id': self.id,
                'row_id': i,
                'sheet': sheet,
                'data': data
            })
            if len(chunk) >= chunk_size:
                db.session.bulk_insert_mappings(DocumentRecord, chunk)
                chunk = []

        # Flush whatever is left over after the last full chunk.
        if chunk:
            db.session.bulk_insert_mappings(DocumentRecord, chunk)

    def text_parts(self):
        """Yield text from ``self.pages`` (text documents) or from
        ``self.records`` (tabular documents); other types yield nothing."""
        if self.type == self.TYPE_TEXT:
            for page in self.pages:
                for text in page.text_parts():
                    yield text
        elif self.type == self.TYPE_TABULAR:
            for record in self.records:
                for text in record.text_parts():
                    yield text

    @classmethod
    def get_max_id(cls):
        """Return the largest document ID, as given by ``MAX(id)``."""
        q = db.session.query(func.max(cls.id))
        return q.scalar()

    def __repr__(self):
        """Debugging representation with ID, type and title."""
        return '<Document(%r,%r,%r)>' % (self.id, self.type, self.meta.title)

    @property
    def collection_ids(self):
        """IDs of all linked collections, with the source collection ID
        appended when it is set and not already present."""
        collection_ids = [c.id for c in self.collections]
        if self.source_collection_id not in collection_ids:
            if self.source_collection_id is not None:
                collection_ids.append(self.source_collection_id)
        return collection_ids

    def _add_to_dict(self, data):
        """Add the document's own fields to ``data`` and return it."""
        collection_ids = self.collection_ids
        try:
            from aleph.authz import collections_public
            data['public'] = collections_public(collection_ids)
        except Exception:
            # Best effort: leave 'public' unset when it cannot be computed,
            # but let SystemExit and KeyboardInterrupt propagate.
            pass
        data.update({
            'id': self.id,
            'type': self.type,
            'source_collection_id': self.source_collection_id,
            'collection_id': collection_ids,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        })
        return data

    def to_dict(self):
        """Return ``meta.to_dict()`` extended with the document fields."""
        data = self.meta.to_dict()
        return self._add_to_dict(data)

    def to_index_dict(self):
        """Return ``meta.to_index_dict()`` extended with the document
        fields."""
        data = self.meta.to_index_dict()
        return self._add_to_dict(data)