Пример #1
0
class DocumentPage(db.Model):
    """One numbered page of text that belongs to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    # Exposed on the document as ``pages``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'),
    )

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)

    def text_parts(self):
        """Yield the page text, unless it is ``None`` or empty."""
        if self.text is None:
            return
        if len(self.text):
            yield self.text

    def to_dict(self):
        """Return the page's column values as a plain dictionary."""
        return dict(
            id=self.id,
            number=self.number,
            text=self.text,
            document_id=self.document_id,
        )
Пример #2
0
class DocumentPage(db.Model):
    """One numbered page of text that belongs to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(),
                            db.ForeignKey('document.id'),
                            index=True)  # noqa
    # Exposed on the document as ``pages``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        Document, backref=db.backref('pages',
                                     cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Return a SHA-1 hex digest built from ``document_id`` and ``id``."""
        # Hash objects only accept bytes on Python 3. Encoding the ASCII
        # output of str() leaves the digest unchanged on Python 2.
        tid = sha1(str(self.document_id).encode('utf-8'))
        tid.update(str(self.id).encode('utf-8'))
        return tid.hexdigest()

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)

    def text_parts(self):
        """Yield the page text as returned by ``string_value``.

        Nothing is yielded when ``string_value`` returns ``None``.
        """
        text = string_value(self.text)
        if text is not None:
            # Yield the normalized value that was just computed rather than
            # the raw column, so consumers always receive what was checked.
            yield text

    def to_dict(self):
        """Return the page's column values as a plain dictionary."""
        return {
            'id': self.id,
            'number': self.number,
            'text': self.text,
            'document_id': self.document_id
        }
Пример #3
0
class DocumentRecord(db.Model):
    """A row of tabular data, addressed by sheet and row, in a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=False)
    row_id = db.Column(db.Integer, nullable=False)
    data = db.Column(JSONB)
    document_id = db.Column(db.Integer(),
                            db.ForeignKey('document.id'),
                            index=True)  # noqa
    # Exposed on the document as ``records``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        Document, backref=db.backref('records',
                                     cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Return a SHA-1 hex digest of document id, sheet and row id."""
        # Hash objects only accept bytes on Python 3. Encoding the ASCII
        # output of str() leaves the digest unchanged on Python 2.
        tid = sha1(str(self.document_id).encode('utf-8'))
        tid.update(str(self.sheet).encode('utf-8'))
        tid.update(str(self.row_id).encode('utf-8'))
        return tid.hexdigest()

    def text_parts(self):
        """Yield every usable text snippet stored in the record's data.

        Values for which ``string_value`` returns ``None`` are skipped. A
        record without data yields nothing.
        """
        if self.data is None:
            # The ``data`` column is nullable, so there may be no cells.
            return
        for value in self.data.values():
            text = string_value(value)
            if text is not None:
                # Yield the normalized text, not the raw JSON value.
                yield text

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.row_id)
Пример #4
0
class Reference(db.Model, IdModel, DatedModel):
    """A weighted link between a document and an entity."""

    id = db.Column(db.Integer(), primary_key=True)
    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'))
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'))
    origin = db.Column(db.String(128))
    weight = db.Column(db.Integer)

    # Both sides expose their references as a dynamic ``references`` backref.
    entity = db.relationship(
        'Entity',
        backref=db.backref('references', lazy='dynamic'),
    )
    document = db.relationship(
        'Document',
        backref=db.backref('references', lazy='dynamic'),
    )

    def to_dict(self):
        """Return the reference with a nested summary of its entity."""
        entity = self.entity
        entity_data = {
            'id': entity.id,
            'name': entity.name,
            '$schema': entity.type,
        }
        return {
            'entity': entity_data,
            'weight': self.weight,
            'origin': self.origin,
        }

    def __repr__(self):
        return '<Reference(%r, %r)>' % (self.document_id, self.entity_id)
Пример #5
0
class DocumentRecord(db.Model):
    """A row of tabular data, addressed by sheet and row, in a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=False)
    row_id = db.Column(db.Integer, nullable=False)
    data = db.Column(JSONB)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    # Exposed on the document as ``records``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        Document, backref=db.backref('records',
                                     cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Return a SHA-1 hex digest of document id, sheet and row id."""
        # Hash objects only accept bytes on Python 3. Encoding the ASCII
        # output of str() leaves the digest unchanged on Python 2.
        tid = sha1(str(self.document_id).encode('utf-8'))
        tid.update(str(self.sheet).encode('utf-8'))
        tid.update(str(self.row_id).encode('utf-8'))
        return tid.hexdigest()

    @property
    def text(self):
        """Return the distinct non-``None`` values of ``data`` as a list.

        Duplicates are removed through a ``set``, so the order of the result
        is not guaranteed to follow the stored data.
        """
        if self.data is None:
            return []
        # NOTE(review): values must be hashable for the set() call below;
        # nested JSON lists/objects would raise TypeError -- confirm inputs.
        text = [t for t in self.data.values() if t is not None]
        return list(set(text))

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.row_id)
Пример #6
0
class IdModel(object):
    """Mixin that adds an integer primary key and serializes it."""

    id = db.Column(db.Integer(), primary_key=True)

    def to_dict(self):
        """Return the next class's ``to_dict()`` result (if any) plus ``id``."""
        base = super(IdModel, self)
        # Cooperate with other mixins: start from their dictionary when the
        # next class in the MRO provides ``to_dict``, else from an empty one.
        if hasattr(base, 'to_dict'):
            payload = base.to_dict()
        else:
            payload = {}
        payload['id'] = self.id
        return payload
Пример #7
0
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""
    TEXT_LENGTH = 1024

    # Values stored in the ``type`` column.
    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'

    # Maps each tag type to the ``exactitude`` type object used for it.
    TYPES = {
        TYPE_PERSON: exactitude.names,
        TYPE_ORGANIZATION: exactitude.names,
        TYPE_EMAIL: exactitude.emails,
        TYPE_PHONE: exactitude.phones,
        TYPE_LOCATION: exactitude.addresses,
        TYPE_IP: exactitude.ips,
        TYPE_IBAN: exactitude.ibans,
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(
        db.Integer(),
        db.ForeignKey('document.id'),
        index=True,
    )
    # Exposed on the document as ``tags``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'),
    )

    @property
    def field(self):
        """Return the ``invert`` entry paired with this tag's type object.

        Returns ``None`` when no entry matches. Raises ``KeyError`` when
        ``self.type`` is not a key of the class-level ``TYPES`` mapping.
        """
        wanted = self.TYPES[self.type]
        # NOTE(review): the bare ``TYPES`` below resolves to a module-level
        # name, not the class attribute above (class attributes are not in
        # scope inside methods). It is unpacked as 2-tuples, so it must be
        # defined or imported elsewhere in the module -- confirm.
        matches = (invert for (candidate, invert) in TYPES.values()
                   if candidate == wanted)
        return next(matches, None)

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete the tags matching every given criterion, then flush.

        At least one of the criteria must be truthy; criteria left as
        ``None`` are not used for filtering.
        """
        query = db.session.query(cls)
        assert document_id or origin or type
        criteria = (
            (cls.document_id, document_id),
            (cls.origin, origin),
            (cls.type, type),
        )
        for column, value in criteria:
            if value is not None:
                query = query.filter(column == value)
        query.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.text)
Пример #8
0
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""
    TEXT_LENGTH = 1024

    # Values stored in the ``type`` column.
    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'
    TYPE_COUNTRY = 'country'
    TYPE_LANGUAGE = 'language'

    # Maps each tag type to a field name.
    MAPPING = {
        TYPE_PERSON: 'namesMentioned',
        TYPE_ORGANIZATION: 'namesMentioned',
        TYPE_EMAIL: 'emailMentioned',
        TYPE_PHONE: 'phoneMentioned',
        TYPE_LOCATION: 'locationMentioned',
        TYPE_IP: 'ipMentioned',
        TYPE_IBAN: 'ibanMentioned',
        TYPE_COUNTRY: 'country',
        TYPE_LANGUAGE: 'language',
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(
        db.Integer(),
        db.ForeignKey('document.id'),
        index=True,
    )
    # Exposed on the document as ``tags``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'),
    )

    @property
    def field(self):
        """Return the ``group`` of the registry type named by ``self.type``.

        Returns ``None`` when the registry has no such type or the type has
        no group.
        """
        type_ = registry.get(self.type)
        if type_ is None:
            return None
        return type_.group

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete the tags matching every given criterion, then flush.

        At least one of the criteria must be truthy; criteria left as
        ``None`` are not used for filtering.
        """
        query = db.session.query(cls)
        assert document_id or origin or type
        criteria = (
            (cls.document_id, document_id),
            (cls.origin, origin),
            (cls.type, type),
        )
        for column, value in criteria:
            if value is not None:
                query = query.filter(column == value)
        query.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.text)
Пример #9
0
class EntityIdentity(db.Model, IdModel, DatedModel):
    """A judgement on whether a match refers to the same thing as an entity."""

    CONFIRMED = 1
    REJECTED = 2
    UNDECIDED = 3

    JUDGEMENTS = [CONFIRMED, REJECTED, UNDECIDED]

    entity_id = db.Column(
        db.String(32),
        db.ForeignKey('entity.id'),
        index=True,
    )
    # Exposed on the entity as a dynamic ``identities`` backref.
    entity = db.relationship(
        'Entity',
        backref=db.backref('identities', lazy='dynamic'),
    )
    match_id = db.Column(db.String(254), index=True, nullable=False)
    judgement = db.Column(db.Integer(), nullable=False)
    judge_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)

    @classmethod
    def judgements_by_entity(cls, entity_id):
        """Return a ``{match_id: judgement}`` dict for the given entity."""
        rows = db.session.query(cls.match_id, cls.judgement)
        rows = rows.filter(cls.entity_id == entity_id)
        return dict((match_id, judgement) for match_id, judgement in rows.all())

    @classmethod
    def entity_ids(cls, entity_id):
        """Return ``entity_id`` followed by all match IDs confirmed for it."""
        confirmed = db.session.query(cls.match_id)
        confirmed = confirmed.filter(cls.entity_id == entity_id)
        confirmed = confirmed.filter(cls.judgement == cls.CONFIRMED)
        return [entity_id] + [mapped_id for (mapped_id,) in confirmed.all()]

    @classmethod
    def by_entity_match(cls, entity_id, match_id):
        """Return the first identity for the entity/match pair, or ``None``."""
        return (db.session.query(cls)
                .filter(cls.entity_id == entity_id)
                .filter(cls.match_id == match_id)
                .first())

    @classmethod
    def save(cls, entity_id, match_id, judgement, judge=None):
        """Create or update the judgement for an entity/match pair.

        The object is added to the session and returned; nothing is flushed
        or committed here.
        """
        identity = cls.by_entity_match(entity_id, match_id)
        if identity is None:
            identity = cls()
            identity.entity_id = entity_id
            identity.match_id = match_id
        identity.judgement = judgement
        # NOTE(review): only a ``judge_id`` column is declared in this class.
        # Unless a ``judge`` relationship is defined elsewhere, this sets a
        # plain instance attribute that is not persisted -- confirm.
        identity.judge = judge
        db.session.add(identity)
        return identity

    def __repr__(self):
        fields = (self.entity_id, self.match_id, self.judgement)
        return 'EntityIdentity(%r, %r, %r)' % fields
Пример #10
0
class DocumentPage(db.Model):
    """One numbered page of text that belongs to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    # Exposed on the document as ``pages``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'),
    )

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)

    def to_dict(self):
        """Return the page's column values as a plain dictionary."""
        return dict(
            id=self.id,
            number=self.number,
            text=self.text,
            document_id=self.document_id,
        )
Пример #11
0
class DocumentRecord(db.Model):
    """A record reflects a row or page of a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(
        db.Integer(),
        db.ForeignKey('document.id'),
        index=True,
    )
    # Exposed on the document as ``records``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        "Document",
        backref=db.backref('records', cascade='all, delete-orphan'),
    )

    def text_parts(self):
        """Yield the usable text snippets of the record.

        Each value of ``data`` comes first, followed by ``text``; anything
        for which ``string_value`` returns ``None`` is skipped.
        """
        cells = self.data.values() if self.data is not None else ()
        for cell in cells:
            snippet = string_value(cell)
            if snippet is None:
                continue
            yield snippet
        body = string_value(self.text)
        if body is not None:
            yield body

    @classmethod
    def find_records(cls, document_id, ids):
        """Return a query for the document's records whose id is in ``ids``.

        An empty ``ids`` short-circuits to an empty list without building a
        query.
        """
        if len(ids) == 0:
            return []
        return (db.session.query(cls)
                .filter(cls.document_id == document_id)
                .filter(cls.id.in_(ids)))

    def to_dict(self):
        """Return the record's column values as a plain dictionary."""
        return dict(
            id=self.id,
            sheet=self.sheet,
            index=self.index,
            data=self.data,
            text=self.text,
            document_id=self.document_id,
        )

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.index)
Пример #12
0
class Reference(db.Model, IdModel, DatedModel):
    """A weighted link between a document and an entity."""

    id = db.Column(db.Integer(), primary_key=True)
    origin = db.Column(db.String(128))
    weight = db.Column(db.Integer)

    document_id = db.Column(
        db.BigInteger,
        db.ForeignKey('document.id'),
        index=True,
    )
    # Exposed on the document as a dynamic ``references`` backref.
    document = db.relationship(
        'Document',
        backref=db.backref('references', lazy='dynamic'),
    )

    entity_id = db.Column(
        db.String(32),
        db.ForeignKey('entity.id'),
        index=True,
    )
    # Exposed on the entity as a dynamic ``references`` backref.
    entity = db.relationship(
        'Entity',
        backref=db.backref('references', lazy='dynamic'),
    )

    @classmethod
    def index_references(cls, document_id):
        """Helper function to get reference data for indexing.

        Returns ``(entity_id, collection_id)`` rows for the references of
        ``document_id`` whose entity is in the active state.
        """
        # cf. aleph.index.entities.generate_entities()
        from aleph.model.entity import Entity
        query = db.session.query(Reference.entity_id, Entity.collection_id)
        query = query.filter(
            Reference.document_id == document_id,
            Entity.id == Reference.entity_id,
            Entity.state == Entity.STATE_ACTIVE,
        )
        return query.all()

    def to_dict(self):
        """Return the reference with a nested summary of its entity."""
        entity = self.entity
        entity_data = {
            'id': entity.id,
            'name': entity.name,
            '$schema': entity.type,
        }
        return {
            'entity': entity_data,
            'weight': self.weight,
            'origin': self.origin,
        }

    def __repr__(self):
        return '<Reference(%r, %r)>' % (self.document_id, self.entity_id)
Пример #13
0
class Reference(db.Model, IdModel, DatedModel):
    """A weighted link between a document and an entity."""

    id = db.Column(db.Integer(), primary_key=True)
    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'))
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'))
    weight = db.Column(db.Integer)

    # Both sides expose their references as a dynamic ``references`` backref.
    entity = db.relationship(
        Entity,
        backref=db.backref('references', lazy='dynamic'),
    )
    document = db.relationship(
        Document,
        backref=db.backref('references', lazy='dynamic'),
    )

    @classmethod
    def delete_document(cls, document_id):
        """Bulk-delete every reference that points at ``document_id``."""
        # ``cls.all()`` is not defined in this class body; it is presumably
        # inherited from one of the base classes -- TODO confirm.
        matching = cls.all().filter_by(document_id=document_id)
        matching.delete(synchronize_session='fetch')

    def __repr__(self):
        return '<Reference(%r, %r)>' % (self.document_id, self.entity_id)
Пример #14
0
class DocumentRecord(db.Model):
    """A record reflects a row or page of a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(db.Integer(),
                            db.ForeignKey('document.id'),
                            index=True)  # noqa
    # Exposed on the document as ``records``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        "Document",
        backref=db.backref('records', cascade='all, delete-orphan'))  # noqa

    @property
    def texts(self):
        """Utility method to get all text snippets in a record.

        Yields each value of ``data`` (when present) followed by ``text``.
        Values are yielded unfiltered, so ``None`` may be among them.
        """
        if self.data is not None:
            for value in self.data.values():
                yield value
        yield self.text

    @classmethod
    def find_records(cls, ids):
        """Return a query for the records whose id is in ``ids``.

        An empty ``ids`` short-circuits to an empty list without building a
        query.
        """
        if not len(ids):
            return []
        q = db.session.query(cls)
        q = q.filter(cls.id.in_(ids))
        return q

    @classmethod
    def by_index(cls, document_id, index):
        """Return the first record of a document at ``index``, or ``None``."""
        # Build the query once, from ``cls`` like ``find_records`` does. The
        # previous code created this query and immediately replaced it with
        # one hard-coded to ``DocumentRecord``.
        q = db.session.query(cls)
        q = q.filter(cls.document_id == document_id)
        q = q.filter(cls.index == index)
        return q.first()

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.index)
Пример #15
0
class EntityTag(db.Model):
    """Marks an entity as tagged on a package within a collection."""

    id = db.Column(db.Integer(), primary_key=True)
    collection = db.Column(db.Unicode(100))
    package_id = db.Column(db.Unicode(100))

    entity_id = db.Column(db.Unicode(50), db.ForeignKey('entity.id'))
    # Exposed on the entity as a dynamic ``tags`` backref.
    entity = db.relationship(
        Entity,
        backref=db.backref('tags', lazy='dynamic'),
    )

    created_at = db.Column(db.DateTime, default=datetime.utcnow)

    @classmethod
    def delete_set(cls, collection, package_id):
        """Bulk-delete all tags stored for one package of a collection."""
        tags = db.session.query(cls).filter_by(collection=collection)
        tags = tags.filter_by(package_id=package_id)
        tags.delete()

    @classmethod
    def by_package(cls, collection, package_id):
        """Return a list of dicts describing the entities tagged on a package.

        Each dict carries the entity id (under both ``id`` and ``entity``),
        its label, its category and its list id.
        """
        etag = aliased(cls)
        ent = aliased(Entity)
        columns = (etag.entity_id, ent.label, ent.category, ent.list_id)
        query = db.session.query(*columns)
        query = query.join(ent, ent.id == etag.entity_id)
        query = query.filter(etag.collection == collection)
        query = query.filter(etag.package_id == package_id)
        return [
            {
                'id': entity_id,
                'entity': entity_id,
                'label': label,
                'category': category,
                'list': lst,
            }
            for entity_id, label, category, lst in query.all()
        ]

    def __repr__(self):
        return '<EntityTag(%r, %r)>' % (self.package_id, self.entity_id)
class DocumentRecord(db.Model):
    """A row of tabular data, addressed by sheet and row, in a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=False)
    row_id = db.Column(db.Integer, nullable=False)
    data = db.Column(JSONB)
    document_id = db.Column(db.Integer(),
                            db.ForeignKey('document.id'),
                            index=True)  # noqa
    # Exposed on the document as ``records``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        Document, backref=db.backref('records',
                                     cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Return a SHA-1 hex digest of document id, sheet and row id."""
        # Hash objects only accept bytes on Python 3. Encoding the ASCII
        # output of str() leaves the digest unchanged on Python 2.
        tid = sha1(str(self.document_id).encode('utf-8'))
        tid.update(str(self.sheet).encode('utf-8'))
        tid.update(str(self.row_id).encode('utf-8'))
        return tid.hexdigest()

    def text_parts(self):
        """Yield every usable text snippet stored in the record's data.

        Values for which ``string_value`` returns ``None`` are skipped. A
        record without data yields nothing.
        """
        if self.data is None:
            # The ``data`` column is nullable, so there may be no cells.
            return
        for value in self.data.values():
            text = string_value(value)
            if text is not None:
                # Yield the normalized text, not the raw JSON value.
                yield text

    @classmethod
    def find_rows(cls, document_id, rows):
        """Return a query for the document's records matching ``rows``.

        ``rows`` is a sequence of items whose first two elements are the
        sheet and the row id. An empty ``rows`` short-circuits to an empty
        list without building a query.
        """
        if not len(rows):
            return []
        q = db.session.query(cls)
        q = q.filter(cls.document_id == document_id)
        # One (sheet AND row_id) clause per requested row, OR-ed together.
        clauses = [and_(cls.sheet == r[0], cls.row_id == r[1]) for r in rows]
        q = q.filter(or_(*clauses))
        return q

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.row_id)
Пример #17
0
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    # Values stored in the ``type`` column.
    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    key = db.Column(db.Unicode(1024), nullable=False, index=True)
    text = db.Column(db.Unicode(1024), nullable=True)

    document_id = db.Column(
        db.Integer(),
        db.ForeignKey('document.id'),
        index=True,
    )
    # Exposed on the document as ``tags``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'),
    )

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete the tags matching every given criterion, then flush.

        At least one of the criteria must be truthy; criteria left as
        ``None`` are not used for filtering.
        """
        query = db.session.query(cls)
        assert document_id or origin or type
        criteria = (
            (cls.document_id, document_id),
            (cls.origin, origin),
            (cls.type, type),
        )
        for column, value in criteria:
            if value is not None:
                query = query.filter(column == value)
        query.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.key)
Пример #18
0
class DocumentRecord(db.Model):
    """A record reflects a row or page of a document."""
    SCHEMA_ROW = 'Row'
    SCHEMA_PAGE = 'Page'

    id = db.Column(db.BigInteger, primary_key=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(
        db.Integer(),
        db.ForeignKey('document.id'),
        index=True,
    )
    # Exposed on the document as ``records``; the backref is configured with
    # the 'all, delete-orphan' cascade.
    document = db.relationship(
        "Document",
        backref=db.backref('records', cascade='all, delete-orphan'),
    )

    def raw_texts(self):
        """Yield each value of ``data`` (when present), then ``text``.

        Values are yielded unfiltered, so ``None`` may be among them.
        """
        cells = () if self.data is None else self.data.values()
        for cell in cells:
            yield cell
        yield self.text

    @property
    def texts(self):
        """Yield the raw texts after passing them through ``filter_texts``."""
        yield from filter_texts(self.raw_texts())

    @classmethod
    def insert_records(cls, document_id, iterable, chunk_size=1000):
        """Insert one record per item of ``iterable`` in multi-row batches.

        Each item becomes the ``data`` of a record whose ``index`` is the
        item's zero-based position. Rows are sent to the session in
        statements of at most ``chunk_size`` rows; nothing is committed here.
        """
        table = cls.__table__
        pending = []
        for index, data in enumerate(iterable):
            pending.append(dict(document_id=document_id,
                                index=index,
                                data=data))
            if len(pending) < chunk_size:
                continue
            db.session.execute(table.insert().values(pending))
            pending = []

        # Send whatever is left over after the last full batch.
        if pending:
            db.session.execute(table.insert().values(pending))

    def to_proxy(self):
        """Build an entity proxy for this record.

        A record with text becomes a ``Page`` proxy carrying the body text;
        otherwise it becomes a ``Row`` proxy carrying the data values,
        ordered by key, as packed cells.
        """
        is_page = self.text is not None
        schema = self.SCHEMA_PAGE if is_page else self.SCHEMA_ROW
        proxy = model.make_entity(schema)
        proxy.make_id('record', self.id)
        # Pages link to their document, rows to their table.
        proxy.set('document' if is_page else 'table', self.document_id)
        proxy.set('index', self.index)
        if is_page:
            proxy.set('bodyText', stringify(self.text))
        elif self.data is not None:
            cells = [value for (_, value) in sorted(self.data.items())]
            proxy.set('cells', registry.json.pack(cells))
        return proxy

    def to_dict(self):
        """Return the proxy's full dict plus ``document_id`` and ``bulk``."""
        result = self.to_proxy().to_full_dict()
        result.update({
            'document_id': self.document_id,
            'bulk': False,
        })
        return result

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.index)
Пример #19
0
class Entity(db.Model):
    """A labelled, categorised entity that belongs to a list."""

    id = db.Column(db.Unicode(50), primary_key=True, default=make_textid)
    label = db.Column(db.Unicode)
    category = db.Column(
        db.Enum(*CATEGORIES, name='entity_categories'),
        nullable=False,
    )

    creator_id = db.Column(db.Integer(), db.ForeignKey('user.id'))
    # Exposed on the user as a dynamic ``entities`` backref.
    creator = db.relationship(
        User,
        backref=db.backref('entities',
                           lazy='dynamic',
                           cascade='all, delete-orphan'),
    )

    list_id = db.Column(db.Integer(), db.ForeignKey('list.id'))
    # Exposed on the list as a dynamic ``entities`` backref.
    list = db.relationship(
        'List',
        backref=db.backref('entities',
                           lazy='dynamic',
                           cascade='all, delete-orphan'),
    )

    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(
        db.DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    def to_dict(self):
        """Return the entity as a plain dictionary, including its API URL."""
        selector_texts = [s.text for s in self.selectors]
        return dict(
            id=self.id,
            api_url=url_for('entities.view', id=self.id),
            label=self.label,
            category=self.category,
            creator_id=self.creator_id,
            selectors=selector_texts,
            list=self.list_id,
            created_at=self.created_at,
            updated_at=self.updated_at,
        )

    def has_selector(self, text):
        """Return whether a selector's normalized form matches ``text``."""
        normalized = Selector.normalize(text)
        return any(selector.normalized == normalized
                   for selector in self.selectors)

    def delete(self):
        """Mark this entity for deletion in the current session."""
        db.session.delete(self)

    @classmethod
    def create(cls, data, user):
        """Build an entity from ``data``, owned by ``user``, and add it."""
        entity = cls()
        entity.update(data)
        entity.creator = user
        db.session.add(entity)
        return entity

    def update(self, data):
        """Apply form data and bring the entity's selectors in line with it.

        The label is always kept as a selector. Existing selectors whose
        text is no longer wanted are deleted, and one new selector is added
        for each wanted text that does not exist yet.
        """
        data = EntityForm().deserialize(data)
        self.label = data.get('label')
        self.list = data.get('list')
        self.category = data.get('category')

        wanted = set(data.get('selectors'))
        wanted.add(self.label)

        # Keep the first existing selector for each wanted text; everything
        # else (unwanted texts and duplicates) is stale.
        stale = []
        for selector in self.selectors:
            if selector.text in wanted:
                wanted.remove(selector.text)
            else:
                stale.append(selector)
        for selector in stale:
            db.session.delete(selector)

        # Whatever is still wanted has no selector yet.
        for text in wanted:
            selector = Selector()
            selector.entity = self
            selector.text = text
            db.session.add(selector)

    @classmethod
    def by_normalized_label(cls, label, lst):
        """Return the first entity of ``lst`` whose label compares equal.

        The comparison is delegated to ``db_compare``.
        """
        return (db.session.query(cls)
                .filter_by(list=lst)
                .filter(db_compare(cls.label, label))
                .first())

    @classmethod
    def by_id(cls, id):
        """Return the entity with the given id, or ``None``."""
        return db.session.query(cls).filter_by(id=id).first()

    @classmethod
    def by_lists(cls, lists, prefix=None):
        """Return a query for the entities of ``lists``, ordered by label.

        When a non-empty ``prefix`` is given, only entities with a matching
        selector are included.
        """
        query = db.session.query(cls).filter(cls.list_id.in_(lists))
        if prefix is not None and len(prefix):
            query = query.join(Selector, cls.id == Selector.entity_id)
            query = cls.apply_filter(query, Selector.normalized, prefix)
        return query.order_by(cls.label.asc())

    @classmethod
    def by_id_set(cls, ids):
        """Return an ``{id: entity}`` dict for the given ids."""
        if len(ids) == 0:
            return {}
        found = db.session.query(cls).filter(cls.id.in_(ids))
        return dict((entity.id, entity) for entity in found)

    @classmethod
    def apply_filter(cls, q, col, prefix):
        """Restrict ``q`` to rows where ``col`` has a word starting with prefix.

        The normalized prefix may match at the very start of the column or
        directly after a space.
        """
        needle = Selector.normalize(prefix)
        at_start = col.like('%s%%' % needle)
        after_space = col.like('%% %s%%' % needle)
        return q.filter(or_(at_start, after_space))

    @classmethod
    def suggest_prefix(cls, prefix, lists, limit=10):
        """Return up to ``limit`` tagged entities matching ``prefix``.

        Only entities in ``lists`` that have at least one tag are considered.
        The result is a list of dicts with ``id``, ``label`` and
        ``category``, ordered by label; an empty prefix yields an empty list.
        """
        from aleph.model import EntityTag
        ent = aliased(Entity)
        sel = aliased(Selector)
        tag = aliased(EntityTag)
        query = (db.session.query(ent.id, ent.label, ent.category)
                 .join(sel, ent.id == sel.entity_id)
                 .join(tag, ent.id == tag.entity_id)
                 .filter(ent.list_id.in_(lists)))
        if not (prefix is not None and len(prefix)):
            return []
        query = cls.apply_filter(query, sel.normalized, prefix)
        query = query.order_by(ent.label.asc()).limit(limit).distinct()
        return [
            {'id': entity_id, 'label': label, 'category': category}
            for entity_id, label, category in query.all()
        ]

    @property
    def terms(self):
        """Return the set of normalized selector texts of this entity."""
        return set(selector.normalized for selector in self.selectors)

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.label)

    def __unicode__(self):
        return self.label
Пример #20
0
class List(db.Model):
    """A labelled list of entities that can be public or shared with users."""

    id = db.Column(db.Integer(), primary_key=True)
    label = db.Column(db.Unicode)
    public = db.Column(db.Boolean, default=False)

    creator_id = db.Column(
        db.Integer(),
        db.ForeignKey('user.id'),
        nullable=True,
    )
    creator = db.relationship(User)

    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(
        db.DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    # Users linked to the list through ``list_user_table``.
    users = db.relationship(User, secondary=list_user_table, backref='lists')

    def to_dict(self):
        """Return the list as a plain dictionary, including its API URLs."""
        return dict(
            id=self.id,
            api_url=url_for('lists.view', id=self.id),
            entities_api_url=url_for('entities.index', list=self.id),
            label=self.label,
            public=self.public,
            creator_id=self.creator_id,
            created_at=self.created_at,
            updated_at=self.updated_at,
        )

    @classmethod
    def create(cls, data, user):
        """Build a list from ``data``, owned by ``user``, and add it."""
        created = cls()
        created.update(data, user)
        created.creator = user
        db.session.add(created)
        return created

    def update(self, data, user):
        """Apply form data to the list.

        ``public`` is only changed when the form provides a value. ``user``,
        when given, is always included among the list's users.
        """
        data = ListForm().deserialize(data)
        self.label = data.get('label')
        public = data.get('public')
        if public is not None:
            self.public = public
        members = set(data.get('users', []))
        if user is not None:
            members.add(user)
        self.users = list(members)

    def delete(self):
        """Mark this list for deletion in the current session."""
        # for entity in self.entities:
        #     entity.delete()
        db.session.delete(self)

    @classmethod
    def by_label(cls, label):
        """Return the first list with the given label, or ``None``."""
        return db.session.query(cls).filter_by(label=label).first()

    @classmethod
    def by_id(cls, id):
        """Return the list with the given id, or ``None``."""
        return db.session.query(cls).filter_by(id=id).first()

    @classmethod
    def user_list_ids(cls, user=None, include_public=True):
        """Return the ids of the lists that ``user`` may see.

        Public lists are included when ``include_public`` is set, and lists
        the user belongs to when the user is authenticated. An authenticated
        admin gets every list id without filtering. With no applicable
        condition the result is an empty list.
        """
        is_logged_in = user is not None and user.is_authenticated()
        query = db.session.query(cls.id)
        visible = []
        if include_public:
            visible.append(cls.public == True)  # noqa
        if is_logged_in:
            visible.append(cls.users.any(User.id == user.id))
        if len(visible) == 0:
            return []
        is_admin = is_logged_in and user.is_admin
        if not is_admin:
            query = query.filter(or_(*visible))
        return [row.id for row in query.all()]

    @classmethod
    def all_by_user(cls, user):
        """Return a query for the lists visible to ``user``, newest id first."""
        visible_ids = cls.user_list_ids(user)
        return (db.session.query(cls)
                .filter(cls.id.in_(visible_ids))
                .order_by(cls.id.desc()))

    @property
    def terms(self):
        """Return the set of normalized selectors of this list's entities."""
        from aleph.model.entity import Entity
        from aleph.model.selector import Selector
        rows = (db.session.query(Selector.normalized)
                .join(Entity, Entity.id == Selector.entity_id)
                .filter(Entity.list_id == self.id)
                .distinct())
        return set(row[0] for row in rows)

    def __repr__(self):
        return '<List(%r, %r)>' % (self.id, self.label)

    def __unicode__(self):
        return self.label
Пример #21
0
class Entity(db.Model, UuidModel, SoftDeleteModel, SchemaModel):
    """An entity record that belongs to a collection.

    The ``type`` column is the polymorphic discriminator; this base class
    registers its own ``_schema`` value as its polymorphic identity.
    """
    _schema = '/entity/entity.json#'
    _schema_recurse = True

    name = db.Column(db.Unicode)
    type = db.Column('type', db.String(255), index=True)
    summary = db.Column(db.Unicode, nullable=True)
    description = db.Column(db.Unicode, nullable=True)
    jurisdiction_code = db.Column(db.Unicode, nullable=True)

    __mapper_args__ = {'polymorphic_on': type, 'polymorphic_identity': _schema}

    collection_id = db.Column(db.Integer(), db.ForeignKey('collection.id'))
    collection = db.relationship(Collection,
                                 backref=db.backref(
                                     'entities',
                                     lazy='dynamic',
                                     cascade='all, delete-orphan'))  # noqa

    def delete(self):
        """Delete all ``Reference`` rows for this entity, then the entity.

        The entity itself is removed by delegating to the parent classes'
        ``delete``.
        """
        # Imported locally, as in the original.
        from aleph.model import Reference
        q = db.session.query(Reference)
        q = q.filter(Reference.entity_id == self.id)
        q.delete(synchronize_session='fetch')
        super(Entity, self).delete()

    def update(self, data, merge=False):
        """Apply ``data`` to this entity via ``schema_update``."""
        self.schema_update(data, merge=merge)

    @classmethod
    def save(cls, data, collection_id=None, merge=False):
        """Find or create the entity described by ``data`` and update it.

        Lookup order: by ``data['id']``, then by each entry of
        ``data['identifiers']`` (scheme + identifier, optionally limited to
        ``collection_id``). If nothing matches, a new instance of the class
        registered for ``data['$schema']`` (default: this class's schema) is
        created with a fresh text id.

        Returns the updated entity.
        """
        ent = cls.by_id(data.get('id'))
        for identifier in data.get('identifiers', []):
            if ent is not None:
                # First match wins; no need to query further identifiers.
                break
            ent = cls.by_identifier(identifier.get('scheme'),
                                    identifier.get('identifier'),
                                    collection_id=collection_id)
        if ent is None:
            schema = data.get('$schema', cls._schema)
            entity_cls = cls.get_schema_class(schema)
            ent = entity_cls()
            ent.id = make_textid()
            if collection_id is not None:
                ent.collection_id = collection_id
        ent.update(data, merge=merge)
        return ent

    @property
    def terms(self):
        """List of non-empty name terms; currently only ``name`` itself."""
        terms = set([self.name])
        # for other_name in self.other_names:
        #    terms.update(other_name.terms)
        return [t for t in terms if t is not None and len(t)]

    @classmethod
    def by_identifier(cls, scheme, identifier, collection_id=None):
        """Return the first non-deleted entity with the given identifier.

        Both the entity and the matching identifier row must have
        ``deleted_at`` unset. When ``collection_id`` is given, the entity
        must also belong to that collection. Returns None if nothing matches.
        """
        ent = aliased(Entity)
        q = db.session.query(ent)
        q = q.filter(ent.deleted_at == None)  # noqa
        if collection_id is not None:
            q = q.filter(ent.collection_id == collection_id)

        ident = aliased(EntityIdentifier)
        q = q.join(ident, ent.identifiers)
        q = q.filter(ident.deleted_at == None)  # noqa
        q = q.filter(ident.scheme == scheme)
        q = q.filter(ident.identifier == identifier)
        return q.first()

    @classmethod
    def by_id_set(cls, ids, collection_id=None):
        """Return a dict mapping id -> entity for the given ``ids``.

        An empty ``ids`` yields an empty dict without querying. When
        ``collection_id`` is given, only entities of that collection are
        included.
        """
        if not len(ids):
            return {}
        q = cls.all()
        q = q.filter(cls.id.in_(ids))
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        entities = {}
        for ent in q:
            entities[ent.id] = ent
        return entities

    @classmethod
    def suggest_prefix(cls, prefix, collections, limit=10):
        """Suggest entities whose name, or a word within it, starts with
        ``prefix`` (case-insensitive).

        Only non-deleted entities in ``collections`` are considered and at
        most ``limit`` rows are returned. Each suggestion is a dict with the
        keys ``id``, ``name`` and ``$schema`` (the entity's ``type``).
        A None, empty or whitespace-only prefix yields an empty list.
        """
        if prefix is None:
            return []
        # Strip before the emptiness check: a whitespace-only prefix would
        # otherwise turn into the pattern '%' and match every entity name.
        prefix = prefix.strip()
        if not prefix:
            return []
        ent = aliased(Entity)
        q = db.session.query(ent.id, ent.name, ent.type)
        q = q.filter(ent.deleted_at == None)  # noqa
        q = q.filter(ent.collection_id.in_(collections))
        q = q.filter(
            or_(ent.name.ilike('%s%%' % prefix),
                ent.name.ilike('%% %s%%' % prefix)))
        q = q.limit(limit)
        suggestions = []
        for entity_id, name, schema in q.all():
            suggestions.append({
                'id': entity_id,
                'name': name,
                '$schema': schema
            })
        return suggestions

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)

    def __unicode__(self):
        return self.name

    def to_dict(self):
        """Return the parent classes' dict form plus ``collection_id``."""
        data = super(Entity, self).to_dict()
        data['collection_id'] = self.collection_id
        return data
Пример #22
0
class DocumentRecord(db.Model):
    """A record reflects a row or page of a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(db.Integer(),
                            db.ForeignKey('document.id'),
                            index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('records', cascade='all, delete-orphan'))  # noqa

    def raw_texts(self):
        """Utility method to get all text snippets in a record.

        Yields every value of ``data`` (when set) followed by ``text``.
        Nothing is filtered here, so ``text`` is yielded even when None.
        """
        if self.data is not None:
            for value in self.data.values():
                yield value
        yield self.text

    @property
    def texts(self):
        """Generator over ``raw_texts()`` passed through ``filter_texts``."""
        yield from filter_texts(self.raw_texts())

    @classmethod
    def insert_records(cls, document_id, iterable, chunk_size=1000):
        """Bulk-insert one record per item of ``iterable``.

        Each item becomes the ``data`` of a row whose ``index`` is its
        position in ``iterable``. Rows are sent in multi-row INSERT
        statements of at most ``chunk_size`` rows. Nothing is committed here.
        """
        chunk = []
        table = cls.__table__
        for index, data in enumerate(iterable):
            chunk.append({
                'document_id': document_id,
                'index': index,
                'data': data
            })
            if len(chunk) >= chunk_size:
                q = table.insert().values(chunk)
                db.session.execute(q)
                chunk = []

        # Flush the final, partially filled chunk.
        if len(chunk):
            q = table.insert().values(chunk)
            db.session.execute(q)

    @classmethod
    def find_records(cls, ids):
        """Return the records whose id is in ``ids``.

        Returns an empty list when ``ids`` is empty, otherwise an
        un-evaluated query object; both are iterable.
        """
        if not len(ids):
            return []
        q = db.session.query(cls)
        q = q.filter(cls.id.in_(ids))
        return q

    @classmethod
    def by_index(cls, document_id, index):
        """Return the first record of a document at ``index``, or None."""
        # The original built this query twice (the second time against the
        # hard-coded class), discarding the first; one query on ``cls`` is
        # enough and matches the filters below.
        q = db.session.query(cls)
        q = q.filter(cls.document_id == document_id)
        q = q.filter(cls.index == index)
        return q.first()

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.index)
Пример #23
0
class IdModel(object):
    """Plain base class contributing an integer primary-key column ``id``."""
    id = db.Column(db.Integer(), primary_key=True)
Пример #24
0
class Role(db.Model, RoleMixin):
    """Role model: integer id, unique name (max 80 chars) and description."""
    id = db.Column(db.Integer(), primary_key=True)
    name = db.Column(db.String(80), unique=True)
    description = db.Column(db.String(255))
Пример #25
0
from aleph.model.util import make_token
from aleph.model.forms import UserForm
from flask.ext.security import Security, SQLAlchemyUserDatastore, \
    UserMixin, RoleMixin, login_required
from flask.ext.security.utils import encrypt_password, get_hmac

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


@login_manager.user_loader
def load_user(id):
    """Return the ``User`` with the given id, or None.

    Registered as the login manager's user loader. ``id`` is converted to
    an int before the lookup; an id that cannot be converted (None or a
    non-numeric string) yields None instead of raising.
    """
    try:
        user_id = int(id)
    except (TypeError, ValueError):
        # A user loader is expected to return None for an invalid id
        # rather than let the exception escape.
        return None
    return User.query.get(user_id)


# Table pairing users with roles: two foreign-key columns, one referencing
# user.id and one referencing role.id.
roles_users = db.Table(
    'roles_users', db.Column('user_id', db.Integer(),
                             db.ForeignKey('user.id')),
    db.Column('role_id', db.Integer(), db.ForeignKey('role.id')))


class Role(db.Model, RoleMixin):
    """Role model: integer id, unique name (max 80 chars) and description."""
    id = db.Column(db.Integer(), primary_key=True)
    name = db.Column(db.String(80), unique=True)
    description = db.Column(db.String(255))


class User(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    email = db.Column(
        db.Unicode,
        #following attributes are for flask-user
Пример #26
0
class CrawlerState(db.Model):
    """Report the state of a file being processed."""

    # A crawler whose latest state row is newer than this is reported as
    # still running by ``crawler_stats``.
    TIMEOUT = timedelta(minutes=60)

    STATUS_OK = 'ok'
    STATUS_FAIL = 'fail'

    id = db.Column(db.BigInteger, primary_key=True)
    crawler_id = db.Column(db.Unicode(), index=True)
    crawler_run = db.Column(db.Unicode(), nullable=True)
    content_hash = db.Column(db.Unicode(65), nullable=True)
    foreign_id = db.Column(db.Unicode, nullable=True)
    status = db.Column(db.Unicode(10), nullable=False)
    error_type = db.Column(db.Unicode(), nullable=True)
    error_message = db.Column(db.Unicode(), nullable=True)
    error_details = db.Column(db.Unicode(), nullable=True)
    meta = db.Column(JSONB)
    collection_id = db.Column(db.Integer(),
                              db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref(
                                     'crawl_states',
                                     cascade='all, delete-orphan'))  # noqa
    created_at = db.Column(db.DateTime, default=datetime.utcnow)

    @classmethod
    def _from_meta(cls, meta, collection_id):
        """Create a state row from ``meta`` and add it to the session.

        ``status`` is left unset; the ``store_*`` callers fill it in.
        """
        obj = cls()
        obj.collection_id = collection_id
        obj.crawler_id = meta.crawler
        obj.crawler_run = meta.crawler_run
        obj.foreign_id = meta.foreign_id
        obj.content_hash = meta.content_hash
        obj.meta = expand_json(meta.to_attr_dict(compute=True))
        db.session.add(obj)
        return obj

    @classmethod
    def store_stub(cls, collection_id, crawler_id, crawler_run):
        """Add a placeholder OK row (``error_type='init'``) for a run."""
        obj = cls()
        obj.collection_id = collection_id
        obj.crawler_id = crawler_id
        obj.crawler_run = crawler_run
        obj.error_type = 'init'
        obj.status = cls.STATUS_OK
        db.session.add(obj)
        return obj

    @classmethod
    def store_ok(cls, meta, collection_id):
        """Add a row with status OK for the file described by ``meta``."""
        obj = cls._from_meta(meta, collection_id)
        obj.status = cls.STATUS_OK
        return obj

    @classmethod
    def store_fail(cls,
                   meta,
                   collection_id,
                   error_type=None,
                   error_message=None,
                   error_details=None):
        """Add a row with status FAIL plus the given error information."""
        obj = cls._from_meta(meta, collection_id)
        obj.status = cls.STATUS_FAIL
        obj.error_type = error_type
        obj.error_message = error_message
        obj.error_details = error_details
        return obj

    @classmethod
    def crawler_last_run(cls, crawler_id):
        """Return ``(crawler_run, created_at)`` of the crawler's newest row.

        Returns ``(None, None)`` when the crawler has no rows.
        """
        q = db.session.query(cls.crawler_run, cls.created_at)
        q = q.filter(cls.crawler_id == crawler_id)
        q = q.order_by(cls.created_at.desc())
        q = q.limit(1)
        res = q.first()
        if res is None:
            return None, None
        return (res.crawler_run, res.created_at)

    @classmethod
    def crawler_stats(cls, crawler_id):
        """Return run statistics for a crawler.

        The result has the keys ``running`` (bool), ``last`` and ``all``.
        ``last`` and ``all`` each hold ``ok`` and ``fail`` row counts, for
        the latest run and for every run respectively; ``last`` also holds
        ``updated`` and ``run_id``. All counts are reported as 0 when no
        latest run id is known.
        """
        stats = {}
        last_run_id, last_run_time = cls.crawler_last_run(crawler_id)

        # Check if the crawler was active very recently, if so, don't
        # allow the user to execute a new run right now.
        timeout = (datetime.utcnow() - CrawlerState.TIMEOUT)
        stats['running'] = last_run_time > timeout if last_run_time else False

        q = db.session.query(func.count(cls.id))
        q = q.filter(cls.crawler_id == crawler_id)
        for section in ['last', 'all']:
            data = {}
            sq = q
            if section == 'last':
                sq = sq.filter(cls.crawler_run == last_run_id)
            okq = sq.filter(cls.status == cls.STATUS_OK)
            data['ok'] = okq.scalar() if last_run_id else 0
            failq = sq.filter(cls.status == cls.STATUS_FAIL)
            data['fail'] = failq.scalar() if last_run_id else 0
            stats[section] = data
        stats['last']['updated'] = last_run_time
        stats['last']['run_id'] = last_run_id
        return stats

    @classmethod
    def all(cls):
        """Return an unfiltered query over ``CrawlerState`` rows."""
        return db.session.query(CrawlerState)

    def to_dict(self):
        """Return the row's column values as a plain dict."""
        return {
            'id': self.id,
            'status': self.status,
            'crawler_id': self.crawler_id,
            'crawler_run': self.crawler_run,
            'content_hash': self.content_hash,
            'foreign_id': self.foreign_id,
            'error_type': self.error_type,
            'error_message': self.error_message,
            'error_details': self.error_details,
            'meta': self.meta,
            'collection_id': self.collection_id,
            'created_at': self.created_at
        }

    def __repr__(self):
        return '<CrawlerState(%r,%r)>' % (self.id, self.status)

    def __unicode__(self):
        # Must return text: returning the integer id directly makes
        # ``unicode(obj)`` raise TypeError on Python 2.
        return u'%s' % self.id
Пример #27
0
class Alert(db.Model):
    '''
    A saved search query that is re-checked for a user at a fixed interval.

    Also consider adding:
    - active/inactive
    '''
    id = db.Column(db.Integer, primary_key=True)
    user_id = db.Column(db.Integer(), db.ForeignKey('user.id'))
    user = db.relationship(User, backref=db.backref('alerts'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    checked_at = db.Column(
        db.DateTime,
        default=None,
    )
    # NOTE(review): with Flask-SQLAlchemy a column named ``query`` shadows
    # the ``Model.query`` attribute on this class -- confirm nothing relies
    # on ``Alert.query`` being a query object.
    query = db.Column(db.Unicode)
    label = db.Column(db.Unicode)
    # number of days between checks. None == 'never check'
    checking_interval = db.Column(db.Integer, default=None)

    def due_to_check(self):
        '''
        Return True if it is time to run this query.

        NB We expect this script to run at nearly-but-not-precisely
        the same time each day, and we want to run at an intuitive
        'once per day', rather than skipping because today's run
        has happened a few seconds earlier.
        Therefore we allow 2 hours of wiggle room. We aren't worried about
        sending duplicate alerts, because that will be handled precisely by
        filtering result insert dates against the checked_at field.
        '''
        if self.checking_interval is None:  # query is disabled
            return False
        if self.checked_at is None:  # query is being run for the first time
            return True
        min_check_date = datetime.utcnow() - timedelta(
            days=self.checking_interval) + timedelta(hours=2)
        return self.checked_at <= min_check_date

    def mark_as_checked(self):
        '''
        Set checked_at to the current UTC time and commit the session.
        '''
        self.checked_at = datetime.utcnow()
        db.session.add(self)
        db.session.commit()

    def to_dict(self):
        '''
        Return the alert's column values as a plain dict.
        '''
        attrs = ('id', 'label', 'query', 'checking_interval', 'user_id',
                 'created_at', 'checked_at')
        return {attr: getattr(self, attr) for attr in attrs}

    @property
    def search_url(self):
        '''
        where to go to reach the original search
        '''
        # ``query`` is a nullable column; quote_plus() raises on None, so an
        # unset query is treated as an empty search string.
        return 'http://search.openoil.net/#/search?q=' + urllib.parse.quote_plus(
            self.query or '')

    @classmethod
    def by_id(cls, id, role=None):
        '''
        Return the alert with the given id, or None.

        When role is given, the lookup is additionally restricted to
        that role.
        '''
        q = db.session.query(cls).filter_by(id=id)
        if role is not None:  # only applies if we are using authz roles
            # NOTE(review): no ``role_id`` attribute is defined on this
            # class as shown (only ``user_id``); passing a role presumably
            # raises AttributeError here -- confirm the intended column.
            q = q.filter(cls.role_id == role.id)
        return q.first()
Пример #28
0
class Document(db.Model, DatedModel):
    """A document from a source, with JSON metadata, pages and records."""

    TYPE_TEXT = 'text'
    TYPE_TABULAR = 'tabular'
    TYPE_OTHER = 'other'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=False, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    type = db.Column(db.Unicode(10), nullable=False, index=True)
    source_id = db.Column(db.Integer(), db.ForeignKey('source.id'), index=True)
    source = db.relationship(Source,
                             backref=db.backref(
                                 'documents',
                                 lazy='dynamic',
                                 cascade='all, delete-orphan'))  # noqa
    # Raw metadata, stored in the database column named 'meta'.
    _meta = db.Column('meta', JSONB)

    @property
    def title(self):
        """Title taken from the document's metadata."""
        return self.meta.title

    @hybrid_property
    def meta(self):
        """Metadata wrapper around the stored JSON.

        Reading it also writes the current ``content_hash`` and
        ``foreign_id`` into the stored dict.
        """
        self._meta = self._meta or {}
        self._meta['content_hash'] = self.content_hash
        self._meta['foreign_id'] = self.foreign_id
        return Metadata(data=self._meta or {})

    @meta.setter
    def meta(self, meta):
        # A Metadata object also updates the hash / foreign id columns and
        # is unwrapped to its raw data; anything else is stored as given.
        if isinstance(meta, Metadata):
            self.content_hash = meta.content_hash
            self.foreign_id = meta.foreign_id
            meta = meta.data
        self._meta = meta
        flag_modified(self, '_meta')

    def delete_pages(self):
        """Bulk-delete this document's pages, then refresh the instance."""
        pages = db.session.query(DocumentPage).filter(
            DocumentPage.document_id == self.id)
        pages.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_records(self):
        """Bulk-delete this document's records, then refresh the instance."""
        records = db.session.query(DocumentRecord).filter(
            DocumentRecord.document_id == self.id)
        records.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def insert_records(self, sheet, iterable, chunk_size=1000):
        """Bulk-insert one record per item of ``iterable`` for ``sheet``.

        Each item becomes the ``data`` of a mapping whose ``row_id`` is the
        item's position. Mappings are handed to the session in batches of at
        most ``chunk_size``.
        """
        pending = []
        for row_id, row in enumerate(iterable):
            pending.append({
                'document_id': self.id,
                'row_id': row_id,
                'sheet': sheet,
                'data': row
            })
            if len(pending) < chunk_size:
                continue
            db.session.bulk_insert_mappings(DocumentRecord, pending)
            pending = []

        # Whatever is left over forms the last, smaller batch.
        if pending:
            db.session.bulk_insert_mappings(DocumentRecord, pending)

    def text_parts(self):
        """Utility method to get all text snippets in a document.

        Yields ``(text, owner)`` pairs: non-empty page texts with their page
        for text documents, non-empty string values with their record for
        tabular documents.
        """
        if self.type == Document.TYPE_TEXT:
            for page in self.pages:
                if page.text is None or not len(page.text):
                    continue
                yield page.text, page
        if self.type == Document.TYPE_TABULAR:
            for record in self.records:
                for value in record.data.values():
                    if not isinstance(value, basestring) or not len(value):
                        continue
                    yield value, record

    @classmethod
    def get_max_id(cls):
        """Return the result of ``max(id)`` over all documents."""
        return db.session.query(func.max(cls.id)).scalar()

    def __repr__(self):
        fields = (self.id, self.type, self.meta.title)
        return '<Document(%r,%r,%r)>' % fields

    def _add_to_dict(self, data):
        """Merge the document's own columns into ``data`` and return it."""
        own_fields = {
            'id': self.id,
            'type': self.type,
            'source_id': self.source_id,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        }
        data.update(own_fields)
        return data

    def to_dict(self):
        """Metadata ``to_dict()`` output plus the document's own columns."""
        return self._add_to_dict(self.meta.to_dict())

    def to_index_dict(self):
        """Metadata ``to_index_dict()`` output plus the document's columns."""
        return self._add_to_dict(self.meta.to_index_dict())