Exemplo n.º 1
0
    def _generate(self):
        """Rebuild the term automaton when newer entities exist.

        Compares ``Entity.latest()`` against the cached ``self.latest`` marker
        and returns early when the cache is not older. Otherwise every regex
        term of the active entities is mapped to the list of entity ids that
        carry it, and the mapping is loaded into a fresh ``Automaton``. When
        no term was collected, ``self.automaton`` is set to ``None``.
        """
        newest = Entity.latest()
        if self.latest is not None and self.latest >= newest:
            return
        self.latest = newest

        active = Entity.all().filter(Entity.state == Entity.STATE_ACTIVE)
        ids_by_term = {}
        for entity in active:
            for term in entity.regex_terms:
                # One list of ids per term; created on first sight.
                ids_by_term.setdefault(term, []).append(entity.id)

        if not ids_by_term:
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entity_ids in ids_by_term.iteritems():
            # The automaton is fed UTF-8 encoded terms.
            self.automaton.add_word(term.encode('utf-8'), entity_ids)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(ids_by_term))
Exemplo n.º 2
0
    def _generate(self):
        """Rebuild the term-matching regexes when newer entities exist.

        Returns early when ``self.latest`` is already at least as recent as
        ``Entity.latest()``. Otherwise ``self.matches`` is rebuilt as a mapping
        from each normalized term to the set of active entity ids using it,
        and ``self.regexes`` is rebuilt as a list of compiled alternations,
        each covering at most ``BATCH_SIZE`` terms. Terms of two characters
        or fewer are left out of the regexes (they remain in ``self.matches``).
        """
        newest = Entity.latest()
        if self.latest is not None and self.latest >= newest:
            return

        self.latest = newest
        self.matches = defaultdict(set)

        query = Entity.all().options(joinedload('other_names'))
        query = query.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in query:
            for term in entity.regex_terms:
                self.matches[normalize_strong(term)].add(entity.id)

        self.regexes = []
        terms = [t for t in self.matches.keys() if len(t) > 2]
        offset = 0
        while True:
            batch = terms[offset:offset + BATCH_SIZE]
            if not batch:
                break
            # Each term must be delimited by a space or the string boundary.
            pattern = '( |^)(%s)( |$)' % '|'.join(batch)
            self.regexes.append(re.compile(pattern))
            offset += BATCH_SIZE

        log.info('Generating entity tagger: %r (%s terms)', newest, len(terms))
Exemplo n.º 3
0
def delete_collection(collection, keep_metadata=False, sync=False):
    """Delete the contents of a collection and, optionally, the collection.

    Cancels the collection's queue, drops its aggregator, flushes
    notifications, removes its entities and xref data from the index and
    deletes the related database rows. Unless ``keep_metadata`` is set, the
    linkages, permissions and the collection record itself are deleted too,
    followed by the collection's index entry and an ``Authz`` flush.

    :param collection: the collection object to delete.
    :param keep_metadata: when true, keep linkages, permissions and the
        collection record, and skip the index/authz cleanup.
    :param sync: forwarded to the notification, entity-index and xref-index
        deletions.
    """
    cancel_queue(collection)
    aggregator = get_aggregator(collection)
    try:
        aggregator.drop()
    finally:
        # Close the aggregator even when dropping it fails.
        aggregator.close()
    flush_notifications(collection, sync=sync)
    index.delete_entities(collection.id, sync=sync)
    xref_index.delete_xref(collection, sync=sync)
    # Prefer a deletion timestamp already set on the collection; else use now.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=deleted_at)
    Mapping.delete_by_collection(collection.id, deleted_at=deleted_at)
    Diagram.delete_by_collection(collection.id, deleted_at=deleted_at)
    # Documents are deleted without a timestamp argument.
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        # Considering linkages metadata for now, might be wrong:
        Linkage.delete_by_collection(collection.id)
        Permission.delete_by_collection(collection.id, deleted_at=deleted_at)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        # NOTE(review): this call always passes sync=True, regardless of the
        # ``sync`` argument — confirm this is intentional.
        index.delete_collection(collection.id, sync=True)
        Authz.flush()
    # The refresh runs in both modes and always synchronously.
    refresh_collection(collection.id, sync=True)
Exemplo n.º 4
0
def delete_collection(collection_id, wait=False):
    """Delete a collection and the objects attached to it.

    Looks the collection up by id; logs an error and returns when it does
    not exist. Otherwise the index is cleared first, then matches,
    permissions, documents and entities are deleted, and finally the
    collection itself, all within one committed session.

    :param collection_id: id of the collection to delete.
    :param wait: forwarded to ``index_delete``.
    """
    # Done step by step because many associated objects are affected.
    collection = (db.session.query(Collection)
                  .filter(Collection.id == collection_id)
                  .first())
    if collection is None:
        log.error("No collection with ID: %r", collection_id)
        return

    log.info("Deleting collection [%r]: %r", collection.id, collection.label)
    now = datetime.utcnow()
    index_delete(collection_id, wait=wait)

    log.info("Delete cross-referencing matches...")
    Match.delete_by_collection(collection_id)

    log.info("Delete permissions...")
    Permission.delete_by_collection(collection_id, deleted_at=now)

    log.info("Delete documents...")
    Document.delete_by_collection(collection_id, deleted_at=now)

    log.info("Delete entities...")
    Entity.delete_by_collection(collection_id, deleted_at=now)

    collection.delete(deleted_at=now)
    db.session.commit()
Exemplo n.º 5
0
 def setUp(self):
     """Create a user, two collections and two indexed entities."""
     super(EntitiesTestCase, self).setUp()
     self.rolex = self.create_user(foreign_id='user_3')

     original = Collection()
     original.label = 'Original Collection'
     original.foreign_id = 'test_coll_entities'
     self.col = original
     db.session.add(original)

     secondary = Collection()
     secondary.label = 'Other Collection'
     secondary.foreign_id = 'test_coll_entities_other'
     self.col_other = secondary
     db.session.add(secondary)
     db.session.flush()

     # Both entities are created in the first collection.
     pooh_data = {
         'schema': 'LegalEntity',
         'properties': {
             'name': 'Winnie the Pooh',
             'country': 'pa',
             'summary': 'a fictional teddy bear created by A. A. Milne',
             'alias': ['Puh der Bär', 'Pooh Bear']
         }
     }
     self.ent = Entity.create(pooh_data, self.col)

     pu_data = {
         'schema': 'LegalEntity',
         'properties': {
             'name': 'Pu der Bär',
             'country': 'de',
             'description': 'he is a bear',
             'alias': ['Puh der Bär']
         }
     }
     self.other = Entity.create(pu_data, self.col)
     db.session.commit()

     for created in (self.ent, self.other):
         index_entity(created)
Exemplo n.º 6
0
 def load_fixtures(self):
     """Create a private and a public collection with sample entities.

     The private collection is granted to the system user role and the
     public one to the system guest role (read flag ``True``, write flag
     ``False``). After committing, the sample entities are indexed into the
     private collection and both collections are processed.
     """
     private_data = {
         'foreign_id': 'test_private',
         'label': "Private Collection",
         'category': 'grey'
     }
     self.private_coll = Collection.create(private_data)
     banana_data = {
         'schema': 'Person',
         'properties': {
             'name': ['Banana'],
         }
     }
     self._banana = Entity.create(banana_data, self.private_coll)
     system_user = Role.by_foreign_id(Role.SYSTEM_USER)
     Permission.grant(self.private_coll, system_user, True, False)

     public_data = {
         'foreign_id': 'test_public',
         'label': "Public Collection",
         'category': 'news'
     }
     self.public_coll = Collection.create(public_data)
     kwazulu_data = {
         'schema': 'Company',
         'properties': {
             'name': ['KwaZulu'],
             'alias': ['kwazulu']
         }
     }
     self._kwazulu = Entity.create(kwazulu_data, self.public_coll)
     guest = Role.by_foreign_id(Role.SYSTEM_GUEST)
     Permission.grant(self.public_coll, guest, True, False)
     db.session.commit()

     fixture_path = self.get_fixture_path('samples.ijson')
     index_entities(self.private_coll, read_entities(fixture_path))
     # The public collection is processed before the private one.
     for coll in (self.public_coll, self.private_coll):
         process_collection(coll, ingest=False, reset=True)
Exemplo n.º 7
0
def upsert_entity(data,
                  collection,
                  authz=None,
                  sync=False,
                  sign=False,
                  job_id=None):
    """Create or update a database entity and propagate the change.

    As a side effect this migrates entities that were created via the _bulk
    API or a mapper into database entities once a user edits them.

    The entity is looked up by the namespace-signed ``data["id"]``; it is
    updated when found and created otherwise (recording ``authz.id`` as the
    role when ``authz`` is given). Its proxy then replaces the aggregator
    record for that id, is indexed, and an ``OP_UPDATE_ENTITY`` task is
    queued.

    :returns: the id of the created or updated entity.
    """
    entity_id = collection.ns.sign(data.get("id"))
    existing = None
    if entity_id is not None:
        existing = Entity.by_id(entity_id, collection=collection)

    if existing is not None:
        existing.update(data, collection, sign=sign)
        entity = existing
    else:
        role_id = None if authz is None else authz.id
        entity = Entity.create(data, collection, sign=sign, role_id=role_id)

    proxy = entity.to_proxy()
    # Replace whatever the aggregator currently holds for this entity.
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=entity.id)
    aggregator.put(proxy, origin=MODEL_ORIGIN)

    index.index_proxy(collection, proxy, sync=sync)
    refresh_entity(collection, entity.id)
    queue_task(collection,
               OP_UPDATE_ENTITY,
               job_id=job_id,
               entity_id=entity.id)
    return entity.id
Exemplo n.º 8
0
    def _generate(self):
        """Load entity terms into ``self.automaton`` when newer entities exist.

        Does nothing when there are no entities (``Entity.latest()`` is
        ``None``) or when ``self.latest`` is already at least as recent.
        Otherwise each regex term of every entity whose ``type`` is mapped in
        ``self.TYPES`` is associated with a list of ``(entity.name, tag)``
        tuples, the terms are added to ``self.automaton`` (UTF-8 encoded) and
        the automaton is finalized. Nothing is added when no term was found.
        """
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        for entity in Entity.all():
            # The tag depends only on the entity, so resolve it once per
            # entity instead of once per term, and skip unmapped entities
            # before touching their terms.
            type_ = self.TYPES.get(entity.type)
            if type_ is None:
                continue
            for term in entity.regex_terms:
                matches.setdefault(term, []).append((entity.name, type_))

        if not matches:
            return

        for term, entities in matches.iteritems():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Exemplo n.º 9
0
    def _generate(self):
        """Rebuild the term-matching regexes when newer entities exist.

        Returns early when ``self.latest`` is already at least as recent as
        ``Entity.latest()``. Otherwise ``self.matches`` is rebuilt as a mapping
        from each normalized term to the set of active entity ids using it,
        and ``self.regexes`` is rebuilt as a list of compiled alternations,
        each covering at most ``BATCH_SIZE`` terms. Terms of two characters
        or fewer are left out of the regexes (they remain in ``self.matches``).
        """
        newest = Entity.latest()
        if self.latest is not None and self.latest >= newest:
            return

        self.latest = newest
        self.matches = defaultdict(set)

        query = Entity.all().options(joinedload('other_names'))
        query = query.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in query:
            for term in entity.regex_terms:
                self.matches[normalize_strong(term)].add(entity.id)

        self.regexes = []
        terms = [t for t in self.matches.keys() if len(t) > 2]
        offset = 0
        while True:
            batch = terms[offset:offset + BATCH_SIZE]
            if not batch:
                break
            # Each term must be delimited by a space or the string boundary.
            pattern = '( |^)(%s)( |$)' % '|'.join(batch)
            self.regexes.append(re.compile(pattern))
            offset += BATCH_SIZE

        log.info('Generating entity tagger: %r (%s terms)',
                 newest, len(terms))
Exemplo n.º 10
0
 def setUp(self):
     """Create a user, two collections and two entities, then commit."""
     super(EntitiesTestCase, self).setUp()
     self.rolex = self.create_user(foreign_id='user_3')

     original = Collection()
     original.label = 'Original Collection'
     original.foreign_id = 'test_coll_entities'
     self.col = original
     db.session.add(original)

     secondary = Collection()
     secondary.label = 'Other Collection'
     secondary.foreign_id = 'test_coll_entities_other'
     self.col_other = secondary
     db.session.add(secondary)
     db.session.flush()

     # Both entities are created in the first collection.
     pooh_data = {
         'schema': 'LegalEntity',
         'properties': {
             'name': 'Winnie the Pooh',
             'country': 'pa',
             'summary': 'a fictional teddy bear created by A. A. Milne',
             'alias': ['Puh der Bär', 'Pooh Bear']
         }
     }
     self.ent = Entity.create(pooh_data, self.col)

     pu_data = {
         'schema': 'LegalEntity',
         'properties': {
             'name': 'Pu der Bär',
             'country': 'de',
             'description': 'he is a bear',
             'alias': ['Puh der Bär']
         }
     }
     self.other = Entity.create(pu_data, self.col)
     db.session.commit()
Exemplo n.º 11
0
def delete_entities(collection_id, deleted_at=None):
    """Delete a collection's entities and cross-referencing matches.

    Entities are deleted from the database and from the index; matches are
    deleted from the database.

    :param collection_id: id of the collection whose data is deleted.
    :param deleted_at: timestamp passed to the model deletions; the current
        UTC time is used when it is not given.
    """
    if not deleted_at:
        deleted_at = datetime.utcnow()

    log.info("Deleting entities...")
    Entity.delete_by_collection(collection_id, deleted_at=deleted_at)
    index.delete_entities(collection_id)

    log.info("Deleting cross-referencing matches...")
    Match.delete_by_collection(collection_id, deleted_at=deleted_at)
Exemplo n.º 12
0
def update(id):
    """Save submitted data onto an existing entity and return its view.

    Responds via ``obj_or_404`` when the entity lookup fails. The saved
    entity stays in its current collection; the ``merge`` request flag is
    forwarded to ``Entity.save``. After the commit an analysis task is
    scheduled for the entity.
    """
    existing = obj_or_404(Entity.by_id(id))
    payload = get_data(entity=existing)
    saved = Entity.save(payload,
                        collection_id=existing.collection_id,
                        merge=arg_bool('merge'))
    db.session.commit()
    analyze_entity.delay(saved.id)
    return view(saved.id)
Exemplo n.º 13
0
def update(id):
    """Save submitted data onto an existing entity and return its view.

    Responds via ``obj_or_404`` when the entity lookup fails. The saved
    entity stays in its current collection; the ``merge`` request flag is
    forwarded to ``Entity.save``. After the commit an analysis task is
    scheduled for the entity.
    """
    existing = obj_or_404(Entity.by_id(id))
    payload = get_data(entity=existing)
    saved = Entity.save(payload,
                        collection_id=existing.collection_id,
                        merge=arg_bool('merge'))
    db.session.commit()
    analyze_entity.delay(saved.id)
    return view(saved.id)
Exemplo n.º 14
0
def cleanup_deleted():
    """Run ``cleanup_deleted`` on each model, then commit the session.

    The models are imported inside the function and processed in a fixed
    order: alerts, permissions, entities, collections, roles.
    """
    from aleph.model import (Alert, Entity, Collection,
                             Permission, Role)
    for model in (Alert, Permission, Entity, Collection, Role):
        model.cleanup_deleted()
    db.session.commit()
Exemplo n.º 15
0
 def setUp(self):
     """Create a user, two collections, one entity in each, and an alert."""
     super(EntitiesTestCase, self).setUp()
     self.rolex = self.create_user(foreign_id='user_3')

     original = Collection()
     original.label = 'Original Collection'
     original.foreign_id = 'test_coll_entities'
     self.col = original
     db.session.add(original)

     secondary = Collection()
     secondary.label = 'Other Collection'
     secondary.foreign_id = 'test_coll_entities_other'
     self.col_other = secondary
     db.session.add(secondary)
     db.session.flush()

     pooh_data = {
         'name': 'Winnie the Pooh',
         'jurisdiction_code': 'pa',
         'summary': 'a fictional teddy bear created by author A. A. Milne',
         'identifiers': [
             {'scheme': 'wikipedia', 'identifier': 'en:Winnie-the-Pooh'},
         ],
         'other_names': [
             {'name': u'Puh der Bär'},
             {'name': 'Pooh Bear'},
         ]
     }
     self.ent = Entity.save(pooh_data, [self.col])
     db.session.add(self.ent)
     db.session.flush()

     # The second entity shares the wikipedia identifier with the first.
     pu_data = {
         'name': 'Pu der Bär',
         'jurisdiction_code': 'de',
         'description': 'he is a bear',
         'identifiers': [
             {'scheme': 'wikipedia', 'identifier': 'en:Winnie-the-Pooh'},
             {'scheme': 'animals', 'identifier': 'bears.winnie.pooh'},
         ],
         'other_names': [
             {'name': u'Puh der Bär'},
         ]
     }
     self.other = Entity.save(pu_data, [self.col_other])
     db.session.add(self.other)

     alert = Alert()
     self.alert = alert
     alert.entity = self.other
     db.session.add(alert)
     db.session.commit()
Exemplo n.º 16
0
def merge(id, other_id):
    """Merge the entity ``other_id`` into the entity ``id``.

    Each entity is looked up (via ``obj_or_404``) and checked for write
    access before the next one is touched. After the merge is committed,
    ``update_entity`` is called for both entities and the view of the first
    one is returned.
    """
    target = obj_or_404(Entity.by_id(id))
    check_authz(target, authz.WRITE)
    source = obj_or_404(Entity.by_id(other_id))
    check_authz(source, authz.WRITE)

    target.merge(source)
    db.session.commit()

    for changed in (target, source):
        update_entity(changed)
    return view(target.id)
Exemplo n.º 17
0
def merge(id, other_id):
    """Merge the entity ``other_id`` into the entity ``id``.

    Each entity is looked up (via ``obj_or_404``) and checked for write
    access before the next one is touched. After the merge is committed,
    ``update_entity`` is called for both entities and the view of the first
    one is returned.
    """
    target = obj_or_404(Entity.by_id(id))
    check_authz(target, authz.WRITE)
    source = obj_or_404(Entity.by_id(other_id))
    check_authz(source, authz.WRITE)

    target.merge(source)
    db.session.commit()

    for changed in (target, source):
        update_entity(changed)
    return view(target.id)
Exemplo n.º 18
0
def delete_collection(collection, sync=False):
    """Delete a collection together with its entities, documents and grants.

    The collection is reset and its notifications flushed first. Database
    rows are then deleted and committed before the collection is removed
    from the index and ``Authz`` is flushed.

    :param collection: the collection object to delete.
    :param sync: forwarded to ``index.delete_collection`` only; the reset
        step is always invoked with ``sync=False``.
    """
    reset_collection(collection, sync=False)
    flush_notifications(collection)

    # Prefer a deletion timestamp already set on the collection; else use now.
    stamp = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=stamp)
    # Documents are deleted without a timestamp argument.
    Document.delete_by_collection(collection.id)
    Permission.delete_by_collection(collection.id, deleted_at=stamp)
    collection.delete(deleted_at=stamp)
    db.session.commit()

    index.delete_collection(collection.id, sync=sync)
    Authz.flush()
Exemplo n.º 19
0
    def load_entity(self, fk, name, schema):
        """Return the entity with foreign id ``fk``, creating it if needed.

        The lookup is scoped to ``self.collection`` and passes
        ``deleted=True``. When nothing is found, a new entity in the pending
        state is saved with the given name and schema.
        """
        found = Entity.by_foreign_id(fk, self.collection.id, deleted=True)
        if found is None:
            pending = {
                'name': name,
                'schema': schema,
                'foreign_ids': [fk],
                'state': Entity.STATE_PENDING,
                'data': {}
            }
            return Entity.save(pending, self.collection)
        return found
Exemplo n.º 20
0
def load_fixture(name):
    """Load the fixture directory ``name`` into a freshly created list.

    Reads ``mapping.yaml`` and ``data.csv`` from the fixture directory. Any
    existing list with the configured label is deleted and recreated. CSV
    rows are grouped by ``(label, category)``, collecting the non-empty
    selector values of each group, and one entity is created per group.
    Entities that fail validation are logged and skipped.

    :param name: directory name below ``fixtures_path``.
    :raises ValueError: when the fixture directory does not exist.
    :raises AssertionError: when the configured default category is not in
        ``CATEGORIES``.
    """
    dir_name = os.path.join(fixtures_path, name)
    if not os.path.isdir(dir_name):
        raise ValueError("No such directory: %r" % dir_name)

    with open(os.path.join(dir_name, 'mapping.yaml'), 'rb') as fh:
        # NOTE(review): yaml.load can construct arbitrary Python objects.
        # Fixture files are presumably trusted; consider yaml.safe_load if
        # that assumption ever changes.
        data = yaml.load(fh)

    # Replace any list left over from a previous load of this fixture.
    lst = List.by_label(data.get('list'))
    if lst is not None:
        lst.delete()
        db.session.commit()

    lst = List.create(
        {
            'label': data.get('list'),
            'public': data.get('public'),
            'users': []
        }, None)
    log.info("Loading %r", lst)

    mapping = data.get('mapping')
    default_category = data.get('default_category')
    assert default_category in CATEGORIES, default_category

    # Column names are fixed for the whole file; resolve them once.
    label_column = mapping.get('label', 'label')
    category_column = mapping.get('category', 'category')
    selector_column = mapping.get('selector', 'selector')

    entities = defaultdict(set)
    with open(os.path.join(dir_name, 'data.csv'), 'rb') as fh:
        for row in unicodecsv.DictReader(fh):
            label = row.get(label_column)
            if label is None:
                continue

            category = row.get(category_column) or default_category

            # Touch the group even when the row has no selector, so that a
            # label without selectors still yields an entity.
            group = entities[(label, category)]
            selector = row.get(selector_column)
            if selector:
                group.add(selector)

    for (label, category), selectors in entities.items():
        entity_data = {
            'label': label,
            'category': category,
            'selectors': selectors,
            'list': lst
        }
        try:
            Entity.create(entity_data, None)
        except Invalid as inv:
            log.warn("Failed: %s", inv)
Exemplo n.º 21
0
def cleanup_deleted():
    """Run ``cleanup_deleted`` on each model, then commit the session.

    The models are imported inside the function and processed in a fixed
    order: mappings, diagrams, documents, alerts, permissions, entities,
    collections, roles.
    """
    from aleph.model import (Alert, Entity, Collection,
                             Permission, Role, Document,
                             Diagram, Mapping)
    ordered_models = (Mapping, Diagram, Document, Alert,
                      Permission, Entity, Collection, Role)
    for model in ordered_models:
        model.cleanup_deleted()
    db.session.commit()
Exemplo n.º 22
0
def update(id):
    """Update an entity from the request data and return its view.

    The caller needs write access to the entity. The submitted collections
    are restricted to those the caller may write to plus those the entity
    already belongs to. The ``merge`` request flag is forwarded to
    ``Entity.save``; after the commit ``update_entity`` is called.
    """
    entity = obj_or_404(Entity.by_id(id))
    check_authz(entity, authz.WRITE)
    data = request_data()
    data['id'] = entity.id
    # Build a private set instead of extending the list returned by
    # authz.collections() in place: if that list is shared (presumably
    # cached per request — verify), mutating it would widen the caller's
    # apparent write access for later checks.
    allowed_ids = set(authz.collections(authz.WRITE))
    allowed_ids.update(c.id for c in entity.collections)
    data['collections'] = [c for c in get_collections(data)
                           if c.id in allowed_ids]
    entity = Entity.save(data, merge=arg_bool('merge'))
    db.session.commit()
    update_entity(entity)
    return view(entity.id)
Exemplo n.º 23
0
def delete_collection(collection, sync=False):
    """Delete a collection together with its entities, matches and grants.

    Notifications are flushed and the aggregator dropped first. Database
    rows are deleted and committed before the index is cleaned up, the
    collection refreshed and ``Authz`` flushed.

    :param collection: the collection object to delete.
    :param sync: forwarded to ``index.delete_collection`` only; the entity
        index deletion is always invoked with ``sync=False``.
    """
    flush_notifications(collection)
    drop_aggregator(collection)

    # Prefer a deletion timestamp already set on the collection; else use now.
    stamp = collection.deleted_at or datetime.utcnow()
    for model in (Entity, Match, Permission):
        model.delete_by_collection(collection.id, deleted_at=stamp)
    collection.delete(deleted_at=stamp)
    db.session.commit()

    index.delete_collection(collection.id, sync=sync)
    index.delete_entities(collection.id, sync=False)
    refresh_collection(collection.id)
    Authz.flush()
Exemplo n.º 24
0
 def setUp(self):
     """Create a user, one collection and one entity inside it."""
     super(CollectionsApiTestCase, self).setUp()
     self.rolex = self.create_user(foreign_id='user_3')

     coll = Collection()
     coll.label = 'Test Collection'
     coll.foreign_id = 'test_coll_entities_api'
     self.col = coll
     db.session.add(coll)
     db.session.flush()

     # The entity is attached to the collection before it is populated.
     entity = Entity()
     self.ent = entity
     entity.collection = self.col
     entity.update({'name': 'Winnie the Pooh'})
     db.session.add(entity)
     db.session.commit()
Exemplo n.º 25
0
def update(id):
    """Update an entity from the request data and return its view.

    The caller needs write access to the entity. The submitted collections
    are restricted to those the caller may write to plus those the entity
    already belongs to. The ``merge`` request flag is forwarded to
    ``Entity.save``; after the commit ``update_entity`` is called.
    """
    entity = obj_or_404(Entity.by_id(id))
    check_authz(entity, authz.WRITE)
    data = request_data()
    data['id'] = entity.id
    # Build a private set instead of extending the list returned by
    # authz.collections() in place: if that list is shared (presumably
    # cached per request — verify), mutating it would widen the caller's
    # apparent write access for later checks.
    allowed_ids = set(authz.collections(authz.WRITE))
    allowed_ids.update(c.id for c in entity.collections)
    data['collections'] = [
        c for c in get_collections(data) if c.id in allowed_ids
    ]
    entity = Entity.save(data, merge=arg_bool('merge'))
    db.session.commit()
    update_entity(entity)
    return view(entity.id)
Exemplo n.º 26
0
 def setUp(self):
     """Create a user, two collections, one entity in each, and an alert."""
     super(EntitiesTestCase, self).setUp()
     self.rolex = self.create_user(foreign_id='user_3')

     original = Collection()
     original.label = 'Original Collection'
     original.foreign_id = 'test_coll_entities'
     self.col = original
     db.session.add(original)

     secondary = Collection()
     secondary.label = 'Other Collection'
     secondary.foreign_id = 'test_coll_entities_other'
     self.col_other = secondary
     db.session.add(secondary)
     db.session.flush()

     pooh_data = {
         'name': 'Winnie the Pooh',
         'collections': [self.col],
         'jurisdiction_code': 'pa',
         'summary': 'a fictional teddy bear created by author A. A. Milne',
         'identifiers': [
             {'scheme': 'wikipedia', 'identifier': 'en:Winnie-the-Pooh'},
         ],
         'other_names': [
             {'name': u'Puh der Bär'},
             {'name': 'Pooh Bear'},
         ]
     }
     self.ent = Entity.save(pooh_data)
     db.session.add(self.ent)
     db.session.flush()

     # The second entity shares the wikipedia identifier with the first.
     pu_data = {
         'name': 'Pu der Bär',
         'collections': [self.col_other],
         'jurisdiction_code': 'de',
         'description': 'he is a bear',
         'identifiers': [
             {'scheme': 'wikipedia', 'identifier': 'en:Winnie-the-Pooh'},
             {'scheme': 'animals', 'identifier': 'bears.winnie.pooh'},
         ],
         'other_names': [
             {'name': u'Puh der Bär'},
         ]
     }
     self.other = Entity.save(pu_data)
     db.session.add(self.other)

     alert = Alert()
     self.alert = alert
     alert.entity = self.other
     db.session.add(alert)
     db.session.commit()
Exemplo n.º 27
0
def delete_collection(collection, keep_metadata=False, sync=False):
    """Delete the contents of a collection and, optionally, the collection.

    Resets the collection, then deletes its entities, mappings, diagrams and
    documents. Unless ``keep_metadata`` is set, the permissions and the
    collection record are deleted as well, followed by the collection's
    index entry and an ``Authz`` flush.

    :param collection: the collection object to delete.
    :param keep_metadata: when true, keep permissions and the collection
        record, and skip the index/authz cleanup.
    :param sync: forwarded to ``index.delete_collection`` only.
    """
    # The reset step is always invoked with sync=False.
    reset_collection(collection, sync=False)
    # Prefer a deletion timestamp already set on the collection; else use now.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=deleted_at)
    Mapping.delete_by_collection(collection.id, deleted_at=deleted_at)
    Diagram.delete_by_collection(collection.id, deleted_at=deleted_at)
    # Documents are deleted without a timestamp argument.
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        Permission.delete_by_collection(collection.id, deleted_at=deleted_at)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        # Index and authz cleanup happen only after the commit above.
        index.delete_collection(collection.id, sync=sync)
        Authz.flush()
    # The refresh runs in both modes and always synchronously.
    refresh_collection(collection.id, sync=True)
Exemplo n.º 28
0
    def load_fixtures(self):
        """Create an admin, a private and a public collection with entities.

        The private collection is granted to the system user role and the
        public one to the system guest role (read flag ``True``, write flag
        ``False``). After the commit the public collection's aggregator is
        dropped and the collection processed; the private collection's
        aggregator is emptied, refilled with the sample entities (each one
        indexed right away) and the collection processed.
        """
        self.admin = self.create_user(foreign_id='admin', is_admin=True)
        self.private_coll = self.create_collection(foreign_id='test_private',
                                                   label="Private Collection",
                                                   category='grey',
                                                   casefile=False,
                                                   creator=self.admin)
        self._banana = Entity.create(
            {
                'schema': 'Person',
                'properties': {
                    'name': ['Banana'],
                }
            }, self.private_coll)
        user = Role.by_foreign_id(Role.SYSTEM_USER)
        Permission.grant(self.private_coll, user, True, False)
        self.public_coll = self.create_collection(foreign_id='test_public',
                                                  label="Public Collection",
                                                  category='news',
                                                  casefile=False,
                                                  creator=self.admin)
        self._kwazulu = Entity.create(
            {
                'schema': 'Company',
                'properties': {
                    'name': ['KwaZulu'],
                    'alias': ['kwazulu']
                }
            }, self.public_coll)
        visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
        Permission.grant(self.public_coll, visitor, True, False)
        db.session.commit()

        drop_aggregator(self.public_coll)
        stage = get_stage(self.public_coll, OP_PROCESS)
        process_collection(stage, self.public_coll, ingest=False, sync=True)

        aggregator = get_aggregator(self.private_coll)
        try:
            aggregator.delete()
            stage = get_stage(self.private_coll, OP_PROCESS)
            samples = read_entities(self.get_fixture_path('samples.ijson'))
            for sample in samples:
                aggregator.put(sample, fragment='sample')
                index_aggregate(stage,
                                self.private_coll,
                                entity_id=sample.id,
                                sync=True)
        finally:
            # Release the aggregator even when loading a sample fails, so a
            # broken fixture does not leak it into later tests.
            aggregator.close()
        process_collection(stage, self.private_coll, ingest=False, sync=True)
Exemplo n.º 29
0
def delete(id):
    """Delete an entity the caller may write to and report success.

    The lookup goes through ``obj_or_404`` and write access is checked
    before ``delete_entity`` runs. The event is logged after the commit.
    """
    target = obj_or_404(Entity.by_id(id))
    check_authz(target, authz.WRITE)

    delete_entity(target)
    db.session.commit()

    log_event(request, entity_id=target.id)
    return jsonify({'status': 'ok'})
Exemplo n.º 30
0
Arquivo: spindle.py Projeto: 01-/aleph
    def crawl_collection(self, collection):
        """Mirror one remote Spindle collection into a local collection.

        Remote collections without subjects are skipped. Otherwise the local
        collection is looked up (or created) by the remote URL, the remote
        permissions are granted on it, and every named remote entity is
        looked up (or created) locally. Local entities that no longer appear
        remotely are deleted, and the collected terms are emitted.

        :param collection: dict describing the remote collection; the keys
            ``id``, ``title`` and ``subjects`` are read.
        """
        # A missing, empty or null subject list means nothing to crawl.
        if not collection.get('subjects'):
            return
        url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
        # Keep the remote dict and the local object under separate names.
        local = Collection.by_foreign_id(url, {
            'label': collection.get('title')
        })
        res = requests.get('%s/permissions' % url, headers=self.HEADERS)
        for perm in res.json().get('results', []):
            Permission.grant_foreign(local, perm.get('role'),
                                     perm.get('read'), perm.get('write'))

        log.info(" > Spindle collection: %s", local.label)
        res = requests.get('%s/entities' % url, headers=self.HEADERS)
        terms = set()
        # A set keeps the membership test in the cleanup loop constant-time.
        seen_ids = set()
        for entity in res.json().get('results', []):
            if entity.get('name') is None:
                continue
            aliases = [on.get('alias') for on in entity.get('other_names', [])]
            ent = Entity.by_foreign_id(entity.get('id'), local, {
                'name': entity.get('name'),
                'category': SCHEMATA.get(entity.get('$schema'), OTHER),
                'data': entity,
                'selectors': aliases
            })
            terms.update(ent.terms)
            seen_ids.add(ent.id)
            log.info("  # %s (%s)", ent.name, ent.category)

        # Drop local entities that are no longer present remotely.
        for entity in local.entities:
            if entity.id not in seen_ids:
                entity.delete()
        self.emit_collection(local, terms)
Exemplo n.º 31
0
def all():
    """Return a paged JSON listing of the active entities the caller may read.

    An entity qualifies when at least one of its collections is among the
    caller's readable collections. Results are ordered by ascending id and
    paged with a limit of 100.
    """
    query = Entity.all().filter(Entity.state == Entity.STATE_ACTIVE)
    readable = Collection.id.in_(authz.collections(authz.READ))
    query = query.filter(Entity.collections.any(readable))
    return jsonify(Pager(query.order_by(Entity.id.asc()), limit=100))
Exemplo n.º 32
0
def delete(id):
    """Delete an entity from a watchlist the caller may write to.

    The lookup goes through ``obj_or_404`` and write access to the entity's
    watchlist is required. After the commit an analysis task is scheduled
    for the given id.
    """
    target = obj_or_404(Entity.by_id(id))
    may_write = authz.watchlist_write(target.watchlist_id)
    authz.require(may_write)

    target.delete()
    db.session.commit()

    analyze_entity.delay(id)
    return jsonify({"status": "ok"})
Exemplo n.º 33
0
    def crawl_collection(self, collection):
        """Mirror one remote Spindle collection into a local watchlist.

        Remote collections with an empty subject list are skipped. Otherwise
        the watchlist is looked up (or created) by the remote URL, the remote
        permissions are granted on it, and every named remote entity is
        looked up (or created) in it. Watchlist entities not seen remotely
        are deleted, and the terms that differ between the previous and the
        updated term sets are emitted.

        :param collection: dict describing the remote collection; the keys
            ``id``, ``title`` and ``subjects`` are read.
        """
        if len(collection.get('subjects', [])) == 0:
            return

        url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
        watchlist = Watchlist.by_foreign_id(url, {
            'label': collection.get('title')
        })

        perm_response = requests.get('%s/permissions' % url,
                                     headers=self.HEADERS)
        for grant in perm_response.json().get('results', []):
            Permission.grant_foreign(watchlist, grant.get('role'),
                                     grant.get('read'), grant.get('write'))

        log.info(" > Spindle collection: %s", watchlist.label)
        entity_response = requests.get('%s/entities' % url,
                                       headers=self.HEADERS)
        # Snapshot the terms before any entity is created or updated.
        terms_before = watchlist.terms
        terms_after = set()
        kept_ids = []
        for remote in entity_response.json().get('results', []):
            if remote.get('name') is None:
                continue
            aliases = [on.get('alias') for on in remote.get('other_names', [])]
            payload = {
                'name': remote.get('name'),
                'category': SCHEMATA.get(remote.get('$schema'), OTHER),
                'data': remote,
                'selectors': aliases
            }
            ent = Entity.by_foreign_id(remote.get('id'), watchlist, payload)
            terms_after.update(ent.terms)
            kept_ids.append(ent.id)
            log.info("  # %s (%s)", ent.name, ent.category)

        watchlist.delete_entities(spare=kept_ids)
        # Only terms present in exactly one of the two sets are emitted.
        changed = terms_before.symmetric_difference(terms_after)
        self.emit_watchlist(watchlist, changed)
Exemplo n.º 34
0
    def build_automaton(self):
        """Build an automaton over the match forms of known entity names.

        Only entities whose schema is a key of ``self.TYPES`` are considered.
        Names that are ``None``, longer than 120 characters, or for which
        ``self.match_form`` yields ``None`` are skipped. Each match form maps
        to a list of ``(name, tag)`` tuples.

        :returns: the finalized ``Automaton``, or ``None`` when no match
            form was collected.
        """
        candidates = Entity.all().filter(
            Entity.schema.in_(self.TYPES.keys()))

        tagged_by_form = {}
        for entity in candidates:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                form = self.match_form(name)
                if form is not None:
                    tagged_by_form.setdefault(form, []).append((name, tag))

        if not tagged_by_form:
            return

        automaton = Automaton()
        for form, tagged in tagged_by_form.iteritems():
            automaton.add_word(form, tagged)
        automaton.make_automaton()
        return automaton
Exemplo n.º 35
0
def delete(id):
    """Delete an entity the caller may write to and report success.

    The lookup goes through ``obj_or_404`` and write access is checked
    before the entity is deleted. ``update_entity`` is called after the
    commit.
    """
    target = obj_or_404(Entity.by_id(id))
    check_authz(target, authz.WRITE)

    target.delete()
    db.session.commit()

    update_entity(target)
    return jsonify({'status': 'ok'})
Exemplo n.º 36
0
def transform_facets(aggregations):
    """Reshape search aggregations into source, list and attribute facets.

    :param aggregations: nested dict of aggregation results.
    :returns: dict with the keys ``sources`` (collection buckets), ``lists``
        (per requested list id, the entity buckets whose entity could be
        loaded, each extended with an ``entity`` key) and ``attributes``
        (per requested attribute facet, its value buckets).
    """
    sources = (aggregations.get('all', {})
               .get('ftr', {})
               .get('collections', {})
               .get('buckets', []))

    lists = {}
    for list_id in get_list_facets(request.args):
        inner = aggregations.get('list_%s' % list_id, {}).get('inner', {})
        buckets = inner.get('entities', {}).get('buckets', [])
        loaded = Entity.by_id_set([bucket.get('key') for bucket in buckets])
        resolved = []
        for bucket in buckets:
            # Every bucket gets the key, but only resolvable ones are kept.
            bucket['entity'] = loaded.get(bucket.get('key'))
            if bucket['entity'] is not None:
                resolved.append(bucket)
        lists[list_id] = resolved

    attributes = {}
    for attr in request.args.getlist('attributefacet'):
        inner = aggregations.get('attr_%s' % attr, {}).get('inner', {})
        attributes[attr] = inner.get('values', {}).get('buckets', [])

    return {
        'sources': sources,
        'lists': lists,
        'attributes': attributes
    }
Exemplo n.º 37
0
def load_entities():
    """Load every active entity into the graph within one transaction."""
    transaction = get_graph().begin()
    active = Entity.all().filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in active:
        load_entity(transaction, entity)
    # Committed once, after all entities were handed to load_entity.
    transaction.commit()
Exemplo n.º 38
0
def records_query(document_id, args, size=5):
    """Build the search request body for records matching the arguments.

    Search terms are the stripped ``q`` argument (when non-empty) plus the
    ``terms`` of every entity named by the ``entity`` arguments. Each term
    contributes two ``match`` clauses: a boosted one on ``text`` and one
    on ``text_latin`` that uses the latinized form of the term.

    :param document_id: when not ``None``, results are restricted to
        records carrying this ``document_id``.
    :param args: request arguments; must provide ``get`` and ``getlist``.
    :param size: value stored under the ``size`` key of the result.
    :returns: the request body as a dict, or ``None`` if there are no
        search terms at all.
    """
    terms = []
    text = args.get('q', '').strip()
    if text:
        terms.append(text)

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        terms.extend(entity.terms)

    if not terms:
        return None

    shoulds = []
    for term in terms:
        shoulds.append({
            'match': {
                'text': {
                    'query': term,
                    'boost': 10,
                    'operator': 'and'
                }
            }
        })
        shoulds.append({
            'match': {
                'text_latin': {
                    'query': latinize_text(term),
                    'operator': 'and'
                }
            }
        })

    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }

    try:
        snippet = int(args.get('snippet', 150))
    except (TypeError, ValueError):
        # A missing or non-numeric snippet size falls back to the default.
        # Only conversion errors are caught so that unrelated exceptions
        # (e.g. KeyboardInterrupt) are no longer silently swallowed.
        snippet = 150

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {'fragment_size': snippet},
                'text_latin': {'fragment_size': snippet}
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
Exemplo n.º 39
0
 def crawl(self):
     """Sync the tickets of both endpoints into a single watchlist.

     Every ticket whose ``ticket_type`` maps to a category in
     ``REQUEST_TYPES`` is passed to ``Entity.by_foreign_id``. Afterwards
     ``delete_entities`` is called on the watchlist with the IDs seen in
     this run as ``spare``, and the watchlist is emitted together with
     the terms found in exactly one of the before/after term sets.
     """
     base_url = urljoin(self.host, '/ticket/all_closed/?format=json')
     watchlist = Watchlist.by_foreign_id(base_url, {
         'label': 'Investigative Dashboard Requests'
     })
     Permission.grant_foreign(watchlist, 'idashboard:occrp_staff',
                              True, False)
     kept_ids = []
     terms_before = watchlist.terms
     terms_after = set()
     db.session.flush()
     for endpoint in ('all_closed', 'all_open'):
         endpoint_url = urljoin(self.host,
                                '/ticket/%s/?format=json' % endpoint)
         listing = self.session.get(endpoint_url).json()
         tickets = listing.get('paginator', {}).get('object_list')
         for ticket in tickets:
             category = REQUEST_TYPES.get(ticket.get('ticket_type'))
             if category is None:
                 # Ticket types without a mapped category are skipped.
                 continue
             foreign_id = str(ticket.get('id'))
             ent = Entity.by_foreign_id(foreign_id, watchlist, {
                 'name': ticket.get('name'),
                 'category': category,
                 'data': ticket,
                 'selectors': [ticket.get('name')]
             })
             terms_after.update(ent.terms)
             kept_ids.append(ent.id)
             log.info("  # %s (%s)", ent.name, ent.category)
     watchlist.delete_entities(spare=kept_ids)
     changed = terms_before.symmetric_difference(terms_after)
     self.emit_watchlist(watchlist, changed)
Exemplo n.º 40
0
    def load_entity(self, name, schema):
        """Return the ID of the entity identified by ``name``.

        The identifier is the lower-cased, stripped name within the scheme
        ``self.origin``. If a non-deleted identifier row exists, its
        ``entity_id`` is returned. If only a deleted identifier exists but
        its entity is itself not deleted, ``None`` is returned. In every
        other case a new entity in ``Entity.STATE_PENDING`` is saved into
        ``self.collections`` and its ID returned.
        """
        identifier = name.lower().strip()
        ident = (
            db.session.query(EntityIdentifier)
            .order_by(EntityIdentifier.deleted_at.desc().nullsfirst())
            .filter(EntityIdentifier.scheme == self.origin)
            .filter(EntityIdentifier.identifier == identifier)
            .first()
        )
        if ident is not None:
            if ident.deleted_at is None:
                # TODO: add to collections? Security risk here.
                return ident.entity_id
            if ident.entity.deleted_at is None:
                return None

        own_identifier = {
            'scheme': self.origin,
            'identifier': identifier
        }
        entity = Entity.save({
            'name': name,
            '$schema': schema,
            'state': Entity.STATE_PENDING,
            'identifiers': [own_identifier]
        }, self.collections)
        return entity.id
Exemplo n.º 41
0
def all():
    """Return a JSON pager over active entities in readable collections.

    Only entities in ``Entity.STATE_ACTIVE`` that belong to at least one
    collection from ``authz.collections(authz.READ)`` are listed, ordered
    by ascending entity ID, with a pager limit of 100.
    """
    q = Entity.all().filter(Entity.state == Entity.STATE_ACTIVE)
    readable = Collection.id.in_(authz.collections(authz.READ))
    q = q.filter(Entity.collections.any(readable))
    pager = Pager(q.order_by(Entity.id.asc()), limit=100)
    return jsonify(pager)
Exemplo n.º 42
0
def delete(id):
    """Delete an entity and queue ``analyze_entity`` for its ID.

    ``authz.require`` is called with the write check for the entity's
    collection before the entity is deleted and the session committed.
    """
    found = Entity.by_id(id)
    entity = obj_or_404(found)
    may_write = authz.collection_write(entity.collection_id)
    authz.require(may_write)
    entity.delete()
    db.session.commit()
    # The task is queued only once the deletion has been committed.
    analyze_entity.delay(id)
    payload = {'status': 'ok'}
    return jsonify(payload)
Exemplo n.º 43
0
def prune_entity(collection, entity_id=None, job_id=None):
    """Prune handles the full deletion of an entity outside of the HTTP request
    cycle. This involves cleaning up adjacent entities like xref results, notifications
    and so on.

    :param collection: collection the entity belongs to; its ``id`` is used
        for the adjacency lookup and it is touched at the end.
    :param entity_id: ID of the entity to prune.
    :param job_id: forwarded unchanged to ``delete_entity`` for every
        adjacent entity.
    """
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    log.info("[%s] Prune entity: %s", collection, entity_id)
    for adjacent in index.iter_adjacent(collection.id, entity_id):
        log.warning("Recursive delete: %s", adjacent.get("id"))
        delete_entity(collection, adjacent, job_id=job_id)
    flush_notifications(entity_id, clazz=Entity)
    # The same ID is looked up both as an Entity and as a Document;
    # whichever record exists is deleted.
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete()
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete()
    # Remove data stored elsewhere under this entity ID.
    EntitySetItem.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id)
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=entity_id)
    refresh_entity(collection, entity_id)
    collection.touch()
    # A single commit at the very end covers all database changes above.
    db.session.commit()
Exemplo n.º 44
0
def test():
    """Call ``remove_entity`` for every entity ID in one transaction.

    The IDs come from ``Entity.all_ids()``; the transaction is begun on
    the object returned by ``get_graph()`` and committed at the end.
    """
    from aleph.model import Entity
    tx = get_graph().begin()
    for entity_id in Entity.all_ids():
        remove_entity(tx, entity_id)
    tx.commit()
Exemplo n.º 45
0
    def crawl(self):
        """Sync the tickets of both endpoints into a single collection.

        Every ticket whose ``ticket_type`` maps to a category in
        ``REQUEST_TYPES`` is passed to ``Entity.by_foreign_id``. Entities
        of the collection that were not seen in this run are deleted, and
        the collection is emitted with the terms of all seen entities.
        """
        url = urljoin(self.host, '/ticket/all_closed/?format=json')
        collection = Collection.by_foreign_id(url, {
            'label': 'Investigative Dashboard Requests'
        })
        Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                                 True, False)
        # A set keeps the membership test in the clean-up loop below
        # constant-time instead of scanning a list for every entity.
        existing_entities = set()
        terms = set()
        db.session.flush()
        for endpoint in ['all_closed', 'all_open']:
            url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
            data = self.session.get(url).json()
            for req in data.get('paginator', {}).get('object_list'):
                category = REQUEST_TYPES.get(req.get('ticket_type'))
                if category is None:
                    # Ticket types without a mapped category are skipped.
                    continue
                ent = Entity.by_foreign_id(str(req.get('id')), collection, {
                    'name': req.get('name'),
                    'category': category,
                    'data': req,
                    'selectors': [req.get('name')]
                })
                terms.update(ent.terms)
                existing_entities.add(ent.id)
                log.info("  # %s (%s)", ent.name, ent.category)

        # Drop entities that no longer appear in either endpoint.
        for entity in collection.entities:
            if entity.id not in existing_entities:
                entity.delete()
        self.emit_collection(collection, terms)
Exemplo n.º 46
0
def fetch_entity(entity_id):
    """Load an entity from both the ES index and the database.

    :returns: a ``(entity, obj)`` tuple. ``entity`` is the result of
        ``get_entity`` and ``obj`` the result of ``Entity.by_id``. When
        ``obj`` exists, its ``data`` is copied into ``entity['data']``.
    """
    indexed = get_entity(entity_id)
    record = Entity.by_id(entity_id)
    if record is None:
        return indexed, record
    indexed['data'] = record.data
    return indexed, record
Exemplo n.º 47
0
    def load_entity(self, name, schema):
        """Look up or create the entity for ``name`` and return its ID.

        The name is lower-cased and stripped to form an identifier in the
        scheme ``self.origin``. A live (non-deleted) identifier yields its
        ``entity_id``. A deleted identifier whose entity is not deleted
        yields ``None``. Otherwise a pending entity is saved into
        ``self.collections`` and the new ID is returned.
        """
        identifier = name.lower().strip()
        lookup = db.session.query(EntityIdentifier)
        # Rows with a NULL deleted_at sort first, so first() prefers them.
        lookup = lookup.order_by(
            EntityIdentifier.deleted_at.desc().nullsfirst())
        lookup = lookup.filter(EntityIdentifier.scheme == self.origin)
        lookup = lookup.filter(EntityIdentifier.identifier == identifier)
        match = lookup.first()
        if match is not None:
            if match.deleted_at is None:
                # TODO: add to collections? Security risk here.
                return match.entity_id
            if match.entity.deleted_at is None:
                return None

        payload = {
            'name': name,
            '$schema': schema,
            'state': Entity.STATE_PENDING,
            'identifiers': [
                {'scheme': self.origin, 'identifier': identifier}
            ]
        }
        return Entity.save(payload, self.collections).id
Exemplo n.º 48
0
def analyze_collection(collection_id):
    """Re-analyze the elements of this collection, documents and entities.

    Dangling entities of the collection are deleted and committed first.
    If no collection with the given ID exists, an error is logged and the
    function returns without doing any further work.

    :param collection_id: ID of the collection to re-analyze.
    """
    Entity.delete_dangling(collection_id)
    db.session.commit()

    q = db.session.query(Collection).filter(Collection.id == collection_id)
    collection = q.first()
    if collection is None:
        log.error("No collection with ID: %r", collection_id)
        # Stop here: continuing would dereference ``None`` below and
        # raise AttributeError right after the error was logged.
        return

    # re-process the documents
    analyze_documents(collection.id)

    # re-process entities
    for entity in collection.entities:
        update_entity_full(entity.id)
Exemplo n.º 49
0
 def emit_entity(self, collection, data):
     """Save ``data`` as a merged entity of ``collection`` and return it.

     The session is committed before the entity is logged, passed to
     ``update_entity`` and counted via ``self.increment_count()``.
     """
     target_collections = [collection]
     saved = Entity.save(data, target_collections, merge=True)
     db.session.commit()
     log.info("Entity [%s]: %s", saved.id, saved.name)
     update_entity(saved)
     self.increment_count()
     return saved
Exemplo n.º 50
0
 def emit_entity(self, collection, data):
     """Persist an entity for ``collection`` and return the saved object.

     ``Entity.save`` is called with ``merge=True``. After the commit the
     entity is logged, handed to ``update_entity`` and the crawler's
     counter is incremented.
     """
     result = Entity.save(data, [collection], merge=True)
     db.session.commit()
     entity_id, entity_name = result.id, result.name
     log.info("Entity [%s]: %s", entity_id, entity_name)
     update_entity(result)
     self.increment_count()
     return result
Exemplo n.º 51
0
def load_document(tx, document):
    """Merge a document and what it mentions into the graph transaction.

    A document node is merged and added to the document's collections.
    Every e-mail address and phone number in the document metadata gets
    its own node, also added to the collections, plus a ``MENTIONS`` link
    from the document node. Entities returned by
    ``Entity.all_by_document`` are loaded via ``load_entity`` and linked
    the same way.

    :returns: the merged document node, or ``None`` when ``tx`` is
        ``None`` (in which case nothing is done).
    """
    if tx is None:
        return
    log.info("Graph load [%s]: %r", document.id, document.meta)
    meta = document.meta
    doc_node = DocumentNode.merge(tx, name=meta.title,
                                  alephTitle=document.type,
                                  fileName=meta.file_name,
                                  fingerprint=document.id,
                                  alephDocument=document.id)
    add_to_collections(tx, doc_node, document.collections,
                       alephDocument=document.id)

    for address in meta.emails:
        email_node = EmailNode.merge(tx, name=address, fingerprint=address)
        MENTIONS.merge(tx, doc_node, email_node, alephDocument=document.id)
        add_to_collections(tx, email_node, document.collections,
                           alephDocument=document.id)

    for number in meta.phone_numbers:
        phone_node = PhoneNode.merge(tx, name=number, fingerprint=number)
        MENTIONS.merge(tx, doc_node, phone_node, alephDocument=document.id)
        add_to_collections(tx, phone_node, document.collections,
                           alephDocument=document.id)

    for entity in Entity.all_by_document(document.id):
        entity_node = load_entity(tx, entity)
        MENTIONS.merge(tx, doc_node, entity_node,
                       alephDocument=document.id,
                       alephEntity=entity.id)
    return doc_node
Exemplo n.º 52
0
def delete(id):
    """Remove the entity with the given ID and schedule re-analysis.

    The write check for the entity's collection is passed to
    ``authz.require`` first. ``analyze_entity`` is queued with the
    original ID after the commit, and a JSON ``ok`` status is returned.
    """
    entity = obj_or_404(Entity.by_id(id))
    write_check = authz.collection_write(entity.collection_id)
    authz.require(write_check)
    entity.delete()
    db.session.commit()
    analyze_entity.delay(id)
    response = jsonify({'status': 'ok'})
    return response
Exemplo n.º 53
0
def format_results(query):
    """Flatten the hits of ``query`` into a list of plain dicts.

    For each hit the ``_source`` fields are copied, except dict values and
    names in ``SKIP_FIELDS``. Special cases:

    * ``entities``: IDs are replaced by a comma-separated string of entity
      names; IDs without a known entity are dropped.
    * other list/tuple/set values are joined with ``', '``.
    * ``source_id``: replaced by the source label, or by a
      ``[Deleted source ...]`` placeholder when the source is gone.

    Entity names and source labels are cached across hits so each is
    looked up at most once.

    :returns: a list with one dict per hit that has at least one kept
        field.
    """
    sources = {}
    entities = {}
    results = []
    for row in raw_iter(query):
        src = row.get('_source')
        data = {}
        for name, value in src.items():
            if isinstance(value, dict) or name in SKIP_FIELDS:
                continue
            if name == 'entities':
                load_ids = [e for e in value if e not in entities]
                if load_ids:
                    loaded = Entity.by_id_set(load_ids)
                    for entity_id, ent in loaded.items():
                        entities[entity_id] = ent.name

                value = ', '.join([entities.get(e) for e in value
                                   if entities.get(e) is not None])
            if isinstance(value, (list, tuple, set)):
                value = ', '.join(value)
            if name == 'source_id':
                # WARNING: don't do one query per row
                if value not in sources:
                    source = Source.by_id(value)
                    if source is None:
                        sources[value] = '[Deleted source %s]' % value
                    else:
                        sources[value] = source.label
                value = sources[value]
            data[name] = value
        # Append once per hit. This used to happen inside the field loop,
        # which added the same dict once for every kept field and so
        # duplicated each row. Hits without any kept field are still
        # omitted, as before.
        if data:
            results.append(data)
    return results
Exemplo n.º 54
0
def load_fixture(name):
    """Load a fixture directory into a freshly created list of entities.

    The directory ``name`` under ``fixtures_path`` must contain a
    ``mapping.yaml`` (list label, visibility, column mapping and default
    category) and a ``data.csv``. An existing list with the same label is
    deleted first. Rows are grouped by ``(label, category)`` and their
    selectors merged, then one entity is created per group. Entities that
    fail validation are logged and skipped.

    :param name: name of the fixture directory.
    :raises ValueError: if the fixture directory does not exist.
    :raises AssertionError: if the default category is not in
        ``CATEGORIES``.
    """
    dir_name = os.path.join(fixtures_path, name)
    if not os.path.isdir(dir_name):
        raise ValueError("No such directory: %r" % dir_name)

    with open(os.path.join(dir_name, 'mapping.yaml'), 'rb') as fh:
        # NOTE(review): yaml.load without an explicit safe loader can
        # construct arbitrary objects; fixtures are presumably trusted
        # local files, but consider yaml.safe_load -- confirm first.
        data = yaml.load(fh)

    lst = List.by_label(data.get('list'))
    if lst is not None:
        # Replace any previous incarnation of this list.
        lst.delete()
        db.session.commit()

    lst = List.create({
        'label': data.get('list'),
        'public': data.get('public'),
        'users': []
    }, None)
    log.info("Loading %r", lst)

    mapping = data.get('mapping')
    default_category = data.get('default_category')
    assert default_category in CATEGORIES, default_category

    # (label, category) -> set of selectors collected over all rows.
    entities = defaultdict(set)
    with open(os.path.join(dir_name, 'data.csv'), 'rb') as fh:
        for row in unicodecsv.DictReader(fh):
            label = row.get(mapping.get('label', 'label'))
            if label is None:
                continue

            category = row.get(mapping.get('category', 'category'))
            category = category or default_category

            selectors = [row.get(mapping.get('selector', 'selector'))]
            selectors = [s for s in selectors if s]
            # update() also registers the key when there is no selector,
            # so such rows still produce an entity.
            entities[(label, category)].update(selectors)

    for (label, category), selectors in entities.items():
        entity_data = {'label': label, 'category': category,
                       'selectors': selectors, 'list': lst}
        try:
            Entity.create(entity_data, None)
        except Invalid as inv:
            log.warning("Failed: %s", inv)
Exemplo n.º 55
0
def delete(id):
    """Delete an entity and queue a refresh of its former terms.

    The entity's ``terms`` are read before deletion so they can still be
    passed to ``refresh_selectors`` after the commit. ``authz.require``
    is called with the write check for the entity's list beforehand.
    """
    found = Entity.by_id(id)
    entity = obj_or_404(found)
    authz.require(authz.list_write(entity.list_id))
    stale_terms = entity.terms
    entity.delete()
    db.session.commit()
    refresh_selectors.delay(list(stale_terms))
    payload = {"status": "ok"}
    return jsonify(payload)
Exemplo n.º 56
0
def create():
    """Create an entity from the request data and return its view.

    The request data is deserialized through ``EntityForm``. Both the
    target list itself and the write check for it are passed to
    ``authz.require`` before the entity is created for ``current_user``.
    After the commit, a selector refresh is queued for the new entity's
    terms.
    """
    form = EntityForm()
    data = form.deserialize(request_data())
    target_list = data["list"]
    authz.require(target_list)
    authz.require(authz.list_write(data["list"].id))
    entity = Entity.create(data, current_user)
    db.session.commit()
    new_terms = list(entity.terms)
    refresh_selectors.delay(new_terms)
    return view(entity.id)
Exemplo n.º 57
0
 def emit_entity(self, collection, data):
     """Save ``data`` as a merged entity of ``collection`` and return it.

     Note that ``data`` is modified in place: its ``collections`` key is
     set to a one-element list holding ``collection``. The session is
     flushed (not committed), ``update_entity_full`` is queued for the
     new ID, and the entity is appended to ``self.entity_cache`` under
     the collection's ID.
     """
     data['collections'] = [collection]
     saved = Entity.save(data, merge=True)
     db.session.flush()
     update_entity_full.delay(saved.id)
     log.info("Entity [%s]: %s", saved.id, saved.name)
     cached = self.entity_cache[collection.id]
     cached.append(saved)
     return saved
Exemplo n.º 58
0
def all():
    """Return the IDs of active, non-deleted entities as JSON.

    The ``collection_id`` request arguments are narrowed through
    ``authz.collections_intersect`` with ``authz.READ``; only entities
    belonging to at least one of the resulting collections are listed
    under the ``results`` key.
    """
    requested = request.args.getlist('collection_id')
    allowed = authz.collections_intersect(authz.READ, requested)
    rows = Entity.all_ids()
    rows = rows.filter(Entity.state == Entity.STATE_ACTIVE)
    rows = rows.filter(Entity.deleted_at == None)  # noqa
    rows = rows.filter(Entity.collections.any(Collection.id.in_(allowed)))
    # Each result row is a tuple-like whose first element is the ID.
    ids = [row[0] for row in rows]
    return jsonify({'results': ids})
Exemplo n.º 59
0
 def _generate():
     """Yield one index action dict per entity, then the collection docs.

     ``collection`` is a free variable that is not defined in this block.
     Each entity from ``Entity.by_collection`` is converted with
     ``to_dict()`` and passed to ``index_operation``, whose three results
     become the ``_id``, ``_index`` and ``_source`` of the yielded dict.
     """
     for entity in Entity.by_collection(collection.id):
         entity_id, index, body = index_operation(entity.to_dict())
         yield {
             '_id': entity_id,
             '_index': index,
             '_source': body
         }
     # Delegate to the generator for the collection's own documents.
     yield from generate_collection_docs(collection)