Example #1
    def test_load_sqlite(self):
        count = Collection.all().count()
        assert 0 == count, count

        db_uri = 'sqlite:///' + self.get_fixture_path('kek.sqlite')
        os.environ['ALEPH_TEST_BULK_DATABASE_URI'] = db_uri
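        # kek.yml presumably references this environment variable to locate the source database.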
        yml_path = self.get_fixture_path('kek.yml')
        config = load_config_file(yml_path)
        bulk_load(config)

        count = Collection.all().count()
        assert 1 == count, count

        coll = Collection.by_foreign_id('kek')
        assert coll.category == 'scrape', coll.category

        _, headers = self.login(is_admin=True)
        flush_index()

        res = self.client.get('/api/2/entities?q=friede+springer',
                              headers=headers)
        assert res.status_code == 200, res
        assert res.json['total'] == 1, res.json
        res0 = res.json['results'][0]
        assert res0['id'] == '9895ccc1b3d6444ccc6371ae239a7d55c748a714', res0
Example #2
    def test_load_csv(self):
        count = Collection.all().count()
        assert 0 == count, count

        db_uri = 'file://' + self.get_fixture_path('experts.csv')
        os.environ['ALEPH_TEST_BULK_CSV'] = db_uri
        yml_path = self.get_fixture_path('experts.yml')
        config = load_config_file(yml_path)
        bulk_load(config)

        coll = Collection.by_foreign_id('experts')
        assert coll.category == 'scrape', coll.category

        _, headers = self.login(is_admin=True)
        self.flush_index()

        count = Collection.all().count()
        assert 1 == count, count

        res = self.client.get('/api/2/entities?q=Greenfield',
                              headers=headers)
        assert res.status_code == 200, res
        assert res.json['total'] == 1, res.json
        res0 = res.json['results'][0]
        assert res0['id'] == '6897ef1acd633c229d812c1c495f030d212c9081', res0
Example #3
def compute_collections():
    """Update collection caches, including the global stats cache."""
    authz = Authz.from_role(None)
    schemata = defaultdict(int)
    countries = defaultdict(int)
    categories = defaultdict(int)

    for collection in Collection.all():
        compute_collection(collection)

        if authz.can(collection.id, authz.READ):
            categories[collection.category] += 1
            things = index.get_collection_things(collection.id)
            for schema, count in things.items():
                schemata[schema] += count
            for country in collection.countries:
                countries[country] += 1

    log.info("Updating global statistics cache...")
    data = {
        "collections": sum(categories.values()),
        "schemata": dict(schemata),
        "countries": dict(countries),
        "categories": dict(categories),
        "things": sum(schemata.values()),
    }
    key = cache.key(cache.STATISTICS)
    cache.set_complex(key, data, expires=cache.EXPIRE)
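
For orientation, a minimal read-back sketch of the statistics payload written above, assuming the cache object also exposes a get_complex() counterpart to the set_complex() call (an assumption, not shown in this example):

# Hypothetical read-back of the global stats cache; get_complex() is assumed.
key = cache.key(cache.STATISTICS)
stats = cache.get_complex(key) or {}
log.info("Cached stats: %s collections", stats.get("collections"))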
Example #4
    def test_load_sqlite(self):
        count = Collection.all().count()
        assert 0 == count, count

        yml_path = self.get_fixture_path('kek.yml')
        config = load_config_file(yml_path)
        bulk_load(config)
        flush_index()

        count = Collection.all().count()
        assert 1 == count, count

        res = self.client.get('/api/2/entities?q=friede+springer')
        assert res.status_code == 200, res
        assert res.json['total'] == 1, res.json
        res0 = res.json['results'][0]
        assert res0['id'] == '9895ccc1b3d6444ccc6371ae239a7d55c748a714', res0
Example #5
def upgrade_collections():
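    """Purge soft-deleted collections (keeping metadata), recompute the rest, then refresh the global stats cache."""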
    for collection in Collection.all(deleted=True):
        if collection.deleted_at is not None:
            delete_collection(collection, keep_metadata=True, sync=True)
        else:
            compute_collection(collection, force=True)
    # update global cache:
    compute_collections()
Example #6
def upgrade_collections():
    for collection in Collection.all(deleted=True):
        if collection.deleted_at is not None:
            delete_collection(collection, keep_metadata=True,
                              sync=True, reset_sync=True)
        else:
            refresh_collection(collection.id, sync=True)
            compute_collection(collection, sync=True)
Example #7
def index_collections(entities=False, refresh=False):
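    # deleted=True includes soft-deleted collections, so their index records are refreshed as well.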
    q = Collection.all(deleted=True)
    q = q.order_by(Collection.updated_at.desc())
    for collection in q:
        log.info("Index [%s]: %s", collection.id, collection.label)
        if entities and collection.deleted_at is None:
            index_collection_entities.delay(collection_id=collection.id)
        if refresh:
            refresh_collection(collection.id, sync=False)
        index.index_collection(collection)
Example #8
def analyze(foreign_id=None):
    """Re-analyze documents in the given collection (or throughout)."""
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        analyze_collection.delay(collection.id)
    else:
        for collection in Collection.all():
            analyze_collection.delay(collection.id)
Example #9
    def test_load_csv(self):
        count = Collection.all().count()
        assert 0 == count, count

        db_uri = 'file://' + self.get_fixture_path('experts.csv')
        os.environ['ALEPH_TEST_BULK_CSV'] = db_uri
        yml_path = self.get_fixture_path('experts.yml')
        config = load_config_file(yml_path)
        bulk_load(config)

        coll = Collection.by_foreign_id('experts')
        assert coll.category == 'scrape', coll.category

        _, headers = self.login(is_admin=True)
        count = Collection.all().count()
        assert 1 == count, count

        url = '/api/2/entities?filter:schemata=Thing&q=Greenfield'
        res = self.client.get(url, headers=headers)
        assert res.status_code == 200, res
        assert res.json['total'] == 1, res.json
Example #10
def collections(secret, casefile):
    """List all collections."""
    collections = []
    for coll in Collection.all():
        if secret is not None:
            if coll.secret != secret:
                continue
        if casefile is not None:
            if coll.casefile != casefile:
                continue
        collections.append((coll.foreign_id, coll.id, coll.label))
    print(tabulate(collections, headers=["Foreign ID", "ID", "Label"]))
Example #11
def peek_query(state):
    """Peek into hidden collections.

    This allows users to retrieve an approximate result count of a given query
    against those collections which they are not authorised to view. It is a
    rudimentary collaboration mechanism.
    """
    filters = state.filters
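    # Limit to collections the user cannot read that have a creator and are not private.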
    cq = Collection.all()
    cq = cq.filter(not_(Collection.id.in_(state.authz.collections_read)))
    cq = cq.filter(Collection.creator_id != None)  # noqa
    cq = cq.filter(Collection.private != True)  # noqa
    collections = {c.id: c for c in cq}
    filters['collection_id'] = collections.keys()

    q = text_query(state.text)
    q = {
        'query': filter_query(q, filters),
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {'field': 'collection_id', 'size': 1000}
            }
        },
        '_source': False
    }
    result = es.search(index=es_index, body=q, doc_type=TYPE_DOCUMENT)
    roles = {}
    total = 0
    aggs = result.get('aggregations', {}).get('collections', {})
    for bucket in aggs.get('buckets', []):
        collection = collections.get(bucket.get('key'))
        if collection is None or collection.creator is None:
            continue
        total += bucket.get('doc_count')
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }

    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    return format_total({
        'roles': roles,
        'active': total > 0,
        'total': total
    })
Example #12
    def test_load_sqlite(self):
        count = Collection.all().count()
        assert 0 == count, count

        db_uri = 'sqlite:///' + self.get_fixture_path('kek.sqlite')
        os.environ['ALEPH_TEST_BULK_DATABASE_URI'] = db_uri
        yml_path = self.get_fixture_path('kek.yml')
        config = load_config_file(yml_path)
        bulk_load(config)

        count = Collection.all().count()
        assert 1 == count, count

        coll = Collection.by_foreign_id('kek')
        assert coll.category == 'scrape', coll.category

        _, headers = self.login(is_admin=True)
        url = '/api/2/entities?filter:schemata=Thing&q=friede+springer'
        res = self.client.get(url, headers=headers)
        assert res.status_code == 200, res
        assert res.json['total'] == 1, res.json
        res0 = res.json['results'][0]
        key = '9895ccc1b3d6444ccc6371ae239a7d55c748a714'
        assert res0['id'].startswith(key), res0
Example #13
File: manage.py Project: wdsn/aleph
def update(foreign_id=None, index=False, process=False, reset=False):
    """Re-index all the collections and entities."""
    update_roles()
    q = Collection.all(deleted=True)
    if foreign_id is not None:
        q = [get_collection(foreign_id)]
    for collection in q:
        if reset:
            reset_collection(collection, sync=True)
        refresh_collection(collection.id)
        index_collection(collection)
        if collection.deleted_at is not None:
            continue
        if index or process:
            payload = {'ingest': process}
            queue_task(collection, OP_PROCESS, payload=payload)
Example #14
def index_collections(sync=False):
    for collection in Collection.all(deleted=True):
        compute_collection(collection, sync=sync)
Example #15
def load_collections():
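    # Load every collection into the graph store within a single transaction.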
    tx = get_graph().begin()
    for collection in Collection.all():
        log.info("Index collection: %s", collection.label)
        load_collection(tx, collection)
    tx.commit()
Example #16
def reindex_full(flush=False):
    """Re-index all collections."""
    for collection in Collection.all():
        _reindex_collection(collection, flush=flush)
Example #17
def index_collections(refresh=False):
    for collection in Collection.all(deleted=True):
        if refresh:
            refresh_collection(collection.id, sync=True)
        index.index_collection(collection)
Example #18
def index_collections(entities=False, refresh=False):
    q = Collection.all(deleted=True)
    q = q.order_by(Collection.updated_at.desc())
    for collection in q:
        index_collection(collection, entities=entities, refresh=refresh)
Example #19
def upgrade_collections():
    for collection in Collection.all(deleted=True):
        if collection.deleted_at is not None:
            delete_collection(collection, keep_metadata=True, sync=True)
            continue
        compute_collection(collection, sync=True)
Example #20
def index_collections():
    for collection in Collection.all(deleted=True):
        index.index_collection(collection)
Example #21
def compute_collections():
    for collection in Collection.all():
        compute_collection(collection)
Example #22
def collections():
    """List all collections."""
    for collection in Collection.all():
        print(collection.id, collection.foreign_id, collection.label)
Example #23
def cleanup_collections():
    """Reindex collections periodically."""
    for collection in Collection.all():
        update_collection(collection)
Example #24
def exportbalkhash(foreign_id=None):
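    # Export one collection when foreign_id is given, otherwise every collection.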
    collections = Collection.all()
    if foreign_id is not None:
        collections = [get_collection(foreign_id)]
    for collection in collections:
        _export_balkhash_collection(collection)
Example #25
def collections():
    """List all collections."""
    collections = []
    for coll in Collection.all():
        collections.append((coll.foreign_id, coll.id, coll.label))
    print(tabulate(collections, headers=['Foreign ID', 'ID', 'Label']))
Example #26
def compute_collections():
    for collection in Collection.all():
        compute_collection(collection, sync=False)
Example #27
def index_collections():
    for collection in Collection.all(deleted=True):
        log.info("Index [%s]: %s", collection.id, collection.label)
        index.index_collection(collection)