def create():
    """Create a new collection owned by the logged-in role."""
    require(request.authz.logged_in)
    payload = parse_request(CollectionSchema)
    owner = Role.by_id(request.authz.id)
    created = create_collection(payload, role=owner)
    # Refresh so the new collection shows up in searches right away.
    refresh_index(collections_index())
    return view(created.id)
def delete_collection(collection_id):
    """Delete all documents from a particular collection."""
    # refresh=True makes the deletion visible to searches immediately;
    # a missing document (404) is not an error.
    es.delete(id=collection_id,
              index=collections_index(),
              doc_type='doc',
              refresh=True,
              ignore=[404])
def get_instance_stats(authz):
    """Summarise visible entity and collection counts for *authz*."""
    # Entity stats, broken down by schema.
    entity_query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    authz_query(authz),
                    {'term': {'schemata': Entity.THING}},
                ]
            }
        },
        'aggs': {'schema': {'terms': {'field': 'schema', 'size': 1000}}}
    }
    result = es.search(index=entities_index(), body=entity_query)
    buckets = result.get('aggregations').get('schema').get('buckets')
    data = {
        'count': result.get('hits').get('total'),
        'schemata': {b.get('key'): b.get('doc_count') for b in buckets}
    }
    # Collection stats (should we return categories?)
    coll_query = {'size': 0, 'query': {'bool': {'filter': [authz_query(authz)]}}}
    result = es.search(index=collections_index(), body=coll_query)
    data['collections'] = result.get('hits').get('total')
    return data
def delete_collection(collection_id, sync=False):
    """Delete all documents from a particular collection."""
    # sync controls whether the deletion is refreshed into visibility
    # before returning; missing documents (404) are ignored.
    es.delete(index=collections_index(), doc_type='doc',
              id=str(collection_id), refresh=sync, ignore=[404])
def delete_collection(collection_id, wait=True):
    """Delete all documents from a particular collection."""
    # Remove derived entities and documents first, then drop the
    # collection record itself.
    delete_entities(collection_id, wait=wait)
    delete_documents(collection_id, wait=wait)
    es.delete(index=collections_index(),
              doc_type='doc',
              id=collection_id,
              ignore=[404])
def get_collection(collection_id):
    """Fetch a collection from the index."""
    # The bulky 'text' field is excluded; a 404 yields no result
    # rather than an exception.
    response = es.get(index=collections_index(), doc_type='doc',
                      id=collection_id, _source_exclude=['text'],
                      ignore=[404])
    return unpack_result(response)
def delete_collection(collection_id, wait=True):
    """Delete all documents from a particular collection."""
    # Purge records and entities that reference the collection, then
    # remove the collection document itself.
    matcher = {'term': {'collection_id': collection_id}}
    query_delete(records_index(), matcher, wait=wait)
    query_delete(entities_index(), matcher, wait=wait)
    es.delete(index=collections_index(), doc_type=collection_type(),
              id=collection_id, ignore=[404])
def configure_collections():
    """Configure the collections index with its field mapping and settings."""
    mapping = {
        "dynamic_templates": [
            {
                "fields": {
                    # Per-schema entity counts under 'schemata.*' are numeric.
                    "match": "schemata.*",
                    "mapping": {"type": "long"}
                }
            }
        ],
        "properties": {
            # Label is analyzed for search and also kept verbatim as 'kw'.
            "label": {
                "type": "text",
                "analyzer": "icu_latin",
                "fields": {"kw": KEYWORD}
            },
            "collection_id": KEYWORD,
            "foreign_id": KEYWORD,
            "languages": KEYWORD,
            "countries": KEYWORD,
            "category": KEYWORD,
            "summary": RAW_TEXT,
            "publisher": KEYWORD,
            "publisher_url": KEYWORD,
            "data_url": KEYWORD,
            "info_url": KEYWORD,
            "kind": KEYWORD,
            "text": LATIN_TEXT,
            "casefile": {"type": "boolean"},
            "secret": {"type": "boolean"},
            "created_at": {"type": "date"},
            "updated_at": {"type": "date"},
            "count": {"type": "long"},
            "schemata": {"type": "object"},
            # Creator role: id/type exact-match, name also searchable.
            "creator": {
                "type": "object",
                "properties": {
                    "id": KEYWORD,
                    "type": KEYWORD,
                    "name": {
                        "type": "text",
                        "fields": {"kw": KEYWORD}
                    }
                }
            },
            # Team roles with write access to the collection.
            "team": {
                "type": "object",
                "properties": {
                    "id": KEYWORD,
                    "type": KEYWORD,
                    "name": KEYWORD
                }
            },
        }
    }
    configure_index(collections_index(), mapping, index_settings())
def _resolve_index(self, cache):
    """Resolve cached (type, id) keys against the search indexes in bulk.

    Each key in *cache* is routed to the appropriate index, fetched in
    a single mget round-trip, and written back as an unpacked result
    (or None when the document is missing).
    """
    queries = OrderedDict()
    for (type_, id_) in cache.keys():
        if type_ in [Collection]:
            index = collections_index()
            queries[(type_, id_)] = {'_index': index, '_id': id_}
        elif type_ in [Document, Entity]:
            index = entities_index()
            queries[(type_, id_)] = {'_index': index, '_id': id_}
    if not len(queries):
        return
    # BUG FIX: dict views are not JSON-serializable by the client's
    # serializer; pass a concrete list of query docs instead.
    results = es.mget(body={'docs': list(queries.values())},
                      _source_exclude=['text'])
    for key, doc in zip(queries.keys(), results['docs']):
        cache[key] = unpack_result(doc)
def upgrade_search():
    """Add any missing properties to the index mappings."""
    records = record_index()
    targets = [
        (collections_index(), COLLECTION_MAPPING),
        (entity_index(), ENTITY_MAPPING),
        (records, RECORD_MAPPING),
    ]
    for (index, mapping) in targets:
        log.info("Creating index: %s", index)
        settings = deepcopy(INDEX_SETTINGS)
        # Records are bulk-written; disabling refresh optimises ingest.
        if index == records:
            settings['index']['refresh_interval'] = '-1'
        es.indices.create(index, body=settings, ignore=[404, 400])
        es.indices.put_mapping(index=index, doc_type='doc', body=mapping)
        es.indices.open(index=index, ignore=[400, 404])
        es.indices.refresh(index=index, ignore=[400, 404])
        es.indices.clear_cache(index=index, ignore=[400, 404])
def _resolve_index(self, cache):
    """Resolve cached (type, id) keys against the search indexes in bulk."""
    keyed = []
    for (type_, id_) in cache.keys():
        if type_ in [Collection]:
            doc = {'_index': collections_index(), '_id': id_}
            keyed.append(((type_, id_), doc))
        elif type_ in [Document, Entity]:
            # Entities may live in any of several indexes; probe them all.
            for index in entities_index_list():
                keyed.append(((type_, id_), {'_index': index, '_id': id_}))
    if not keyed:
        return
    docs = [doc for (_, doc) in keyed]
    results = es.mget(body={'docs': docs}, _source_exclude=['text'])
    for (key, _), hit in zip(keyed, results['docs']):
        # A key may have been probed in several indexes; the first
        # successful hit wins.
        if cache.get(key) is None:
            cache[key] = unpack_result(hit)
def get_instance_stats(authz):
    """Count the entities and collections visible to *authz*."""
    entity_query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    authz_query(authz),
                    # {'term': {'schemata': Entity.THING}}
                ]
            }
        }
    }
    entities = es.search(index=entities_index(), body=entity_query)
    # Collection stats (should we return categories?)
    coll_query = {'size': 0, 'query': {'bool': {'filter': [authz_query(authz)]}}}
    collections = es.search(index=collections_index(), body=coll_query)
    return {
        'entities': entities.get('hits').get('total'),
        'collections': collections.get('hits').get('total'),
    }
def get_index(self):
    """Return the name of the collections search index."""
    return collections_index()
def _collection_content_stats(collection_id):
    """Run the aggregation query summarising a collection's entities.

    Returns the raw search response, with per-schema, per-country and
    per-language bucket aggregations plus the total entity count.
    """
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    {'term': {'collection_id': collection_id}},
                    {'term': {'schemata': Entity.THING}},
                ]
            }
        },
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}},
            'countries': {'terms': {'field': 'countries', 'size': 500}},
            'languages': {'terms': {'field': 'languages', 'size': 100}},
        }
    }
    return search_safe(index=entities_index(), body=query)


def index_collection(collection):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)
    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'kind': collection.kind,
        'summary': collection.summary,
        'category': Collection.DEFAULT,
        'publisher': collection.publisher,
        'publisher_url': collection.publisher_url,
        'info_url': collection.info_url,
        'data_url': collection.data_url,
        'casefile': collection.casefile,
        'roles': collection.roles,
        'schemata': {},
        'team': []
    }
    # Full-text haystack: all string-valued metadata fields, later
    # extended with creator and team names.
    texts = [v for v in data.values() if isinstance(v, str)]
    if collection.category in Collection.CATEGORIES:
        data['category'] = collection.category
    if collection.creator is not None:
        data['creator'] = {
            'id': collection.creator.id,
            'type': collection.creator.type,
            'name': collection.creator.name
        }
        texts.append(collection.creator.name)
    for role in collection.team:
        data['team'].append({
            'id': role.id,
            'type': role.type,
            'name': role.name
        })
        texts.append(role.name)
    # Compute some statistics on the content of the collection.
    result = _collection_content_stats(collection.id)
    aggregations = result.get('aggregations')
    data['count'] = result['hits']['total']
    # Expose entities by schema count.
    for schema in aggregations['schema']['buckets']:
        data['schemata'][schema['key']] = schema['doc_count']
    # If no countries or languages are given, take the most common
    # values observed in the data.
    countries = collection.countries
    if countries is None or not len(countries):
        countries = [c['key'] for c in aggregations['countries']['buckets']]
    data['countries'] = exactitude.countries.normalize_set(countries)
    languages = collection.languages
    if languages is None or not len(languages):
        languages = [c['key'] for c in aggregations['languages']['buckets']]
    data['languages'] = exactitude.languages.normalize_set(languages)
    # Also index ASCII-normalised variants of every text fragment.
    texts.extend([normalize(t, ascii=True) for t in texts])
    data['text'] = index_form(texts)
    data = index_safe(collections_index(), collection.id, data)
    refresh_index(index=collections_index())
    return data
def delete_collection(collection_id):
    """Delete all documents from a particular collection."""
    # BUG FIX: the `ids` query requires `values` to be an array of IDs;
    # a bare string is invalid query DSL.
    q = {'ids': {'values': [str(collection_id)]}}
    query_delete(collections_index(), q)
    refresh_index(index=collections_index())
def delete(id):
    """Delete a collection the requesting role can write to."""
    target = get_db_collection(id, request.authz.WRITE)
    delete_collection(target)
    # Refresh so the deletion is visible to subsequent searches.
    refresh_index(collections_index())
    return ('', 204)
def _type_dispatch(self, type_):
    """Map a model type to the index it is stored in.

    Anything unrecognised is passed through unchanged.
    """
    if type_ is Collection:
        return collections_index()
    if type_ in (Document, Entity):
        return entities_index()
    return type_
def query_collections(size=9999):
    """Fetch up to *size* collections from the index.

    The page size was previously hard-coded; it is now a parameter
    whose default preserves the old behaviour.
    """
    q = {'query': {'match_all': {}}, 'size': size}
    res = es.search(index=collections_index(), body=q)
    return res
def index_collection(collection, sync=False):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)
    body = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'kind': collection.kind,
        'summary': collection.summary,
        'category': Collection.DEFAULT,
        'publisher': collection.publisher,
        'publisher_url': collection.publisher_url,
        'info_url': collection.info_url,
        'data_url': collection.data_url,
        'casefile': collection.casefile,
        'secret': collection.secret,
        'collection_id': collection.id,
        'schemata': {},
        'team': []
    }
    # Seed the full-text haystack with every string-valued metadata field.
    fragments = [value for value in body.values() if isinstance(value, str)]
    if collection.category in Collection.CATEGORIES:
        body['category'] = collection.category
    creator = collection.creator
    if creator is not None:
        body['creator'] = {
            'id': creator.id,
            'type': creator.type,
            'name': creator.name
        }
        fragments.append(creator.name)
    for member in collection.team:
        body['team'].append({
            'id': member.id,
            'type': member.type,
            'name': member.name
        })
        fragments.append(member.name)
    stats = get_collection_stats(collection.id)
    body['count'] = stats['count']
    # Expose per-schema entity counts, restricted to thing-like schemata.
    thing = model.get(Entity.THING)
    for name, count in stats['schemata'].items():
        schema = model.get(name)
        if schema is not None and schema.is_a(thing):
            body['schemata'][schema.name] = count
    # When no countries or languages are set on the collection, fall
    # back to the most common values observed in its data.
    countries = ensure_list(collection.countries) or stats['countries'].keys()
    body['countries'] = registry.country.normalize_set(countries)
    languages = ensure_list(collection.languages) or stats['languages'].keys()
    body['languages'] = registry.language.normalize_set(languages)
    # Also index ASCII-normalised variants of every text fragment.
    fragments.extend([normalize(text, ascii=True) for text in fragments])
    body['text'] = index_form(fragments)
    return index_safe(collections_index(), collection.id, body, refresh=sync)