def get_index(self):
    """Pick the entities read index to query, narrowed to the requested
    schemata when a 'filter:schemata' parameter is present."""
    # Removed the commented-out legacy 'filter:schema' handling; the
    # plural 'filter:schemata' parameter is the supported filter.
    schemata = self.parser.getlist('filter:schemata')
    if schemata:
        return entities_read_index(schema=schemata)
    return entities_read_index()
def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria.

    :param authz: optional authorization context restricting results.
    :param collection_id: restrict results to a single collection.
    :param schemata: one or more schema names to filter on.
    :param includes: `_source` fields to include in each hit.
    :param excludes: `_source` fields to exclude from each hit.
    """
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    # Normalise list-ish arguments once instead of calling ensure_list
    # twice per argument.
    schemata_list = ensure_list(schemata)
    if schemata_list:
        filters.append({'terms': {'schemata': schemata_list}})
    source = {}
    includes_list = ensure_list(includes)
    if includes_list:
        source['includes'] = includes_list
    excludes_list = ensure_list(excludes)
    if excludes_list:
        source['excludes'] = excludes_list
    query = {
        'query': {'bool': {'filter': filters}},
        'sort': ['_doc'],
        '_source': source
    }
    index = entities_read_index(schema=schemata)
    # Long scroll timeout so slow consumers do not lose the cursor.
    for res in scan(es, index=index, query=query, scroll='1410m'):
        entity = unpack_result(res)
        if entity is not None:
            yield entity
def delete_entity(entity_id, exclude=None, sync=False):
    """Delete an entity from the index.

    :param entity_id: ID of the entity to remove.
    :param exclude: schema to exclude from the target read index.
    :param sync: when True, block until the deletion is complete.
    """
    # The ES 'ids' query expects 'values' to be a list; passing a bare
    # string here was a bug.
    query = {'query': {'ids': {'values': [str(entity_id)]}}}
    es.delete_by_query(index=entities_read_index(exclude=exclude),
                       body=query,
                       wait_for_completion=sync,
                       refresh=refresh_sync(sync))
def expand_group(node):
    """Yield links from the given node to entities sharing the same
    group-typed value (e.g. the same name, phone or email)."""
    if node.type.group is None or node.value is None:
        return
    value = str(node.value)
    query = {
        'query': {'term': {node.type.group: value}},
        '_source': {'includes': ['schema', 'properties']}
    }
    for hit in scan(es, index=entities_read_index(), query=query):
        hit_id = hit.get('_id')
        source = hit.get('_source')
        schema = model.get(source.get('schema'))
        props_data = source.get('properties')
        for prop in schema.properties.values():
            if prop.type != node.type:
                continue
            normalized = node.type.normalize_set(props_data.get(prop.name))
            if value not in normalized:
                continue
            if prop.reverse:
                yield Link(node, prop.reverse, hit_id)
            else:
                yield Link(node, prop, hit_id, inverted=True)
def delete_entities(collection_id, schema=None, bulk_only=False):
    """Delete entities from a collection, optionally narrowed to a
    schema or to bulk-loaded entities only."""
    conditions = [{'term': {'collection_id': collection_id}}]
    if bulk_only:
        conditions.append({'term': {'bulk': True}})
    if schema is not None:
        conditions.append({'term': {'schemata': schema.name}})
    query_delete(entities_read_index(schema),
                 {'bool': {'filter': conditions}})
def get_collection_stats(collection_id):
    """Compute some statistics on the content of a collection."""
    key = cache.key('cstats', collection_id)
    data = cache.get_complex(key)
    if data is not None:
        return data
    log.info("Generating collection stats: %s", collection_id)
    # Facet name -> (indexed field, max buckets to return).
    facets = (
        ('schemata', 'schema', 1000),
        ('countries', 'countries', 500),
        ('languages', 'languages', 10),
    )
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [{'term': {'collection_id': collection_id}}]
            }
        },
        'aggs': {name: {'terms': {'field': field, 'size': size}}
                 for (name, field, size) in facets},
    }
    result = search_safe(index=entities_read_index(), body=query)
    aggregations = result.get('aggregations', {})
    data = {'count': result['hits']['total']}
    for (name, _, _) in facets:
        buckets = aggregations[name]['buckets']
        data[name] = {b['key']: b['doc_count'] for b in buckets}
    # Randomise expiry (3-12h) so stats for different collections do not
    # all regenerate at the same moment.
    cache.set_complex(key, data, expire=randint(3600 * 3, 3600 * 12))
    return data
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    # NOTE: This must also work for documents.
    tag_fields = ['names', 'emails', 'phones', 'addresses', 'identifiers']
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for field in tag_fields:
        for value in entity.get(field, []):
            if value is None or not len(value):
                continue
            # msearch bodies alternate a header object and a query object.
            queries.append({})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(field, value)
                        ],
                        'must_not': [
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((field, value))
    if not pivots:
        return
    res = es.msearch(index=entities_read_index(), body=queries)
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            yield (field, value, total)
def entity_references(entity, authz):
    """Given a particular entity, find all the references to it from other
    entities, grouped by the property where they are used."""
    schema = model[entity.get('schema')]
    # Generate all the possible mention locations.
    candidates = []
    queries = []
    for prop in model.properties:
        if prop.type != registry.entity:
            continue
        if not schema.is_a(prop.range):
            continue
        # msearch bodies alternate a header object and a query object.
        queries.append({})
        queries.append({
            'size': 0,
            'query': {
                'bool': {
                    'filter': [
                        authz_query(authz),
                        {'term': {'schemata': prop.schema.name}},
                        {'term': {'properties.%s' % prop.name:
                                  entity.get('id')}},
                    ]
                }
            }
        })
        candidates.append(prop)
    if not candidates:
        return
    # Run a count search (with schema facet?)
    res = es.msearch(index=entities_read_index(), body=queries)
    for prop, resp in zip(candidates, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            yield (prop, total)
def check_alert(alert_id):
    """Run a stored alert query and publish a notification for each
    matching entity, then mark the alert as checked."""
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    if not alert.role.is_alertable:
        return
    authz = Authz.from_role(alert.role)
    query = alert_query(alert, authz)
    result = search_safe(index=entities_read_index(), body=query)
    # Use a distinct loop variable: the original shadowed 'result' with
    # each hit, which was confusing and fragile.
    for doc in result.get('hits').get('hits', []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        log.info('Alert [%s]: %s', alert.query, entity.get('name'))
        params = {'alert': alert, 'role': alert.role, 'entity': entity}
        publish(Events.MATCH_ALERT,
                actor_id=entity.get('uploader_id'),
                params=params)
    alert.update()
    db.session.commit()
    db.session.close()
def xref_item(proxy):
    """Cross-reference an entity or document, given as an indexed document."""
    match = match_query(proxy)
    if match == none_query():
        return
    body = {
        'query': match,
        'size': 100,
        '_source': {
            'includes': ['schema', 'properties', 'collection_id']
        }
    }
    # Only search indexes for schemata the entity could plausibly match.
    matchable = list(proxy.schema.matchable_schemata)
    response = search_safe(index=entities_read_index(schema=matchable),
                           body=body)
    for hit in response.get('hits').get('hits'):
        unpacked = unpack_result(hit)
        if unpacked is None:
            continue
        other = model.get_proxy(unpacked)
        score = compare(model, proxy, other)
        yield score, unpacked.get('collection_id'), other
def entities_by_ids(ids, authz=None, cached=True):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    # Page through the ID list in MAX_PAGE-sized chunks.
    for offset in range(0, len(ids), MAX_PAGE):
        chunk = ids[offset:offset + MAX_PAGE]
        if not chunk:
            return
        query = bool_query()
        conditions = query['bool']['filter']
        conditions.append({'ids': {'values': chunk}})
        if authz is not None:
            conditions.append(authz_query(authz))
        body = {
            'query': query,
            '_source': {'excludes': ['text']},
            'size': min(MAX_PAGE, len(chunk))
        }
        result = search_safe(index=entities_read_index(),
                             body=body,
                             ignore=[404],
                             request_cache=cached)
        for doc in result.get('hits', {}).get('hits', []):
            entity = unpack_result(doc)
            if entity is not None:
                yield entity