def statistics():
    """Report per-user totals: documents, entities and readable collections."""
    enable_cache(vary_user=True)
    doc_res = documents_query(QueryState({}, request.authz, limit=0))
    ent_res = entities_query(QueryState({}, request.authz, limit=0))
    counts = {
        'documents_count': doc_res.get('total'),
        'entities_count': ent_res.get('total'),
        'collections_count': len(request.authz.collections_read)
    }
    return jsonify(counts)
def generate_leads(entity_id):
    """Compute likely duplicates of a given entity and index these leads."""
    # Wipe any previously indexed leads first; this also handles the
    # case where the entity itself has been deleted.
    delete_entity_leads(entity_id)
    entity = load_entity(entity_id)
    if entity is None:
        # Entity is not indexed, so there is nothing to match against.
        return
    if not entity.get('collection_id'):
        # Only entities that belong to a collection get leads.
        return
    log.debug("Generating leads for [%(id)s]: %(name)s", entity)
    judgements = EntityIdentity.judgements_by_entity(entity_id)
    state = QueryState({}, Authz(override=True), limit=100)
    candidates = similar_entities(entity, state).get('results', [])
    for candidate in candidates:
        score = entity_distance(entity, candidate)
        log.debug(" -[%.2f]-> %s", score, candidate.get('name'))
        # TODO: implement some cut-off on the score.
        index_lead({
            'entity_id': entity.get('id'),
            'entity_collection_id': entity.get('collection_id'),
            'score': score,
            'judgement': judgements.get(candidate.get('id'), 0),
            'match_id': candidate.get('id'),
            'schema': candidate.get('schema'),
            'schemata': candidate.get('schemata'),
            'collection_id': candidate.get('collection_id'),
            'dataset': candidate.get('dataset'),
            'roles': candidate.get('roles')
        })
def reconcile_op(query):
    """Reconcile operation for a single query.

    ``query`` is a freebase-style reconciliation request: a name to look
    up, an optional type restriction and property filters. Returns the
    scored matches in the reconciliation response format.
    """
    state = QueryState({
        'limit': query.get('limit', '5'),
        'strict': 'false'
    }, request.authz)
    name = query.get('query', '')
    # Build a synthetic entity to feed into the similarity search.
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [make_fingerprint(name)],
        'schemata': ensure_list(query.get('type'))
    }
    for p in query.get('properties', []):
        entity[p.get('pid')] = ensure_list(p.get('v'))
    suggested = similar_entities(entity, state)
    matches = []
    # Default to an empty list so a response without a 'results' key does
    # not crash the loop (consistent with how lead generation reads it).
    for ent in suggested.get('results', []):
        types = [t for t in get_freebase_types() if ent['schema'] == t['id']]
        matches.append({
            'id': ent.get('id'),
            'name': ent.get('name'),
            'type': types,
            # Scores are capped at 100 for the reconciliation API.
            'score': min(100, ent.get('score') * 10),
            'uri': entity_link(ent.get('id')),
            'match': ent.get('name') == name
        })
    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {
        'result': matches,
        'num': len(matches)
    }
def check_role_alerts(authz):
    """Run every stored alert for a role and e-mail any new results."""
    alerts = Alert.by_role(authz.role).all()
    if not alerts:
        return
    log.info('Alerting %r, %d alerts...', authz.role, len(alerts))
    for alert in alerts:
        state = QueryState({
            'q': alert.query_text,
            'filter:entities.id': alert.entity_id,
            'limit': 50
        }, authz)
        results = documents_query(state, since=alert.notified_at)
        total = results['total']
        if total == 0:
            continue
        log.info('Found %d new results for: %r', total, alert.label)
        # Stamp the alert so the next run only sees newer results.
        alert.update()
        try:
            subject = '%s (%s new results)' % (alert.label, total)
            html = render_template('email/alert.html',
                                   alert=alert,
                                   role=authz.role,
                                   total=results.get('total'),
                                   results=format_results(alert, results),
                                   app_title=app_title,
                                   app_url=app_url)
            notify_role(authz.role, subject, html)
        except Exception as ex:
            # Notification failure must not abort the remaining alerts.
            log.exception(ex)
        db.session.commit()
def similar(id):
    """List entities that look like the given one, via fuzzy matching."""
    entity, _ = get_entity(id, request.authz.READ)
    schema = schemata.get(entity.get('schema'))
    if not schema.fuzzy:
        # Similarity matching is disabled for this schema.
        return jsonify({'status': 'ignore', 'results': [], 'total': 0})
    state = QueryState(request.args, request.authz)
    return jsonify(similar_entities(combined_entity(entity), state))
def export():
    """Stream the current search result set as an Excel download."""
    state = QueryState(request.args, request.authz, limit=0)
    log_event(request)
    # Cap the export at 50k rows to bound memory and response size.
    rows = get_results(state, 50000)
    workbook = make_excel(rows, FIELDS)
    return send_file(workbook, mimetype=XLSX_MIME, as_attachment=True,
                     attachment_filename='export.xlsx')
def index():
    """List the datasets visible to the current user, with entity counts
    and facet summaries over countries and categories."""
    enable_cache(vary_user=True)
    results = [d for d in datasets if request.authz.check_roles(d.roles)]
    state = QueryState({
        'filter:dataset': [d.name for d in results],
        'facet': 'dataset',
        'limit': 0
    }, request.authz)
    res = entities_query(state)
    values = res.get('facets', {}).get('dataset', {}).get('values', [])
    counts = {v.get('id'): v.get('count') for v in values}
    countries_facet = defaultdict(int)
    category_facet = defaultdict(int)
    countries_filter = set(request.args.getlist('filter:countries'))
    category_filter = set(request.args.getlist('filter:category'))
    filtered = []
    for dataset in results:
        dataset.entities_count = counts.get(dataset.name)
        if category_filter and dataset.category not in category_filter:
            continue
        if countries_filter and \
                not countries_filter.intersection(dataset.countries):
            continue
        for country in dataset.countries:
            countries_facet[country] += 1
        category_facet[dataset.category] += 1
        filtered.append(dataset)
    # Treat a missing entity count as zero so the sort also works on
    # Python 3, where None cannot be compared to integers.
    filtered = sorted(filtered,
                      key=lambda d: d.entities_count or 0,
                      reverse=True)
    facets = {'countries': {'values': []}, 'category': {'values': []}}
    categories = get_config('COLLECTION_CATEGORIES', {})
    # NOTE: the previous ``lambda (k, c): c`` relied on tuple parameter
    # unpacking, which is Python 2 only (removed by PEP 3113); index
    # into the item instead and sort descending directly.
    for key, count in sorted(countries_facet.items(),
                             key=lambda item: item[1], reverse=True):
        facets['countries']['values'].append({
            'id': key,
            'count': count,
            'label': COUNTRY_NAMES.get(key, key)
        })
    for key, count in sorted(category_facet.items(),
                             key=lambda item: item[1], reverse=True):
        if key is None:
            continue
        facets['category']['values'].append({
            'id': key,
            'count': count,
            'label': categories.get(key, key)
        })
    return jsonify({
        'results': filtered,
        'facets': facets,
        'total': len(filtered),
        'total_entities_count': res.get('total')
    })
def query():
    """Execute a document search and return one page of results."""
    enable_cache(vary_user=True)
    state = QueryState(request.args, request.authz)
    result = documents_query(state)
    log_event(request)
    # Attach a link to the next page when more results are available.
    params = next_params(request.args, result)
    if params is not None:
        result['next'] = url_for('search_api.query', **params)
    return jsonify(result)
def peek():
    """Preview results in collections the user cannot read, if enabled."""
    if not get_config('ALLOW_PEEKING', True):
        return jsonify({'active': False})
    enable_cache(vary_user=True)
    response = peek_query(QueryState(request.args, request.authz))
    if not request.authz.logged_in:
        # Anonymous users must not learn which roles hold access.
        response.pop('roles', None)
    return jsonify(response)
def records(document_id):
    """Search within the records (rows, pages) of a single document."""
    document = get_document(document_id)
    enable_cache(vary_user=True)
    state = QueryState(request.args, request.authz)
    page = execute_records_query(document.id, state,
                                 records_query(document.id, state))
    more = next_params(request.args, page)
    if more is not None:
        page['next'] = url_for('documents_api.records',
                               document_id=document_id, **more)
    return jsonify(page)
def view(name):
    """Return metadata and facet summaries for a single dataset."""
    enable_cache(vary_user=True)
    try:
        dataset = datasets.get(name)
    except NameError:
        # NOTE(review): the catalogue apparently raises NameError for an
        # unknown dataset name; translate that into a 404.
        raise NotFound()
    request.authz.require(request.authz.check_roles(dataset.roles))
    state = QueryState({
        'filter:dataset': dataset.name,
        'facet': ['schema', 'countries'],
        'limit': 0
    }, request.authz)
    summary = entities_query(state)
    body = dataset.to_dict()
    body['facets'] = summary.get('facets', {})
    body['doc_count'] = summary.get('total')
    return jsonify(body)
def index():
    """List collections visible to the user, with optional filters and
    facets over countries and categories."""
    state = QueryState(request.args, request.authz)
    # Allow filtering for writeable collections only, needed in some UI
    # scenarios; anything other than READ/WRITE falls back to READ.
    permission = request.args.get('permission')
    if permission not in [request.authz.READ, request.authz.WRITE]:
        permission = request.authz.READ
    collections = request.authz.collections[permission]
    # Additional navigation filters.
    label = request.args.get('label')
    managed = state.getbool('managed', None)
    # Should entity/document counts be embedded in each list item?
    counts = state.getbool('counts', False)
    requested_facets = [f.lower().strip()
                        for f in request.args.getlist('facet')]
    q = Collection.find(label=label,
                        countries=state.getfilter('countries'),
                        category=state.getfilter('category'),
                        collection_id=collections,
                        managed=managed)

    def converter(colls):
        return [c.to_dict(counts=counts) for c in colls]

    data = Pager(q).to_dict(results_converter=converter)
    facets = {}
    if 'countries' in requested_facets:
        facets['countries'] = {
            'values': Collection.facet_by(q, Collection.countries,
                                          mapping=COUNTRY_NAMES)
        }
    if 'category' in requested_facets:
        mapping = get_config('COLLECTION_CATEGORIES', {})
        facets['category'] = {
            'values': Collection.facet_by(q, Collection.category,
                                          mapping=mapping)
        }
    data['facets'] = facets
    return jsonify(data)
def documents(id):
    """List documents related to the given entity."""
    entity, _ = get_entity(id, request.authz.READ)
    state = QueryState(request.args, request.authz)
    return jsonify(entity_documents(combined_entity(entity), state))
def links(id):
    """List links (relationships) of the given entity."""
    # The database object returned alongside the entity dict is unused
    # here; discard it, consistent with the other entity views.
    entity, _ = get_entity(id, request.authz.READ)
    state = QueryState(request.args, request.authz)
    return jsonify(links_query(entity, state))
def index():
    """Run an entity search for the current user."""
    enable_cache(vary_user=True)
    state = QueryState(request.args, request.authz)
    # Optionally include per-entity document counts in the results.
    with_doc_counts = state.getbool('doc_counts')
    return jsonify(entities_query(state, doc_counts=with_doc_counts))
def index(collection_id):
    """List de-duplication leads for a collection the user can read."""
    collection = obj_or_404(Collection.by_id(collection_id))
    request.authz.require(request.authz.collection_read(collection))
    state = QueryState(request.args, request.authz)
    return jsonify(leads_query(collection_id, state))