def records_query(document_id, args, size=5):
    """Build a search request body for records matching the request terms.

    Search terms are collected from the free-text ``q`` argument and from
    the ``terms`` of every entity selected through the ``entity`` argument.
    Each term yields two ``should`` clauses: a boosted match on ``text`` and
    an unboosted match on ``text_latin`` using the latinized term.

    :param document_id: when not ``None``, restrict hits to this document.
    :param args: request arguments; must offer ``get`` and ``getlist``.
    :param size: maximum number of hits to request.
    :returns: the request body as a dict, or ``None`` when there is nothing
        to search for.
    """
    terms = []
    text = args.get('q', '').strip()
    if text:
        terms.append(text)

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        terms.extend(entity.terms)

    if not terms:
        return None

    shoulds = []
    for term in terms:
        shoulds.append({
            'match': {
                'text': {
                    'query': term,
                    'boost': 10,
                    'operator': 'and'
                }
            }
        })
        shoulds.append({
            'match': {
                'text_latin': {
                    'query': latinize_text(term),
                    'operator': 'and'
                }
            }
        })

    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }

    # The snippet length is user supplied: fall back to the default when it
    # is not a usable integer, but let unrelated exceptions (for example
    # KeyboardInterrupt) propagate instead of swallowing them.
    try:
        snippet = int(args.get('snippet', 150))
    except (TypeError, ValueError):
        snippet = 150

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {'fragment_size': snippet},
                'text_latin': {'fragment_size': snippet}
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
def transform_facets(aggregations):
    """Reshape raw aggregation output into source, list and attribute facets.

    :param aggregations: nested dict of aggregation results.
    :returns: a dict with the keys ``sources`` (collection buckets),
        ``lists`` (list id -> entity buckets whose entity could be loaded)
        and ``attributes`` (attribute name -> value buckets).
    """
    filtered = aggregations.get('all', {}).get('ftr', {})
    source_buckets = filtered.get('collections', {}).get('buckets', [])

    lists = {}
    for list_id in get_list_facets(request.args):
        inner = aggregations.get('list_%s' % list_id, {}).get('inner', {})
        buckets = inner.get('entities', {}).get('buckets', [])
        loaded = Entity.by_id_set([bucket.get('key') for bucket in buckets])
        kept = []
        for bucket in buckets:
            # Every bucket is annotated in place, including the ones that
            # are dropped because no entity was found for their key.
            bucket['entity'] = loaded.get(bucket.get('key'))
            if bucket['entity'] is None:
                continue
            kept.append(bucket)
        lists[list_id] = kept

    attributes = {}
    for attr in request.args.getlist('attributefacet'):
        inner = aggregations.get('attr_%s' % attr, {}).get('inner', {})
        attributes[attr] = inner.get('values', {}).get('buckets', [])

    return {
        'sources': source_buckets,
        'lists': lists,
        'attributes': attributes
    }
def format_results(query):
    """Flatten every hit produced by ``raw_iter(query)`` into a plain dict.

    Dict-valued fields and fields named in ``SKIP_FIELDS`` are left out.
    Entity ids are replaced by a comma-separated list of entity names,
    other sequences are joined with ``', '`` and ``source_id`` is replaced
    by the label of the source.

    :param query: passed through unchanged to ``raw_iter``.
    :returns: a list with one dict per hit.
    """
    source_labels = {}  # source id -> label, shared across all rows
    entity_names = {}   # entity id -> name, shared across all rows
    rows = []
    for hit in raw_iter(query):
        fields = hit.get('_source')
        flat = {}
        for name, value in fields.items():
            if isinstance(value, dict):
                continue
            if name in SKIP_FIELDS:
                continue

            if name == 'entities':
                # Only look up the ids that were not seen on earlier rows.
                missing = [eid for eid in value if eid not in entity_names]
                if missing:
                    loaded = Entity.by_id_set(missing)
                    for entity_id, entity in loaded.items():
                        entity_names[entity_id] = entity.name
                known = (entity_names.get(eid) for eid in value)
                value = ', '.join([n for n in known if n is not None])

            if isinstance(value, (list, tuple, set)):
                value = ', '.join(value)

            if name == 'source_id':
                # WARNING: don't do one query per row; labels are cached.
                if value not in source_labels:
                    source = Source.by_id(value)
                    if source is not None:
                        source_labels[value] = source.label
                    else:
                        source_labels[value] = '[Deleted source %s]' % value
                value = source_labels[value]

            flat[name] = value
        rows.append(flat)
    return rows
def transform_facets(aggregations):
    """Turn raw aggregation results into the facet structure for a response.

    :param aggregations: nested dict of aggregation results.
    :returns: ``{'sources': ..., 'lists': ..., 'attributes': ...}`` where
        ``sources`` holds the collection buckets, ``lists`` maps each list
        id to the entity buckets that resolved to an entity, and
        ``attributes`` maps each requested attribute to its value buckets.
    """
    def inner_buckets(agg_name, field):
        # Buckets stored under <agg_name>.inner.<field>, or an empty list.
        inner = aggregations.get(agg_name, {}).get('inner', {})
        return inner.get(field, {}).get('buckets', [])

    collections = aggregations.get('all', {}).get('ftr', {})
    collections = collections.get('collections', {})
    sources = collections.get('buckets', [])

    lists = {}
    for list_id in get_list_facets(request.args):
        buckets = inner_buckets('list_%s' % list_id, 'entities')
        by_id = Entity.by_id_set([b.get('key') for b in buckets])
        resolved = []
        for bucket in buckets:
            # The bucket dict is mutated even when the lookup yields None.
            bucket['entity'] = by_id.get(bucket.get('key'))
            if bucket['entity'] is not None:
                resolved.append(bucket)
        lists[list_id] = resolved

    attributes = {}
    for attr in request.args.getlist('attributefacet'):
        attributes[attr] = inner_buckets('attr_%s' % attr, 'values')

    return {'sources': sources, 'lists': lists, 'attributes': attributes}
def records_query(document_id, args, size=5, snippet_size=100):
    """Build a search request body for records matching the request.

    One ``should`` clause comes from the free-text ``q`` argument (via
    ``text_query_string``) and one ``multi_match`` clause is added for each
    term of each entity selected through the ``entity`` argument.

    :param document_id: when not ``None``, restrict hits to this document.
    :param args: request arguments; must offer ``get`` and ``getlist``.
    :param size: maximum number of hits to request.
    :param snippet_size: default highlight fragment size; a valid integer
        ``snippet`` request argument overrides it.
    :returns: the request body as a dict, or ``None`` when there is nothing
        to search for.
    """
    shoulds = []
    text = args.get('q', '').strip()
    if text:
        shoulds.append(text_query_string(text))

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        for term in entity.terms:
            shoulds.append({
                'multi_match': {
                    'query': term,
                    'type': "best_fields",
                    'fields': ['text^5', 'text_latin'],
                    'operator': 'AND'
                }
            })

    if not shoulds:
        return None

    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }

    # The snippet length is user supplied: keep the default when it is not
    # a usable integer, but do not swallow unrelated exceptions such as
    # KeyboardInterrupt the way a bare ``except`` would.
    try:
        snippet_size = int(args.get('snippet', snippet_size))
    except (TypeError, ValueError):
        pass

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                },
                'text_latin': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                }
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
def convert_entities(entities):
    """Convert an entity aggregation into a list of counted references.

    :param entities: aggregation dict; its ``buckets`` list is read.
    :returns: ``{'values': [...]}`` with one ``to_ref()`` dict per bucket
        whose key resolved to an entity, extended with the bucket's
        ``doc_count`` under ``count``.
    """
    buckets = entities.get('buckets', [])
    loaded = Entity.by_id_set([bucket.get('key') for bucket in buckets])

    values = []
    for bucket in buckets:
        match = loaded.get(bucket.get('key'))
        if match is not None:
            ref = match.to_ref()
            ref['count'] = bucket.get('doc_count')
            values.append(ref)
    return {'values': values}
def convert_entities(entities):
    """Convert an entity aggregation into a list of counted summaries.

    :param entities: aggregation dict; its ``buckets`` list is read.
    :returns: a list with one dict (``id``, ``name``, ``$schema``,
        ``count``) per bucket whose key resolved to an entity; buckets
        without a matching entity are skipped.
    """
    buckets = entities.get('buckets', [])
    loaded = Entity.by_id_set([bucket.get('key') for bucket in buckets])

    # Pair each bucket with its entity (or None) in bucket order.
    pairs = ((loaded.get(bucket.get('key')), bucket) for bucket in buckets)
    return [
        {
            'id': found.id,
            'name': found.name,
            '$schema': found.type,
            'count': bucket.get('doc_count')
        }
        for found, bucket in pairs
        if found is not None
    ]
def process_row(row, attributes):
    """Extract the requested attributes from one search hit.

    For each name the value is read from the hit's ``_source``; an entry
    with the same name in the nested ``attributes`` list takes precedence,
    and the last such entry wins. ``entities`` values are replaced by the
    joined labels of the loaded entities, ``collection`` values by the text
    form of the matching source (or of the raw value when none is found).

    :param row: a hit dict holding the document under ``_source``.
    :param attributes: iterable of attribute names to extract.
    :returns: a dict mapping each requested name to its value.
    """
    source = row.get('_source')
    extracted = {}
    for name in attributes:
        value = source.get(name)

        overrides = [
            item.get('value')
            for item in source.get('attributes', [])
            if item.get('name') == name
        ]
        if overrides:
            value = overrides[-1]

        if name == 'entities':
            entity_ids = [ref.get('id') for ref in value]
            loaded = Entity.by_id_set(entity_ids)
            value = ', '.join([entity.label for entity in loaded.values()])

        if name == 'collection':
            # WARNING: don't do one query per row
            value = unicode(Source.by_slug(value) or value)

        extracted[name] = value
    return extracted
def records_query(document_id, args, size=5, snippet_size=100):
    """Build a search request body for records matching the request.

    The free-text ``q`` argument contributes one ``should`` clause (via
    ``text_query_string``); every term of every entity selected through
    the ``entity`` argument contributes one ``multi_match`` clause.

    :param document_id: when not ``None``, restrict hits to this document.
    :param args: request arguments; must offer ``get`` and ``getlist``.
    :param size: maximum number of hits to request.
    :param snippet_size: default highlight fragment size; a valid integer
        ``snippet`` request argument overrides it.
    :returns: the request body as a dict, or ``None`` when there is nothing
        to search for.
    """
    shoulds = []
    text = args.get('q', '').strip()
    if text:
        shoulds.append(text_query_string(text))

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        for term in entity.terms:
            shoulds.append({
                'multi_match': {
                    'query': term,
                    'type': "best_fields",
                    'fields': ['text^5', 'text_latin'],
                    'operator': 'AND'
                }
            })

    if not shoulds:
        return None

    q = {'bool': {'minimum_should_match': 1, 'should': shoulds}}
    if document_id is not None:
        q['bool']['must'] = {'term': {'document_id': document_id}}

    # A malformed ``snippet`` argument keeps the default size. Only the
    # conversion errors are caught so that unrelated exceptions (for
    # example KeyboardInterrupt) are no longer silently discarded.
    try:
        snippet_size = int(args.get('snippet', snippet_size))
    except (TypeError, ValueError):
        pass

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                },
                'text_latin': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                }
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
def records_query_shoulds(args):
    """Collect the ``should`` clauses for a records search.

    :param args: request arguments; must offer ``get`` and ``getlist``.
    :returns: a list holding the free-text clause (when ``q`` is not blank)
        followed by one ``multi_match`` clause per term of each entity
        selected through the ``entity`` argument.
    """
    clauses = []

    needle = args.get('q', '').strip()
    if needle:
        clauses.append(text_query_string(needle))

    selected = Entity.by_id_set(args.getlist('entity'))
    clauses.extend(
        {
            'multi_match': {
                'query': term,
                'type': "best_fields",
                'fields': ['text^5', 'text_latin'],
                'operator': 'AND'
            }
        }
        for entity in selected.values()
        for term in entity.terms
    )
    return clauses
def records_query_shoulds(args):
    """Assemble the optional (``should``) clauses of a records search.

    :param args: request arguments; must offer ``get`` and ``getlist``.
    :returns: a list of clause dicts: the text clause for a non-blank ``q``
        first, then a ``multi_match`` clause for every term of every entity
        named by the ``entity`` argument.
    """
    query_text = args.get("q", "").strip()
    shoulds = [text_query_string(query_text)] if query_text else []

    def term_clause(term):
        # Match the term against both text fields, favouring ``text``.
        return {
            "multi_match": {
                "query": term,
                "type": "best_fields",
                "fields": ["text^5", "text_latin"],
                "operator": "AND",
            }
        }

    for entity in Entity.by_id_set(args.getlist("entity")).values():
        shoulds.extend(term_clause(term) for term in entity.terms)
    return shoulds
def entities(self):
    """Return the entities named by the ``filter:entities.id`` arguments.

    The lookup is restricted to ``self.authz.collections_read`` and its
    result is stored on ``self._entities``, so later calls reuse it.
    """
    if hasattr(self, '_entities'):
        return self._entities

    readable = self.authz.collections_read
    entity_ids = self.getlist('filter:entities.id')
    self._entities = Entity.by_id_set(entity_ids, collections=readable)
    return self._entities
def expand(self, keys):
    """Map each key to a reference dict for the matching entity.

    :param keys: entity ids to expand.
    :returns: a dict in which every key starts out as ``{'label': None}``
        and each entity that could be loaded is stored under its ``id``
        as the result of its ``to_ref()``.
    """
    expanded = {}
    for key in keys:
        expanded[key] = {'label': None}

    found = Entity.by_id_set(keys)
    expanded.update((entity.id, entity.to_ref()) for entity in found.values())
    return expanded