Exemplo n.º 1
0
def records_query(document_id, args, size=5):
    """Build the search request body for records matching the query args.

    Search terms are collected from the free-text ``q`` argument and from
    the ``terms`` of every entity selected through the ``entity`` argument.
    Each term is matched against ``text`` (boosted) and, after
    ``latinize_text``, against ``text_latin``; at least one of these
    clauses has to match.

    :param document_id: if not ``None``, restrict matches to this document.
    :param args: request arguments; must offer ``get`` and ``getlist``.
    :param size: value placed in the ``size`` field of the request body.
    :returns: the request body as a dict, or ``None`` if there are no terms.
    """
    terms = []
    text = args.get('q', '').strip()
    if text:
        terms.append(text)

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        terms.extend(entity.terms)

    if not terms:
        return None

    shoulds = []
    for term in terms:
        shoulds.append({
            'match': {
                'text': {
                    'query': term,
                    'boost': 10,
                    'operator': 'and'
                }
            }
        })
        shoulds.append({
            'match': {
                'text_latin': {
                    'query': latinize_text(term),
                    'operator': 'and'
                }
            }
        })

    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }

    try:
        snippet = int(args.get('snippet', 150))
    except (TypeError, ValueError):
        # A value that cannot be read as an integer falls back to the
        # default; other exceptions are no longer swallowed.
        snippet = 150

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {'fragment_size': snippet},
                'text_latin': {'fragment_size': snippet}
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
Exemplo n.º 2
0
def transform_facets(aggregations):
    """Reshape raw aggregation output into source, list and attribute facets.

    Which list and attribute facets are read is decided by the current
    ``request.args``. Entity buckets are annotated in place with an
    ``'entity'`` key; only buckets whose entity could be loaded are kept.

    :param aggregations: dict of aggregation results keyed by name.
    :returns: dict with the keys ``sources``, ``lists`` and ``attributes``.
    """
    everything = aggregations.get('all', {})
    collections = everything.get('ftr', {}).get('collections', {})
    sources = collections.get('buckets', [])

    lists = {}
    for list_id in get_list_facets(request.args):
        inner = aggregations.get('list_%s' % list_id, {}).get('inner', {})
        buckets = inner.get('entities', {}).get('buckets', [])
        loaded = Entity.by_id_set([bucket.get('key') for bucket in buckets])
        kept = []
        for bucket in buckets:
            # Every bucket is annotated, even when the lookup came up empty.
            bucket['entity'] = loaded.get(bucket.get('key'))
            if bucket['entity'] is None:
                continue
            kept.append(bucket)
        lists[list_id] = kept

    attributes = {}
    for attr in request.args.getlist('attributefacet'):
        inner = aggregations.get('attr_%s' % attr, {}).get('inner', {})
        attributes[attr] = inner.get('values', {}).get('buckets', [])

    facets = {}
    facets['sources'] = sources
    facets['lists'] = lists
    facets['attributes'] = attributes
    return facets
Exemplo n.º 3
0
def format_results(query):
    """Flatten the hits produced by ``raw_iter(query)`` into plain dicts.

    For every hit, each ``_source`` field that is not a dict and not listed
    in ``SKIP_FIELDS`` is copied into the row dict. Entity ids are replaced
    by a comma-separated list of entity names, other sequences are joined
    with ``', '``, and ``source_id`` is replaced by the source label.
    Entity names and source labels are cached across rows so each one is
    looked up at most once.

    :param query: passed through to ``raw_iter``.
    :returns: list with exactly one dict per hit.
    """
    sources = {}
    entities = {}
    results = []
    for row in raw_iter(query):
        src = row.get('_source')
        data = {}
        for name, value in src.items():
            if isinstance(value, dict) or name in SKIP_FIELDS:
                continue
            if name == 'entities':
                # Only fetch the entities that are not cached yet.
                load_ids = [e for e in value if e not in entities]
                if load_ids:
                    loaded = Entity.by_id_set(load_ids)
                    for entity_id, ent in loaded.items():
                        entities[entity_id] = ent.name

                value = ', '.join([entities.get(e) for e in value
                                   if entities.get(e) is not None])
            if isinstance(value, (list, tuple, set)):
                value = ', '.join(value)
            if name == 'source_id':
                # WARNING: don't do one query per row
                if value not in sources:
                    source = Source.by_id(value)
                    if source is None:
                        sources[value] = '[Deleted source %s]' % value
                    else:
                        sources[value] = source.label
                value = sources[value]
            data[name] = value
        # Append once per hit; appending inside the field loop duplicated
        # the row for every field it contained.
        results.append(data)
    return results
Exemplo n.º 4
0
def transform_facets(aggregations):
    """Turn raw aggregation results into the facet structure that is returned.

    The list ids come from ``get_list_facets(request.args)`` and the
    attribute names from the ``attributefacet`` request argument. Each
    entity bucket gets an ``'entity'`` key set in place; buckets whose
    entity was not found are left out of the result.

    :param aggregations: dict of aggregation results keyed by name.
    :returns: ``{'sources': ..., 'lists': ..., 'attributes': ...}``.
    """
    source_agg = aggregations.get('all', {}).get('ftr', {})
    source_buckets = source_agg.get('collections', {}).get('buckets', [])

    list_facets = {}
    for list_id in get_list_facets(request.args):
        list_agg = aggregations.get('list_%s' % list_id, {})
        entity_agg = list_agg.get('inner', {}).get('entities', {})
        buckets = entity_agg.get('buckets', [])
        known = Entity.by_id_set([item.get('key') for item in buckets])
        resolved = []
        for item in buckets:
            # The bucket is tagged regardless of whether the entity exists.
            item['entity'] = known.get(item.get('key'))
            if item['entity'] is not None:
                resolved.append(item)
        list_facets[list_id] = resolved

    attribute_facets = {}
    for attr in request.args.getlist('attributefacet'):
        attr_agg = aggregations.get('attr_%s' % attr, {})
        value_agg = attr_agg.get('inner', {}).get('values', {})
        attribute_facets[attr] = value_agg.get('buckets', [])

    return {
        'sources': source_buckets,
        'lists': list_facets,
        'attributes': attribute_facets
    }
Exemplo n.º 5
0
Arquivo: records.py Projeto: 01-/aleph
def records_query(document_id, args, size=5, snippet_size=100):
    """Build the search request body for records matching the query args.

    The ``should`` clauses are the result of ``text_query_string`` for the
    free-text ``q`` argument (when non-empty) plus one ``multi_match``
    clause over ``text`` (boosted) and ``text_latin`` for every term of
    every entity selected through the ``entity`` argument. At least one
    clause has to match.

    :param document_id: if not ``None``, restrict matches to this document.
    :param args: request arguments; must offer ``get`` and ``getlist``.
    :param size: value placed in the ``size`` field of the request body.
    :param snippet_size: highlight fragment size used unless ``args``
        carries an integer ``snippet`` value.
    :returns: the request body as a dict, or ``None`` if there is nothing
        to search for.
    """
    shoulds = []
    text = args.get('q', '').strip()
    if text:
        shoulds.append(text_query_string(text))

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        for term in entity.terms:
            shoulds.append({
                'multi_match': {
                    'query': term,
                    'type': "best_fields",
                    'fields': ['text^5', 'text_latin'],
                    'operator': 'AND'
                }
            })

    if not shoulds:
        return None

    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }

    try:
        snippet_size = int(args.get('snippet', snippet_size))
    except (TypeError, ValueError):
        # Keep the snippet_size passed by the caller when the argument
        # cannot be read as an integer; other exceptions now propagate.
        pass

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                },
                'text_latin': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                }
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
Exemplo n.º 6
0
def convert_entities(entities):
    """Convert an entity aggregation into a list of counted entity refs.

    Buckets whose key does not resolve to an entity are dropped.

    :param entities: aggregation dict that may carry a ``buckets`` list.
    :returns: ``{'values': [...]}`` where each item is ``entity.to_ref()``
        extended with a ``count`` taken from the bucket's ``doc_count``.
    """
    buckets = entities.get('buckets', [])
    by_id = Entity.by_id_set([bucket.get('key') for bucket in buckets])
    values = []
    for bucket in buckets:
        found = by_id.get(bucket.get('key'))
        if found is not None:
            ref = found.to_ref()
            ref['count'] = bucket.get('doc_count')
            values.append(ref)
    return {'values': values}
Exemplo n.º 7
0
def convert_entities(entities):
    """Map the buckets of an entity aggregation to entity refs with counts.

    :param entities: aggregation dict that may carry a ``buckets`` list.
    :returns: dict with a single ``values`` key holding, for each bucket
        whose key resolves to an entity, that entity's ``to_ref()`` dict
        plus the bucket's ``doc_count`` stored under ``count``.
    """
    bucket_list = entities.get('buckets', [])
    keys = [item.get('key') for item in bucket_list]
    lookup = Entity.by_id_set(keys)

    converted = []
    for item in bucket_list:
        match = lookup.get(item.get('key'))
        if match is None:
            # Nothing was loaded for this key, so it is not reported.
            continue
        ref = match.to_ref()
        ref['count'] = item.get('doc_count')
        converted.append(ref)
    return {'values': converted}
Exemplo n.º 8
0
def convert_entities(entities):
    """Convert an entity aggregation into a list of entity summaries.

    :param entities: aggregation dict that may carry a ``buckets`` list.
    :returns: list of dicts with ``id``, ``name``, ``$schema`` and ``count``
        for every bucket whose key resolves to an entity.
    """
    buckets = entities.get('buckets', [])
    by_id = Entity.by_id_set([bucket.get('key') for bucket in buckets])
    summaries = []
    for bucket in buckets:
        found = by_id.get(bucket.get('key'))
        if found is not None:
            summary = {
                'id': found.id,
                'name': found.name,
                '$schema': found.type,
                'count': bucket.get('doc_count')
            }
            summaries.append(summary)
    return summaries
Exemplo n.º 9
0
def process_row(row, attributes):
    """Extract the requested attributes from a search hit as a flat dict.

    Each name is first read from the hit's ``_source``; an entry in the
    source's ``attributes`` list with the same ``name`` overrides it.
    ``entities`` is rendered as a comma-separated list of entity labels and
    ``collection`` as the text form of the matching source (or the raw
    value when no source is found).

    :param row: search hit carrying a ``_source`` dict.
    :param attributes: iterable of attribute names to extract.
    :returns: dict mapping each requested name to its value.
    """
    src = row.get('_source')
    # The attribute list is the same for every name, so read it once.
    extra = src.get('attributes', [])
    data = {}
    for name in attributes:
        value = src.get(name)
        for attr in extra:
            if attr.get('name') == name:
                value = attr.get('value')
        if name == 'entities':
            # ``value`` is None when the hit has no entities; treat that as
            # an empty list instead of failing on iteration.
            objs = Entity.by_id_set([e.get('id') for e in value or []])
            value = ', '.join([o.label for o in objs.values()])
        if name == 'collection':
            # WARNING: don't do one query per row
            value = unicode(Source.by_slug(value) or value)
        data[name] = value
    return data
Exemplo n.º 10
0
def process_row(row, attributes):
    """Extract the requested attributes from a search hit as a flat dict.

    Each name is first read from the hit's ``_source``; an entry in the
    source's ``attributes`` list with the same ``name`` overrides it.
    ``entities`` is rendered as a comma-separated list of entity labels and
    ``collection`` as the text form of the matching source (or the raw
    value when no source is found).

    :param row: search hit carrying a ``_source`` dict.
    :param attributes: iterable of attribute names to extract.
    :returns: dict mapping each requested name to its value.
    """
    src = row.get('_source')
    # The attribute list is the same for every name, so read it once.
    extra = src.get('attributes', [])
    data = {}
    for name in attributes:
        value = src.get(name)
        for attr in extra:
            if attr.get('name') == name:
                value = attr.get('value')
        if name == 'entities':
            # ``value`` is None when the hit has no entities; treat that as
            # an empty list instead of failing on iteration.
            objs = Entity.by_id_set([e.get('id') for e in value or []])
            value = ', '.join([o.label for o in objs.values()])
        if name == 'collection':
            # WARNING: don't do one query per row
            value = unicode(Source.by_slug(value) or value)
        data[name] = value
    return data
Exemplo n.º 11
0
def records_query(document_id, args, size=5, snippet_size=100):
    """Build the search request body for records matching the query args.

    The ``should`` clauses are the result of ``text_query_string`` for the
    free-text ``q`` argument (when non-empty) plus one ``multi_match``
    clause over ``text`` (boosted) and ``text_latin`` for every term of
    every entity selected through the ``entity`` argument. At least one
    clause has to match.

    :param document_id: if not ``None``, restrict matches to this document.
    :param args: request arguments; must offer ``get`` and ``getlist``.
    :param size: value placed in the ``size`` field of the request body.
    :param snippet_size: highlight fragment size used unless ``args``
        carries an integer ``snippet`` value.
    :returns: the request body as a dict, or ``None`` if there is nothing
        to search for.
    """
    shoulds = []
    text = args.get('q', '').strip()
    if text:
        shoulds.append(text_query_string(text))

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        for term in entity.terms:
            shoulds.append({
                'multi_match': {
                    'query': term,
                    'type': "best_fields",
                    'fields': ['text^5', 'text_latin'],
                    'operator': 'AND'
                }
            })

    if not shoulds:
        return None

    q = {'bool': {'minimum_should_match': 1, 'should': shoulds}}
    if document_id is not None:
        q['bool']['must'] = {'term': {'document_id': document_id}}

    try:
        snippet_size = int(args.get('snippet', snippet_size))
    except (TypeError, ValueError):
        # Keep the snippet_size passed by the caller when the argument
        # cannot be read as an integer; other exceptions now propagate.
        pass

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                },
                'text_latin': {
                    'fragment_size': snippet_size,
                    'number_of_fragments': 1
                }
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
Exemplo n.º 12
0
def records_query_shoulds(args):
    """Collect the ``should`` clauses for a records query.

    :param args: request arguments; must offer ``get`` and ``getlist``.
    :returns: list holding the ``text_query_string`` clause for a non-empty
        ``q`` argument, followed by one ``multi_match`` clause per term of
        each entity selected through the ``entity`` argument.
    """
    clauses = []
    query_text = args.get('q', '').strip()
    if query_text:
        clauses.append(text_query_string(query_text))

    selected = Entity.by_id_set(args.getlist('entity'))
    for entity in selected.values():
        for term in entity.terms:
            multi_match = {
                'query': term,
                'type': "best_fields",
                'fields': ['text^5', 'text_latin'],
                'operator': 'AND'
            }
            clauses.append({'multi_match': multi_match})
    return clauses
Exemplo n.º 13
0
def records_query_shoulds(args):
    """Collect the ``should`` clauses for a records query.

    :param args: request arguments; must offer ``get`` and ``getlist``.
    :returns: list holding the ``text_query_string`` clause for a non-empty
        ``q`` argument, followed by one ``multi_match`` clause per term of
        each entity selected through the ``entity`` argument.
    """
    clauses = []
    query_text = args.get("q", "").strip()
    if query_text:
        clauses.append(text_query_string(query_text))

    selected = Entity.by_id_set(args.getlist("entity"))
    for entity in selected.values():
        for term in entity.terms:
            multi_match = {
                "query": term,
                "type": "best_fields",
                "fields": ["text^5", "text_latin"],
                "operator": "AND",
            }
            clauses.append({"multi_match": multi_match})
    return clauses
Exemplo n.º 14
0
 def entities(self):
     """Return the entities named by ``filter:entities.id``.

     The lookup is limited to ``self.authz.collections_read`` and its
     result is stored on ``self._entities``, so it runs only once per
     instance.
     """
     if hasattr(self, '_entities'):
         return self._entities
     readable = self.authz.collections_read
     entity_ids = self.getlist('filter:entities.id')
     self._entities = Entity.by_id_set(entity_ids, collections=readable)
     return self._entities
Exemplo n.º 15
0
 def expand(self, keys):
     """Map each key to an entity ref, or to a ``{'label': None}`` stub.

     Every key starts out with the stub; entries are then overwritten
     with ``to_ref()`` for each entity that ``Entity.by_id_set`` returns.
     """
     expanded = {}
     for key in keys:
         expanded[key] = {'label': None}
     found = Entity.by_id_set(keys)
     for entity in found.values():
         expanded[entity.id] = entity.to_ref()
     return expanded