Example #1
def suggest_entities(args):
    """Auto-complete API."""
    text = args.get('prefix')
    min_count = int(args.get('min_count', 0))
    options = []
    if text is not None and len(text.strip()):
        q = {
            'bool': {
                'must': [
                    {'match_phrase_prefix': {'terms': text.strip()}},
                    {'range': {'doc_count': {'gte': min_count}}}
                ]
            }
        }
        q = {
            'size': 5,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': authz_collections_filter(q),
            '_source': ['name', '$schema', 'terms', 'doc_count']
        }
        ref = latinize_text(text)
        result = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                                 body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [latinize_text(t) for t in ent.pop('terms', [])]
            ent['match'] = ref in terms
            ent['id'] = res.get('_id')
            options.append(ent)
    return {
        'text': text,
        'results': options
    }
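All of these snippets lean on latinize_text to transliterate non-Latin scripts, so that a romanised query can match the original terms. A minimal sketch of the behaviour behind the 'match' flag above, assuming the normality package provides the helper (aleph's own wrapper may differ slightly):

# Illustrative only: latinize_text renders text in the Latin alphabet.
from normality import latinize_text

ref = latinize_text('Владимир')                  # roughly 'vladimir'
terms = [latinize_text(t) for t in ['Владимир', 'Volodymyr']]
print(ref in terms)                              # True: the 'match' flag above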
Example #2
def suggest_entities(prefix, min_count=0, schemas=None, size=5):
    """Auto-complete API."""
    options = []
    if prefix is not None and len(prefix.strip()):
        q = {
            'match_phrase_prefix': {'terms': prefix.strip()}
        }
        if min_count > 0:
            q = add_filter(q, {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            q = add_filter(q, {'terms': {'$schema': schemas}})
        q = {
            'size': size,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': authz_filter(q),
            '_source': ['name', '$schema', 'terms', 'doc_count']
        }
        ref = latinize_text(prefix)
        result = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                                 body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [latinize_text(t) for t in ent.pop('terms', [])]
            ent['match'] = ref in terms
            ent['score'] = res.get('_score')
            ent['id'] = res.get('_id')
            options.append(ent)
    return {
        'prefix': prefix,
        'results': options
    }
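add_filter is not shown on this page. A plausible reconstruction, inferred purely from how it is called above (hypothetical, not the project's actual helper): wrap the query in a bool clause once, then accumulate filters alongside it.

def add_filter(q, filter_):
    # Hypothetical: wrap the original query in a bool clause on first
    # use, then append each new filter to the bool's filter list.
    if 'bool' not in q:
        q = {'bool': {'must': [q], 'filter': []}}
    q['bool'].setdefault('filter', []).append(filter_)
    return q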
Example #3
def suggest_entities(prefix, authz, min_count=0, schemas=None, size=5):
    """Auto-complete API."""
    options = []
    if prefix is not None and len(prefix.strip()):
        q = {
            'match_phrase_prefix': {'name': prefix.strip()}
        }
        if min_count > 0:
            q = add_filter(q, {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            q = add_filter(q, {'terms': {'$schema': schemas}})

        # TODO: is this correct? should we allow filter by dataset entities?
        q = add_filter(q, {'terms': {'collection_id': authz.collections_read}})

        q = {
            'size': size,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': q,
            '_source': ['name', 'schema', 'fingerprints', 'doc_count']
        }
        ref = latinize_text(prefix)
        result = es.search(index=es_index, doc_type=TYPE_ENTITY, body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [latinize_text(t) for t in ent.pop('fingerprints', [])]
            ent['match'] = ref in terms
            ent['score'] = res.get('_score')
            ent['id'] = res.get('_id')
            options.append(ent)
    return {
        'prefix': prefix,
        'results': options
    }
Example #4
def index_document(document, index_records=True):
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['text'] = get_text(document)
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data, id=document.id)

    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))
Example #5
def index_document(document, index_records=True):
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                   id=document.id)

    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))
Example #6
def index_entity(entity):
    """Index an entity."""
    data = entity.to_dict()
    data.pop('id', None)
    data['doc_count'] = get_count(entity)
    data['terms'] = entity.terms
    data['terms_latin'] = [latinize_text(t) for t in entity.terms]
    data['name_latin'] = latinize_text(data.get('name'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    data['description_latin'] = latinize_text(data.get('description'))
    data = expand_json(data)
    get_es().index(index=get_es_index(), doc_type=TYPE_ENTITY,
                   id=entity.id, body=data)
Example #7
def generate_records(document):
    """Generate index records, based on document rows or pages."""
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            tid = sha1(str(document.id))
            tid.update(str(page.id))
            tid = tid.hexdigest()

            text = string_value(page.text)
            latin = latinize_text(text)

            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'page': page.number,
                    'text': text,
                    'text_latin': latin
                }
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            data = {k: string_value(v) for (k, v) in record.data.items()}

            text = [v for v in data.values() if v is not None]
            latin = [latinize_text(t) for t in text]
            latin = [t for t in latin if t not in text]

            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_id,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': text,
                    'text_latin': latin,
                    'raw': data
                }
            }
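Note that sha1(str(document.id)) is a Python 2 idiom; under Python 3, hashlib.sha1 only accepts bytes. A bytes-safe sketch of the same record-ID derivation:

from hashlib import sha1

def make_record_id(document_id, page_id):
    # Same derivation as above, but encoded to bytes for Python 3.
    tid = sha1(str(document_id).encode('utf-8'))
    tid.update(str(page_id).encode('utf-8'))
    return tid.hexdigest()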
Example #8
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                   id=document.id)

    clear_records(document)
    bulk_op(generate_records(document))
Example #9
def get_text(document):
    """Generate an array with the full text of the given document.

    This will limit document length to TEXT_MAX_LEN in order to avoid
    uploading extremely long documents.
    """
    texts = []
    for text in document.text_parts():
        text = string_value(text)
        texts.append(text)
        latin = latinize_text(text)
        if latin != text:
            texts.append(latin)

        text_len = sum((len(t) for t in texts))
        # First, try getting rid of duplicate entries, which are more likely in
        # tabular documents. If that does not help, partial text will be
        # returned.
        if text_len >= TEXT_MAX_LEN:
            texts = list(set(texts))

            text_len = sum((len(t) for t in texts))
            if text_len >= TEXT_MAX_LEN:
                return texts

    return texts
Example #10
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(),
                   doc_type=TYPE_DOCUMENT,
                   body=data,
                   id=document.id)

    clear_records(document)
    bulk_op(generate_records(document))
Example #11
def generate_records(document):
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            tid = sha1(str(document.id))
            tid.update(str(page.id))
            tid = tid.hexdigest()
            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_ids,
                    'page': page.number,
                    'text': page.text,
                    'text_latin': latinize_text(page.text)
                }
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            text = record.text
            latin = [latinize_text(t) for t in text]
            latin = [t for t in latin if t not in text]
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_ids,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': text,
                    'text_latin': latin,
                    'raw': record.data
                }
            }
Example #12
def pending(id):
    collection = obj_or_404(Collection.by_id(id))
    authz.require(authz.collection_read(id))
    q = collection.pending_entities()
    q = q.limit(30)
    entities = []
    for entity in q.all():
        data = entity.to_dict()
        data['name_latin'] = latinize_text(entity.name, lowercase=False)
        entities.append(data)
    return jsonify({'results': entities, 'total': len(entities)})
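obj_or_404 is not defined in these snippets; it presumably follows the common Flask pattern of aborting when a lookup comes back empty (an assumption, not the project's actual code):

from flask import abort

def obj_or_404(obj):
    # Return the object, or abort with HTTP 404 if the lookup found nothing.
    if obj is None:
        abort(404)
    return obj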
Example #13
def pending(id):
    collection = obj_or_404(Collection.by_id(id))
    request.authz.require(request.authz.collection_read(collection))
    q = collection.pending_entities()
    q = q.limit(30)
    entities = []
    for entity in q.all():
        data = entity.to_dict()
        data['name_latin'] = latinize_text(entity.name)
        entities.append(data)
    return jsonify({'results': entities, 'total': len(entities)})
Example #14
def similar_entities(entity, args, collections):
    """Merge suggestions API."""
    shoulds = []
    for term in entity.terms:
        shoulds.append({
            'multi_match': {
                "fields": ["name^50", "terms^25", "summary^5"],
                "query": term,
                "fuzziness": 2
            }
        })
        shoulds.append({
            'multi_match': {
                "fields": ["name_latin^10", "terms_latin^5", "summary_latin"],
                "query": latinize_text(term),
                "fuzziness": 2
            }
        })

    q = {
        "bool": {
            "should": shoulds,
            "must_not": {
                "ids": {
                    "values": [entity.id]
                }
            },
            "must": {
                "terms": {
                    "collection_id": collections
                }
            },
            "minimum_should_match": 1
        }
    }
    q = {
        'size': 10,
        'query': authz_filter(q),
        '_source': DEFAULT_FIELDS
    }
    options = []
    result = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                             body=q)
    for res in result.get('hits', {}).get('hits', []):
        entity = res.get('_source')
        entity['id'] = res.get('_id')
        entity['score'] = res.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=res.get('_id'))
        options.append(entity)
    return {
        'results': options
    }
Example #15
def text_query_string(text, literal=False):
    if text is None or not len(text.strip()):
        return match_all()
    if literal:
        text = '"%s"' % latinize_text(text)
    return {
        'query_string': {
            'query': text,
            'fields': ['text^6', 'text_latin^2'],
            'default_operator': 'AND',
            'use_dis_max': True
        }
    }
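match_all() is referenced but never shown. In the Elasticsearch query DSL the catch-all query is {'match_all': {}}, so the helper is presumably just:

def match_all():
    # Catch-all query, used when no search text was supplied.
    return {'match_all': {}}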
Example #16
def index_document(document_id, index_records=True):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(),
                   doc_type=TYPE_DOCUMENT,
                   body=data,
                   id=document.id)

    if index_records:
        clear_records(document)
        bulk(get_es(),
             generate_records(document),
             stats_only=True,
             chunk_size=1000,
             request_timeout=500.0)
Example #17
def pending():
    q = db.session.query(Entity)
    q = q.filter(Entity.state == Entity.STATE_PENDING)
    clause = Collection.id.in_(authz.collections(authz.READ))
    q = q.filter(Entity.collections.any(clause))
    ref = aliased(Reference)
    q = q.join(ref)
    q = q.group_by(Entity)
    q = q.order_by(func.sum(ref.weight).desc())
    entity = q.first()
    if entity is None:
        return jsonify({'empty': True})
    data = entity.to_dict()
    data['name_latin'] = latinize_text(data['name'], lowercase=False)
    return jsonify(data)
Example #18
def meta_query_string(text, literal=False):
    if text is None or not len(text.strip()):
        return match_all()
    if literal:
        text = '"%s"' % latinize_text(text)
    return {
        "query_string": {
            "query": text,
            "fields": ['title^15', 'file_name',
                       'summary^10', 'title_latin^12',
                       'summary_latin^8'],
            "default_operator": "AND",
            "use_dis_max": True
        }
    }
Example #19
def similar_entities(entity, args):
    """Merge suggestions API."""
    shoulds = []
    for term in entity.terms:
        shoulds.append({
            'multi_match': {
                "fields": ["name^50", "terms^25", "summary^5"],
                "query": term,
                "fuzziness": 2
            }
        })
        shoulds.append({
            'multi_match': {
                "fields": ["name_latin^10", "terms_latin^5", "summary_latin"],
                "query": latinize_text(term),
                "fuzziness": 2
            }
        })

    q = {
        "bool": {
            "should": shoulds,
            "must_not": {
                "ids": {
                    "values": [entity.id]
                }
            },
            "minimum_should_match": 1
        }
    }
    q = {
        'size': 10,
        'query': authz_collections_filter(q),
        '_source': DEFAULT_FIELDS
    }
    options = []
    result = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                             body=q)
    for res in result.get('hits', {}).get('hits', []):
        entity = res.get('_source')
        entity['id'] = res.get('_id')
        entity['score'] = res.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=res.get('_id'))
        options.append(entity)
    return {
        'results': options
    }
Example #20
def pending():
    q = db.session.query(Entity)
    skip_entities = request.args.getlist('skip')
    if len(skip_entities):
        q = q.filter(not_(Entity.id.in_(skip_entities)))
    q = q.filter(Entity.state == Entity.STATE_PENDING)
    clause = Collection.id.in_(authz.collections(authz.READ))
    q = q.filter(Entity.collections.any(clause))
    # this was too slow to actually work:
    # ref = aliased(Reference)
    # q = q.join(ref)
    # q = q.group_by(Entity)
    # q = q.order_by(func.count(ref.id).desc())
    q = q.order_by(func.random())
    q = q.limit(30)
    entities = []
    for entity in q.all():
        data = entity.to_dict()
        data['name_latin'] = latinize_text(entity.name, lowercase=False)
        entities.append(data)
    return jsonify({'results': entities, 'total': len(entities)})
Example #21
def normalize_value(self, value):
    value = collapse_spaces(value)
    return value, latinize_text(value)
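collapse_spaces also comes from elsewhere; a typical implementation, assumed here to behave like the normality package's helper of the same name, folds runs of whitespace into single spaces:

import re

def collapse_spaces(text):
    # Fold any run of whitespace into one space and trim the ends.
    if text is None:
        return None
    return re.sub(r'\s+', ' ', text).strip()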
Example #22
def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    properties = data.get('properties', {})

    text = set()
    for vs in properties.values():
        for v in ensure_list(vs):
            v = string_value(v)
            if v is None or len(v) < 2:
                continue
            v = v.strip()
            text.add(v)
            v = latinize_text(v)
            text.add(v)
            # v = category_replace(v)
            # text.add(v)

    data['text'] = list(text)
    data['fingerprints'] = data.get('fingerprints', [])

    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if not len(values):
            continue

        # Find and set the name property
        if prop.is_label:
            data['name'] = values[0]

        # Generate key material
        # TODO: this should probably be record-based.
        data['fingerprints'].extend(prop.type.fingerprint(values))

        # Add inverted properties. This takes all the properties
        # of a specific type (names, dates, emails etc.)
        invert = prop.type.index_invert
        if invert:
            if invert not in data:
                data[invert] = []
            for norm in prop.type.normalize(values):
                if norm not in data[invert]:
                    data[invert].append(norm)

    data['fingerprints'] = list(set(data['fingerprints']))

    # Add latinised names
    names = data.get('names', [])
    for name in list(names):
        names.append(latinize_text(name))
    data['names'] = list(set(names))

    # Get implied schemata (i.e. parents of the actual schema)
    data['schema'] = schema.name
    data['schemata'] = []
    for parent in schema.schemata:
        if not parent.hidden:
            data['schemata'].append(parent.name)

    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')
    return data
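ensure_list is another helper not shown here; judging from its use above, it presumably normalises None and scalars into lists, along these lines (an assumption):

def ensure_list(value):
    # Presumed behaviour: None -> [], scalar -> [scalar], sequence -> list.
    if value is None:
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]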