Example #1
def index_entity(entity):
    """Index an entity."""
    if entity.deleted_at is not None:
        return delete_entity(entity.id)

    data = {
        'foreign_ids': entity.foreign_ids,
        'data': entity.data,
        'created_at': entity.created_at,
        'updated_at': entity.updated_at,
        'bulk': False,
        'roles': entity.collection.roles,
        'collection_id': entity.collection_id,
        'properties': {
            'name': [entity.name]
        }
    }

    for k, v in entity.data.items():
        data['properties'][k] = ensure_list(v)

    # data['$documents'] = get_count(entity)
    data = finalize_index(data, entity.schema)
    es.index(index=entity_index(),
             doc_type=entity_type(),
             id=entity.id,
             body=data)
    data['id'] = entity.id
    return data
Example #2
    def __init__(self, request, query, parser=None, schema=None):
        super(MatchQueryResult, self).__init__(request,
                                               query,
                                               parser=parser,
                                               schema=schema)
        ids = set()
        for match in self.results:
            ids.add(match.match_id)
            ids.add(match.entity_id)
        ids = {'ids': list(ids)}

        result = es.mget(index=entities_index(),
                         doc_type=entity_type(),
                         body=ids)
        for doc in result.get('docs', []):
            entity = unpack_result(doc)
            if entity is None:
                continue
            for match in self.results:
                if match.match_id == entity['id']:
                    match.match = entity
                if match.entity_id == entity['id']:
                    match.entity = entity

        # Do not return results if the entity has been removed in the
        # meantime. Not sure this is the ideal way of doing this, as it'll
        # mess with pagination counts etc.
        for match in list(self.results):
            if not hasattr(match, 'match') or not hasattr(match, 'entity'):
                self.results.remove(match)
Example #3
def get_instance_stats(authz):
    """Count the entities visible to the given authz roles, broken down by schema."""
    query = {
        'size': 0,
        'query': {
            'terms': {
                'roles': list(authz.roles)
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            }
        }
    }
    result = es.search(index=entities_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'count': result.get('hits').get('total'), 'schemata': {}}
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['schemata'][key] = schema.get('doc_count')

    return data
Example #4
def _index_updates(collection, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index cumulative: if an entity or link
    gets indexed twice with different field values, the values are merged
    into a single record instead of the second write overwriting the
    document and losing field values. An alternative solution would be to
    implement this as a script (e.g. Groovy) on the Elasticsearch side.
    """
    common = {
        'collection_id': collection.id,
        'bulk': True,
        'roles': collection.roles,
        'updated_at': datetime.utcnow()
    }
    if not len(entities):
        return

    result = es.mget(index=entities_index(),
                     doc_type=entity_type(),
                     body={'ids': list(entities.keys())},
                     _source=['schema', 'properties', 'created_at'])
    for doc in result.get('docs', []):
        if not doc.get('found', False):
            continue
        entity_id = doc['_id']
        entity = entities.get(entity_id)
        existing = doc.get('_source')
        combined = model.merge(existing, entity)
        combined['created_at'] = existing.get('created_at')
        entities[entity_id] = combined

    for doc_id, entity in entities.items():
        entity.pop('id', None)
        entity.pop('data', None)
        entity.update(common)
        if 'created_at' not in entity:
            entity['created_at'] = entity.get('updated_at')
        schema = model.get(entity.get('schema'))
        entity = finalize_index(entity, schema)
        # pprint(entity)
        yield {
            '_id': doc_id,
            '_index': entity_index(),
            '_type': entity_type(),
            '_source': entity
        }
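
The _index_updates generator above only yields bulk-update actions; a caller still has to stream them into Elasticsearch. Below is a minimal sketch of such a consumer, assuming the same es client and entities mapping used in the snippet (the bulk_entities helper name is illustrative, not taken from these examples):

from elasticsearch.helpers import bulk

def bulk_entities(collection, entities):
    # Hypothetical wrapper: feed the update actions yielded by
    # _index_updates() into the Elasticsearch bulk helper in chunks.
    actions = _index_updates(collection, entities)
    bulk(es, actions, chunk_size=1000, stats_only=True)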
Example #5
def get_document(document_id):
    """Fetch a document from the index."""
    result = es.get(index=entities_index(),
                    doc_type=entity_type(),
                    id=document_id,
                    ignore=[404])
    document = unpack_result(result)
    if document is not None:
        document.pop('text', None)
    return document
Example #6
def get_entity(entity_id):
    """Fetch an entity from the index."""
    result = es.get(index=entities_index(),
                    doc_type=entity_type(),
                    id=entity_id,
                    ignore=[404])
    entity = unpack_result(result)
    if entity is not None:
        entity.pop('text', None)
    return entity
Example #7
def upgrade_search():
    """Add any missing properties to the index mappings."""
    INDEXES = [
        (collection_index(), collection_type(), COLLECTION_MAPPING),
        (entity_index(), entity_type(), ENTITY_MAPPING),
        (record_index(), record_type(), RECORD_MAPPING),
    ]
    for (index, doc_type, mapping) in INDEXES:
        log.info("Creating index: %s (%s)", index, doc_type)
        es.indices.create(index, ignore=[404, 400])
        es.indices.put_mapping(index=index, doc_type=doc_type, body=mapping)
        es.indices.open(index=index, ignore=[400, 404])
        es.indices.refresh(index=index)
Example #8
def get_collection_stats(collection_id):
    """Compute some statistics on the content of a collection."""
    query = {
        'size': 0,
        'query': {
            'term': {
                'collection_id': collection_id
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            },
            'countries': {
                'terms': {
                    'field': 'countries',
                    'size': 250
                }
            },
            'languages': {
                'terms': {
                    'field': 'languages',
                    'size': 100
                }
            },
        }
    }
    result = es.search(index=entity_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'schemata': {}, 'count': result['hits']['total']}

    # expose entities by schema count.
    for schema in aggregations['schema']['buckets']:
        data['schemata'][schema['key']] = schema['doc_count']

    # if no countries or langs are given, take the most common from the data.
    if not len(data.get('countries', [])):
        countries = aggregations['countries']['buckets']
        data['countries'] = [c['key'] for c in countries]

    if not len(data.get('languages', [])):
        languages = aggregations['languages']['buckets']
        data['languages'] = [lang['key'] for lang in languages]

    # pprint(data)
    return data
Example #9
def update_roles(collection):
    """Update the role visibility of objects which are part of collections."""
    roles = ', '.join([str(r) for r in collection.roles])
    body = {
        'query': {
            'term': {
                'collection_id': collection.id
            }
        },
        'script': {
            'inline': 'ctx._source.roles = [%s]' % roles
        }
    }
    es.update_by_query(index=entity_index(),
                       doc_type=entity_type(),
                       body=body,
                       wait_for_completion=False)
Example #10
def xref_collection(collection, other=None):
    """Cross-reference all the entities and documents in a collection."""
    log.info("Cross-reference collection: %r", collection)
    other_id = other.id if other is not None else None
    query = {
        'query': {
            'term': {
                'collection_id': collection.id
            }
        },
        '_source': FIELDS_XREF
    }
    scanner = scan(es,
                   index=entities_index(),
                   doc_type=entity_type(),
                   query=query,
                   scroll='15m',
                   size=1000)

    for res in scanner:
        # xref_item.delay(unpack_result(res), other_id)
        xref_item(unpack_result(res), other_id)
Example #11
def index_document(document):
    """Index a document's metadata, extracted text and tags."""
    if document.status == Document.STATUS_PENDING:
        return

    log.info("Index document [%s]: %s", document.id, document.title)
    schema = model.get(Document.SCHEMA)
    data = {
        'schema': schema.name,
        'schemata': schema.names,
        'collection_id': document.collection_id,
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        'name': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        'children': document.children.count(),
        'text': index_form(document.texts)
    }
    if document.parent_id is not None:
        data['parent'] = {
            'id': document.parent_id,
            'type': document.parent.type,
            'title': document.parent.title,
        }

    q = db.session.query(DocumentTag)
    q = q.filter(DocumentTag.document_id == document.id)
    for tag in q.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            log.warning("Cannot index document tag: %r", tag)
            continue
        if field not in data:
            data[field] = []
        data[field].append(tag.text)

    index_names(data)
    data = clean_dict(data)
    # pprint(data)
    es.index(index=entity_index(),
             doc_type=entity_type(),
             body=data,
             id=document.id)
    data['id'] = document.id
    return data
Example #12
def delete_document(document_id):
    clear_records(document_id)
    es.delete(index=entities_index(),
              doc_type=entity_type(),
              id=document_id,
              ignore=[404])
Example #13
    def get_doc_type(self):
        return entity_type()
Example #14
def delete_entity(entity_id):
    """Delete an entity from the index."""
    es.delete(index=entities_index(),
              doc_type=entity_type(),
              id=entity_id,
              ignore=[404])