Пример #1
0
def index_entity(entity):
    """Write one entity to the search index and return the indexed data.

    An entity whose ``deleted_at`` is set is removed from the index
    instead; in that case the result of ``delete_entity`` is returned.
    Otherwise the returned dict is the indexed body with the entity ``id``
    added to it.
    """
    if entity.deleted_at is not None:
        return delete_entity(entity.id)

    doc = {
        'foreign_ids': entity.foreign_ids,
        'data': entity.data,
        'created_at': entity.created_at,
        'updated_at': entity.updated_at,
        'bulk': False,
        'roles': entity.collection.roles,
        'collection_id': entity.collection_id,
        'properties': {'name': [entity.name]},
    }

    # Every key of the entity data becomes a property, passed through
    # ensure_list(); a ``name`` key in the data replaces the default above.
    properties = doc['properties']
    for key, value in entity.data.items():
        properties[key] = ensure_list(value)

    doc = finalize_index(doc, entity.schema)
    es.index(index=entity_index(),
             doc_type=entity_type(),
             id=entity.id,
             body=doc)
    # The id is added only after indexing, so it is not part of the body.
    doc['id'] = entity.id
    return doc
Пример #2
0
    def __init__(self, request, query, parser=None, schema=None):
        """Load matches and attach the indexed entities they refer to.

        After the parent initialiser has run, both sides of every match in
        ``self.results`` (``match_id`` and ``entity_id``) are fetched from
        the index with a single multi-get and stored on the match object as
        ``match`` and ``entity``. Matches for which either attribute is
        still missing afterwards are removed from ``self.results``.
        """
        super(MatchQueryResult, self).__init__(request,
                                               query,
                                               parser=parser,
                                               schema=schema)
        ids = set()
        for match in self.results:
            ids.add(match.match_id)
            ids.add(match.entity_id)
        if not ids:
            # No matches to resolve. Elasticsearch rejects a multi-get
            # request that names no documents, so skip the round trip.
            return

        result = es.mget(index=entities_index(),
                         doc_type=entity_type(),
                         body={'ids': list(ids)})

        # Index the fetched entities by id so that every match is resolved
        # with a lookup instead of re-scanning all results per document.
        entities = {}
        for doc in result.get('docs', []):
            entity = unpack_result(doc)
            if entity is not None:
                entities[entity['id']] = entity

        for match in self.results:
            if match.match_id in entities:
                match.match = entities[match.match_id]
            if match.entity_id in entities:
                match.entity = entities[match.entity_id]

        # Do not return results if the entity has been removed in the mean
        # time. Not sure this is the ideal way of doing this, as it'll mess
        # with pagination counts etc.
        for match in list(self.results):
            if not hasattr(match, 'match') or not hasattr(match, 'entity'):
                self.results.remove(match)
Пример #3
0
def get_instance_stats(authz):
    """Count indexed entities matching the roles of ``authz``.

    The search is restricted to documents whose ``roles`` field contains
    any of ``authz.roles``. Returns a dict with ``count`` (total number of
    hits) and ``schemata`` (bucket key -> document count from a terms
    aggregation on the ``schema`` field).
    """
    role_filter = {'terms': {'roles': list(authz.roles)}}
    schema_buckets = {'terms': {'field': 'schema', 'size': 1000}}
    body = {
        # Only totals and aggregations are read below, so no hits are
        # requested.
        'size': 0,
        'query': role_filter,
        'aggs': {'schema': schema_buckets},
    }
    response = es.search(index=entities_index(),
                         doc_type=entity_type(),
                         body=body)
    aggregations = response.get('aggregations')
    stats = {'count': response.get('hits').get('total'), 'schemata': {}}
    buckets = aggregations.get('schema').get('buckets')
    stats['schemata'].update(
        (bucket.get('key'), bucket.get('doc_count')) for bucket in buckets
    )
    return stats
Пример #4
0
def _index_updates(collection, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative, i.e. if an entity or link
    gets indexed twice with different field values, it'll add up the different
    field values into a single record. This is to avoid overwriting the
    document and losing field values. An alternative solution would be to
    implement this in Groovy on the ES.
    """
    common = {
        'collection_id': collection.id,
        'bulk': True,
        'roles': collection.roles,
        'updated_at': datetime.utcnow()
    }
    if not len(entities):
        return

    result = es.mget(index=entities_index(),
                     doc_type=entity_type(),
                     body={'ids': entities.keys()},
                     _source=['schema', 'properties', 'created_at'])
    for doc in result.get('docs', []):
        if not doc.get('found', False):
            continue
        entity_id = doc['_id']
        entity = entities.get(entity_id)
        existing = doc.get('_source')
        combined = model.merge(existing, entity)
        combined['created_at'] = existing.get('created_at')
        entities[entity_id] = combined

    for doc_id, entity in entities.items():
        entity.pop('id', None)
        entity.pop('data', None)
        entity.update(common)
        if 'created_at' not in entity:
            entity['created_at'] = entity.get('updated_at')
        schema = model.get(entity.get('schema'))
        entity = finalize_index(entity, schema)
        # pprint(entity)
        yield {
            '_id': doc_id,
            '_index': entity_index(),
            '_type': entity_type(),
            '_source': entity
        }
Пример #5
0
def get_document(document_id):
    """Look up a document in the index by id.

    A 404 response is ignored rather than raised. The response is passed
    through ``unpack_result``; if that yields a document, its ``text`` key
    is dropped before it is returned, otherwise ``None`` is returned.
    """
    response = es.get(index=entities_index(),
                      doc_type=entity_type(),
                      id=document_id,
                      ignore=[404])
    found = unpack_result(response)
    if found is None:
        return found
    found.pop('text', None)
    return found
Пример #6
0
def get_entity(entity_id):
    """Look up an entity in the index by id.

    A 404 response is ignored rather than raised. The response is passed
    through ``unpack_result``; if that yields an entity, its ``text`` key
    is dropped before it is returned, otherwise ``None`` is returned.
    """
    response = es.get(index=entities_index(),
                      doc_type=entity_type(),
                      id=entity_id,
                      ignore=[404])
    found = unpack_result(response)
    if found is None:
        return found
    found.pop('text', None)
    return found
Пример #7
0
def upgrade_search():
    """Create the search indexes and apply their mappings.

    For the collection, entity and record indexes in turn: create the
    index, put the mapping for its document type, open the index and
    refresh it.
    """
    targets = (
        (collection_index(), collection_type(), COLLECTION_MAPPING),
        (entity_index(), entity_type(), ENTITY_MAPPING),
        (record_index(), record_type(), RECORD_MAPPING),
    )
    for index, doc_type, mapping in targets:
        log.info("Creating index: %s (%s)", index, doc_type)
        indices = es.indices
        # 400/404 responses are ignored here, presumably so that an index
        # which already exists does not abort the upgrade.
        indices.create(index, ignore=[404, 400])
        indices.put_mapping(index=index, doc_type=doc_type, body=mapping)
        indices.open(index=index, ignore=[400, 404])
        indices.refresh(index=index)
Пример #8
0
def get_collection_stats(collection_id):
    """Compute some statistics on the content of a collection.

    Runs one aggregation-only search over the documents whose
    ``collection_id`` equals the given id and returns a dict with:

    * ``schemata``: schema key -> number of documents,
    * ``count``: total number of hits,
    * ``countries``: keys of the ``countries`` terms buckets (up to 250),
    * ``languages``: keys of the ``languages`` terms buckets (up to 100).
    """
    query = {
        # Only totals and aggregations are read, so no hits are requested.
        'size': 0,
        'query': {
            'term': {
                'collection_id': collection_id
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            },
            'countries': {
                'terms': {
                    'field': 'countries',
                    'size': 250
                }
            },
            'languages': {
                'terms': {
                    'field': 'languages',
                    'size': 100
                }
            },
        }
    }
    result = es.search(index=entity_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'schemata': {}, 'count': result['hits']['total']}

    # expose entities by schema count.
    for bucket in aggregations['schema']['buckets']:
        data['schemata'][bucket['key']] = bucket['doc_count']

    # Countries and languages are always taken from the aggregation
    # buckets: ``data`` is built above without these keys, so there is
    # never a pre-supplied value to prefer.
    country_buckets = aggregations['countries']['buckets']
    data['countries'] = [bucket['key'] for bucket in country_buckets]

    language_buckets = aggregations['languages']['buckets']
    data['languages'] = [bucket['key'] for bucket in language_buckets]

    return data
Пример #9
0
def update_roles(collection):
    """Rewrite the ``roles`` field of all indexed objects of a collection.

    Issues an update-by-query over the documents whose ``collection_id``
    equals ``collection.id``, with a script that assigns the collection's
    current roles. The request is sent with ``wait_for_completion=False``,
    so this function does not wait for the update to finish.
    """
    # The roles are rendered straight into the script source as a
    # comma-separated list literal.
    script = 'ctx._source.roles = [%s]' % ', '.join(map(str, collection.roles))
    selector = {'term': {'collection_id': collection.id}}
    es.update_by_query(index=entity_index(),
                       doc_type=entity_type(),
                       body={'query': selector, 'script': {'inline': script}},
                       wait_for_completion=False)
Пример #10
0
def xref_collection(collection, other=None):
    """Cross-reference every indexed item of a collection.

    Scans all index documents whose ``collection_id`` equals
    ``collection.id`` and calls ``xref_item`` on each unpacked result.
    When ``other`` is given, its ``id`` is passed along to ``xref_item``;
    otherwise ``None`` is passed.
    """
    log.info("Cross-reference collection: %r", collection)
    other_id = None if other is None else other.id
    scan_query = {
        'query': {'term': {'collection_id': collection.id}},
        '_source': FIELDS_XREF,
    }
    hits = scan(es,
                index=entities_index(),
                doc_type=entity_type(),
                query=scan_query,
                scroll='15m',
                size=1000)

    # Items are processed inline, one at a time, as they are scanned.
    for hit in hits:
        xref_item(unpack_result(hit), other_id)
Пример #11
0
def index_document(document):
    """Index a document and return the indexed data.

    Documents whose status is ``Document.STATUS_PENDING`` are skipped and
    ``None`` is returned. For all others, the document metadata, a short
    description of its parent (if any) and its tags are assembled into one
    dict, which is indexed under the document id. The returned dict is the
    indexed body with ``id`` added.
    """
    if document.status == Document.STATUS_PENDING:
        return

    log.info("Index document [%s]: %s", document.id, document.title)
    schema = model.get(Document.SCHEMA)
    doc = {
        'schema': schema.name,
        'schemata': schema.names,
        'collection_id': document.collection_id,
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        'name': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        'children': document.children.count(),
        'text': index_form(document.texts),
    }

    parent_id = document.parent_id
    if parent_id is not None:
        parent = document.parent
        doc['parent'] = {
            'id': parent_id,
            'type': parent.type,
            'title': parent.title,
        }

    # Collect tag texts under the index field mapped to each tag type;
    # tags whose type has no mapping in TAG_FIELDS are logged and skipped.
    tags = db.session.query(DocumentTag).filter(
        DocumentTag.document_id == document.id)
    for tag in tags.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            log.warning("Cannot index document tag: %r", tag)
            continue
        doc.setdefault(field, []).append(tag.text)

    index_names(doc)
    doc = clean_dict(doc)
    es.index(index=entity_index(),
             doc_type=entity_type(),
             body=doc,
             id=document.id)
    # The id is added only after indexing, so it is not part of the body.
    doc['id'] = document.id
    return doc
Пример #12
0
def delete_document(document_id):
    """Clear a document's records, then delete it from the index.

    A 404 response from the delete call is ignored rather than raised.
    """
    clear_records(document_id)
    index = entities_index()
    doc_type = entity_type()
    es.delete(index=index, doc_type=doc_type, id=document_id, ignore=[404])
Пример #13
0
 def get_doc_type(self):
     """Return the document type name given by ``entity_type()``."""
     doc_type = entity_type()
     return doc_type
Пример #14
0
def delete_entity(entity_id):
    """Remove an entity from the index by id.

    A 404 response from the delete call is ignored rather than raised.
    """
    index = entities_index()
    doc_type = entity_type()
    es.delete(index=index, doc_type=doc_type, id=entity_id, ignore=[404])