Example #1
def index_entity(entity):
    """Index an entity."""
    if entity.deleted_at is not None:
        return delete_entity(entity.id)

    data = {
        'foreign_ids': entity.foreign_ids,
        'data': entity.data,
        'created_at': entity.created_at,
        'updated_at': entity.updated_at,
        'bulk': False,
        'roles': entity.collection.roles,
        'collection_id': entity.collection_id,
        'properties': {
            'name': [entity.name]
        }
    }

    for k, v in entity.data.items():
        data['properties'][k] = ensure_list(v)

    # data['$documents'] = get_count(entity)
    data = finalize_index(data, entity.schema)
    es.index(index=entity_index(),
             doc_type=entity_type(),
             id=entity.id,
             body=data)
    data['id'] = entity.id
    return data
Example #2
    def __init__(self, request, query, parser=None, schema=None):
        super(MatchQueryResult, self).__init__(request,
                                               query,
                                               parser=parser,
                                               schema=schema)
        ids = set()
        for match in self.results:
            ids.add(match.match_id)
            ids.add(match.entity_id)
        ids = {'ids': list(ids)}

        result = es.mget(index=entities_index(),
                         doc_type=entity_type(),
                         body=ids)
        for doc in result.get('docs', []):
            entity = unpack_result(doc)
            if entity is None:
                continue
            for match in self.results:
                if match.match_id == entity['id']:
                    match.match = entity
                if match.entity_id == entity['id']:
                    match.entity = entity

        # Do not return results if the entity has been removed in the
        # meantime. Not sure this is the ideal way of doing this, as it'll
        # mess with pagination counts etc.
        for match in list(self.results):
            if not hasattr(match, 'match') or not hasattr(match, 'entity'):
                self.results.remove(match)
Example #3
def get_instance_stats(authz):
    """Count the entities visible to the given authz roles, broken down by schema."""
    query = {
        'size': 0,
        'query': {
            'terms': {
                'roles': list(authz.roles)
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            }
        }
    }
    result = es.search(index=entities_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'count': result.get('hits').get('total'), 'schemata': {}}
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['schemata'][key] = schema.get('doc_count')

    return data
Example #4
def _index_updates(collection, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index cumulative: if an entity or link
    gets indexed twice with different field values, the values are merged
    into a single record instead of the second write overwriting the
    document and losing field values. An alternative solution would be to
    implement this as a script (e.g. Groovy) on the Elasticsearch side.
    """
    common = {
        'collection_id': collection.id,
        'bulk': True,
        'roles': collection.roles,
        'updated_at': datetime.utcnow()
    }
    if not len(entities):
        return

    result = es.mget(index=entities_index(),
                     doc_type=entity_type(),
                     body={'ids': list(entities.keys())},
                     _source=['schema', 'properties', 'created_at'])
    for doc in result.get('docs', []):
        if not doc.get('found', False):
            continue
        entity_id = doc['_id']
        entity = entities.get(entity_id)
        existing = doc.get('_source')
        combined = model.merge(existing, entity)
        combined['created_at'] = existing.get('created_at')
        entities[entity_id] = combined

    for doc_id, entity in entities.items():
        entity.pop('id', None)
        entity.pop('data', None)
        entity.update(common)
        if 'created_at' not in entity:
            entity['created_at'] = entity.get('updated_at')
        schema = model.get(entity.get('schema'))
        entity = finalize_index(entity, schema)
        # pprint(entity)
        yield {
            '_id': doc_id,
            '_index': entity_index(),
            '_type': entity_type(),
            '_source': entity
        }
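
The _index_updates generator above only yields bulk-update actions; a caller still has to stream them into Elasticsearch. Below is a minimal sketch of such a consumer, assuming the same es client and entities mapping used in the snippet (the bulk_entities helper name is illustrative, not taken from these examples):

from elasticsearch.helpers import bulk

def bulk_entities(collection, entities):
    # Hypothetical wrapper: feed the update actions yielded by
    # _index_updates() into the Elasticsearch bulk helper in chunks.
    actions = _index_updates(collection, entities)
    bulk(es, actions, chunk_size=1000, stats_only=True)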
Example #5
def get_document(document_id):
    """Fetch a document from the index."""
    result = es.get(index=entities_index(),
                    doc_type=entity_type(),
                    id=document_id,
                    ignore=[404])
    document = unpack_result(result)
    if document is not None:
        document.pop('text', None)
    return document
Example #6
def get_entity(entity_id):
    """Fetch an entity from the index."""
    result = es.get(index=entities_index(),
                    doc_type=entity_type(),
                    id=entity_id,
                    ignore=[404])
    entity = unpack_result(result)
    if entity is not None:
        entity.pop('text', None)
    return entity
Example #7
def upgrade_search():
    """Add any missing properties to the index mappings."""
    INDEXES = [
        (collection_index(), collection_type(), COLLECTION_MAPPING),
        (entity_index(), entity_type(), ENTITY_MAPPING),
        (record_index(), record_type(), RECORD_MAPPING),
    ]
    for (index, doc_type, mapping) in INDEXES:
        log.info("Creating index: %s (%s)", index, doc_type)
        es.indices.create(index, ignore=[404, 400])
        es.indices.put_mapping(index=index, doc_type=doc_type, body=mapping)
        es.indices.open(index=index, ignore=[400, 404])
        es.indices.refresh(index=index)
Example #8
def get_collection_stats(collection_id):
    """Compute some statistics on the content of a collection."""
    query = {
        'size': 0,
        'query': {
            'term': {
                'collection_id': collection_id
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            },
            'countries': {
                'terms': {
                    'field': 'countries',
                    'size': 250
                }
            },
            'languages': {
                'terms': {
                    'field': 'languages',
                    'size': 100
                }
            },
        }
    }
    result = es.search(index=entity_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'schemata': {}, 'count': result['hits']['total']}

    # expose entities by schema count.
    for schema in aggregations['schema']['buckets']:
        data['schemata'][schema['key']] = schema['doc_count']

    # if no countries or langs are given, take the most common from the data.
    if not len(data.get('countries', [])):
        countries = aggregations['countries']['buckets']
        data['countries'] = [c['key'] for c in countries]

    if not len(data.get('languages', [])):
        languages = aggregations['languages']['buckets']
        data['languages'] = [lang['key'] for lang in languages]

    # pprint(data)
    return data
Example #9
def update_roles(collection):
    """Update the role visibility of objects which are part of collections."""
    roles = ', '.join([str(r) for r in collection.roles])
    body = {
        'query': {
            'term': {
                'collection_id': collection.id
            }
        },
        'script': {
            'inline': 'ctx._source.roles = [%s]' % roles
        }
    }
    es.update_by_query(index=entity_index(),
                       doc_type=entity_type(),
                       body=body,
                       wait_for_completion=False)
Example #10
def xref_collection(collection, other=None):
    """Cross-reference all the entities and documents in a collection."""
    log.info("Cross-reference collection: %r", collection)
    other_id = other.id if other is not None else None
    query = {
        'query': {
            'term': {
                'collection_id': collection.id
            }
        },
        '_source': FIELDS_XREF
    }
    scanner = scan(es,
                   index=entities_index(),
                   doc_type=entity_type(),
                   query=query,
                   scroll='15m',
                   size=1000)

    for res in scanner:
        # xref_item.delay(unpack_result(res), other_id)
        xref_item(unpack_result(res), other_id)
Example #11
def index_document(document):
    """Index a document's metadata, extracted text and tags."""
    if document.status == Document.STATUS_PENDING:
        return

    log.info("Index document [%s]: %s", document.id, document.title)
    schema = model.get(Document.SCHEMA)
    data = {
        'schema': schema.name,
        'schemata': schema.names,
        'collection_id': document.collection_id,
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        'name': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        'children': document.children.count(),
        'text': index_form(document.texts)
    }
    if document.parent_id is not None:
        data['parent'] = {
            'id': document.parent_id,
            'type': document.parent.type,
            'title': document.parent.title,
        }

    q = db.session.query(DocumentTag)
    q = q.filter(DocumentTag.document_id == document.id)
    for tag in q.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            log.warning("Cannot index document tag: %r", tag)
            continue
        if field not in data:
            data[field] = []
        data[field].append(tag.text)

    index_names(data)
    data = clean_dict(data)
    # pprint(data)
    es.index(index=entity_index(),
             doc_type=entity_type(),
             body=data,
             id=document.id)
    data['id'] = document.id
    return data
Example #12
def delete_document(document_id):
    clear_records(document_id)
    es.delete(index=entities_index(),
              doc_type=entity_type(),
              id=document_id,
              ignore=[404])
Example #13
    def get_doc_type(self):
        return entity_type()
Example #14
def delete_entity(entity_id):
    """Delete an entity from the index."""
    es.delete(index=entities_index(),
              doc_type=entity_type(),
              id=entity_id,
              ignore=[404])