Python entity_type примеры, aleph.index.core.entity_type Python примеры использования

Пример #1

0

Показать файл

Файл: entities.py Проект: GelLiNN/aleph

def index_entity(entity):
    """Write a single entity to the search index.

    If ``entity.deleted_at`` is set, the entity is removed from the index
    instead and the result of ``delete_entity`` is returned. Otherwise the
    indexed document body is returned, with the entity ``id`` added to it.
    """
    if entity.deleted_at is not None:
        return delete_entity(entity.id)

    body = {
        'foreign_ids': entity.foreign_ids,
        'data': entity.data,
        'created_at': entity.created_at,
        'updated_at': entity.updated_at,
        'bulk': False,
        'roles': entity.collection.roles,
        'collection_id': entity.collection_id,
        'properties': {'name': [entity.name]},
    }

    # Each raw data field becomes a list-valued property. A ``name`` key in
    # the data overrides the default name property set above.
    properties = body['properties']
    for key, value in entity.data.items():
        properties[key] = ensure_list(value)

    body = finalize_index(body, entity.schema)
    es.index(
        index=entity_index(),
        doc_type=entity_type(),
        id=entity.id,
        body=body,
    )
    # The id is added only after indexing, so it is not part of the stored
    # document body.
    body['id'] = entity.id
    return body

Пример #2

0

Показать файл

Файл: __init__.py Проект: GelLiNN/aleph

    def __init__(self, request, query, parser=None, schema=None):
        """Build the result set and attach indexed entities to each match.

        After the parent initialiser has populated ``self.results``, the ids
        referenced by every match (``match_id`` and ``entity_id``) are
        fetched from the index in one ``mget`` call and stored on the match
        objects as ``match.match`` and ``match.entity``.
        """
        super(MatchQueryResult, self).__init__(request, query,
                                               parser=parser, schema=schema)
        wanted = set()
        for match in self.results:
            wanted.update((match.match_id, match.entity_id))

        response = es.mget(index=entities_index(),
                           doc_type=entity_type(),
                           body={'ids': list(wanted)})
        for doc in response.get('docs', []):
            entity = unpack_result(doc)
            if entity is None:
                continue
            # One entity can be the match side of one result and the entity
            # side of another, so both ids are checked on every result.
            for match in self.results:
                if match.match_id == entity['id']:
                    match.match = entity
                if match.entity_id == entity['id']:
                    match.entity = entity

        # Do not return results if the entity has been removed in the mean
        # time. Not sure this is the ideal way of doing this, as it'll mess
        # with pagination counts etc.
        for match in list(self.results):
            if not (hasattr(match, 'match') and hasattr(match, 'entity')):
                self.results.remove(match)

Пример #3

0

Показать файл

Файл: stats.py Проект: GelLiNN/aleph

def get_instance_stats(authz):
    """Count the indexed items matching the roles in ``authz.roles``.

    Returns a dict with the total hit ``count`` and a ``schemata`` mapping
    of schema key to document count, taken from a terms aggregation on the
    ``schema`` field (limited to 1000 buckets).
    """
    query = {
        'size': 0,
        'query': {'terms': {'roles': list(authz.roles)}},
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}},
        },
    }
    result = es.search(index=entities_index(),
                       doc_type=entity_type(),
                       body=query)
    total = result.get('hits').get('total')
    buckets = result.get('aggregations').get('schema').get('buckets')
    schemata = {
        bucket.get('key'): bucket.get('doc_count') for bucket in buckets
    }
    return {'count': total, 'schemata': schemata}

Пример #4

0

Показать файл

Файл: entities.py Проект: GelLiNN/aleph

def _index_updates(collection, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative, i.e. if an entity or link
    gets indexed twice with different field values, it'll add up the different
    field values into a single record. This is to avoid overwriting the
    document and losing field values. An alternative solution would be to
    implement this in Groovy on the ES.

    ``entities`` is a dict mapping document id to the entity dict to index;
    it is updated in place with the merged form of any entity that already
    exists in the index. Yields one bulk action dict (``_id``, ``_index``,
    ``_type``, ``_source``) per entity. Nothing is yielded for empty input.
    """
    if not entities:
        return

    # Fields shared by every document written in this bulk run.
    common = {
        'collection_id': collection.id,
        'bulk': True,
        'roles': collection.roles,
        'updated_at': datetime.utcnow()
    }

    # The ids must be a real list: on Python 3 ``dict.keys()`` is a view
    # object, which cannot be serialised into the JSON request body.
    result = es.mget(index=entities_index(),
                     doc_type=entity_type(),
                     body={'ids': list(entities)},
                     _source=['schema', 'properties', 'created_at'])
    for doc in result.get('docs', []):
        if not doc.get('found', False):
            continue
        entity_id = doc['_id']
        existing = doc.get('_source')
        combined = model.merge(existing, entities.get(entity_id))
        # Keep the original creation time of the already-indexed document.
        combined['created_at'] = existing.get('created_at')
        entities[entity_id] = combined

    for doc_id, entity in entities.items():
        # ``id`` is carried in the action's ``_id`` and ``data`` is not
        # indexed for bulk entities.
        entity.pop('id', None)
        entity.pop('data', None)
        entity.update(common)
        if 'created_at' not in entity:
            # First time this entity is indexed.
            entity['created_at'] = entity.get('updated_at')
        schema = model.get(entity.get('schema'))
        entity = finalize_index(entity, schema)
        yield {
            '_id': doc_id,
            '_index': entity_index(),
            '_type': entity_type(),
            '_source': entity
        }

Пример #5

0

Показать файл

Файл: documents.py Проект: GelLiNN/aleph

def get_document(document_id):
    """Fetch a document from the index.

    Returns the unpacked document with its ``text`` field removed, or
    ``None`` when ``unpack_result`` yields nothing for the response. The
    lookup is made with ``ignore=[404]``.
    """
    response = es.get(
        index=entities_index(),
        doc_type=entity_type(),
        id=document_id,
        ignore=[404],
    )
    document = unpack_result(response)
    if document is None:
        return None
    document.pop('text', None)
    return document

Пример #6

0

Показать файл

Файл: entities.py Проект: GelLiNN/aleph

def get_entity(entity_id):
    """Fetch an entity from the index.

    Returns the unpacked entity with its ``text`` field removed, or ``None``
    when ``unpack_result`` yields nothing for the response. The lookup is
    made with ``ignore=[404]``.
    """
    response = es.get(
        index=entities_index(),
        doc_type=entity_type(),
        id=entity_id,
        ignore=[404],
    )
    entity = unpack_result(response)
    if entity is None:
        return None
    entity.pop('text', None)
    return entity

Пример #7

0

Показать файл

def upgrade_search():
    """Add any missing properties to the index mappings.

    For the collection, entity and record indexes in turn: create the index,
    put its mapping, open it and refresh it.
    """
    targets = (
        (collection_index(), collection_type(), COLLECTION_MAPPING),
        (entity_index(), entity_type(), ENTITY_MAPPING),
        (record_index(), record_type(), RECORD_MAPPING),
    )
    for index, doc_type, mapping in targets:
        log.info("Creating index: %s (%s)", index, doc_type)
        # NOTE(review): 400/404 are ignored on create/open, presumably so an
        # index that already exists is not treated as an error -- confirm.
        es.indices.create(index, ignore=[404, 400])
        es.indices.put_mapping(index=index, doc_type=doc_type, body=mapping)
        es.indices.open(index=index, ignore=[400, 404])
        es.indices.refresh(index=index)

Пример #8

0

Показать файл

Файл: stats.py Проект: GelLiNN/aleph

def get_collection_stats(collection_id):
    """Compute some statistics on the content of a collection.

    Runs a zero-hit search restricted to ``collection_id`` with terms
    aggregations on ``schema``, ``countries`` and ``languages``.

    Returns a dict with:
      * ``count``: total number of hits,
      * ``schemata``: mapping of schema key to document count,
      * ``countries`` / ``languages``: the aggregation bucket keys, in the
        order the index returned them.
    """
    query = {
        'size': 0,
        'query': {'term': {'collection_id': collection_id}},
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}},
            'countries': {'terms': {'field': 'countries', 'size': 250}},
            'languages': {'terms': {'field': 'languages', 'size': 100}},
        },
    }
    result = es.search(index=entity_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'schemata': {}, 'count': result['hits']['total']}

    # Expose entities by schema count.
    for bucket in aggregations['schema']['buckets']:
        data['schemata'][bucket['key']] = bucket['doc_count']

    # ``data`` never holds pre-set countries or languages at this point, so
    # both lists always come straight from the aggregation buckets.
    country_buckets = aggregations['countries']['buckets']
    data['countries'] = [bucket['key'] for bucket in country_buckets]

    language_buckets = aggregations['languages']['buckets']
    data['languages'] = [bucket['key'] for bucket in language_buckets]

    return data

Пример #9

0

Показать файл

Файл: collections.py Проект: GelLiNN/aleph

def update_roles(collection):
    """Update the role visibility of objects which are part of collections.

    Issues an update-by-query that overwrites ``roles`` on every indexed
    document whose ``collection_id`` matches ``collection.id``. The call is
    made with ``wait_for_completion=False``.
    """
    role_list = ', '.join(str(role) for role in collection.roles)
    # NOTE(review): the role values are interpolated directly into the
    # script source; passing them as script parameters would be safer if the
    # values are ever not plain numbers -- confirm against the ES version.
    body = {
        'query': {'term': {'collection_id': collection.id}},
        'script': {'inline': 'ctx._source.roles = [%s]' % role_list},
    }
    es.update_by_query(
        index=entity_index(),
        doc_type=entity_type(),
        body=body,
        wait_for_completion=False,
    )

Пример #10

0

Показать файл

Файл: xref.py Проект: GelLiNN/aleph

def xref_collection(collection, other=None):
    """Cross-reference all the entities and documents in a collection.

    Scans every indexed item of ``collection`` (fetching only the fields in
    ``FIELDS_XREF``) and calls ``xref_item`` on each one. When ``other`` is
    given its id is passed to ``xref_item``; otherwise ``None`` is passed.
    """
    log.info("Cross-reference collection: %r", collection)
    other_id = None if other is None else other.id
    query = {
        'query': {'term': {'collection_id': collection.id}},
        '_source': FIELDS_XREF,
    }
    hits = scan(
        es,
        index=entities_index(),
        doc_type=entity_type(),
        query=query,
        scroll='15m',
        size=1000,
    )
    # Items are processed inline, one at a time, as the scan yields them.
    for hit in hits:
        xref_item(unpack_result(hit), other_id)

Пример #11

0

Показать файл

Файл: documents.py Проект: GelLiNN/aleph

def index_document(document):
    """Write a document, with its metadata, text and tags, to the index.

    Documents whose status is ``Document.STATUS_PENDING`` are skipped and
    ``None`` is returned. Otherwise returns the indexed body with the
    document ``id`` added.
    """
    if document.status == Document.STATUS_PENDING:
        return

    log.info("Index document [%s]: %s", document.id, document.title)
    schema = model.get(Document.SCHEMA)
    data = {
        'schema': schema.name,
        'schemata': schema.names,
        'collection_id': document.collection_id,
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        'name': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        'children': document.children.count(),
        'text': index_form(document.texts),
    }

    # Child documents carry a short summary of their parent.
    if document.parent_id is not None:
        data['parent'] = {
            'id': document.parent_id,
            'type': document.parent.type,
            'title': document.parent.title,
        }

    # Append every tag's text to the index field mapped to its tag type;
    # tags whose type has no mapping in TAG_FIELDS are logged and skipped.
    tag_query = db.session.query(DocumentTag).filter(
        DocumentTag.document_id == document.id)
    for tag in tag_query.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            log.warning("Cannot index document tag: %r", tag)
            continue
        data.setdefault(field, []).append(tag.text)

    index_names(data)
    data = clean_dict(data)
    es.index(
        index=entity_index(),
        doc_type=entity_type(),
        body=data,
        id=document.id,
    )
    # The id is added only after indexing, so it is not part of the stored
    # document body.
    data['id'] = document.id
    return data

Пример #12

0

Показать файл

Файл: documents.py Проект: GelLiNN/aleph

def delete_document(document_id):
    """Clear a document's records, then delete it from the index.

    The delete is issued with ``ignore=[404]``.
    """
    clear_records(document_id)
    es.delete(
        index=entities_index(),
        doc_type=entity_type(),
        id=document_id,
        ignore=[404],
    )

Пример #13

0

Показать файл

Файл: __init__.py Проект: GelLiNN/aleph

 def get_doc_type(self):
     """Return the document type given by ``entity_type()``."""
     return entity_type()

Пример #14

0

Показать файл

Файл: entities.py Проект: GelLiNN/aleph

def delete_entity(entity_id):
    """Delete an entity from the index.

    The delete is issued with ``ignore=[404]``.
    """
    es.delete(
        index=entities_index(),
        doc_type=entity_type(),
        id=entity_id,
        ignore=[404],
    )

Python entity_type примеры использования