Example #1
 def test_load_pdf_file(self):
     pdf_path = self.get_fixture_path('demo.pdf')
     document = Document.by_keys(collection_id=self.collection.id,
                                 foreign_id='demo.pdf')
     db.session.commit()
     db.session.refresh(document)
     ingest_document(document, pdf_path)
     assert Document.all().count() == 1, Document.all().count()
Example #2
 def test_load_sample_directory(self):
     samples_path = self.get_fixture_path('samples')
     document = Document.by_keys(collection_id=self.collection.id,
                                 foreign_id='samples')
     db.session.commit()
     db.session.refresh(document)
     ingest_document(document, samples_path)
     assert Document.all().count() == 5, Document.all().count()
Example #3
File: crawler.py Project: 01-/aleph
 def foreign_id_exists(self, source, foreign_id):
     q = Document.all_ids().filter(Document.source_id == source.id)
     q = q.filter(Document.foreign_id == foreign_id)
     exists = q.first() is not None
     if exists:
         log.info("Foreign ID exists (%s): %s", source, foreign_id)
     return exists
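A minimal sketch of how this check keeps a crawl incremental; fetch_listing and emit_document are hypothetical helpers, not part of the snippet above.

 def crawl(self, source):
     for item in fetch_listing():
         # Skip anything already imported for this source on a previous run.
         if self.foreign_id_exists(source, item['id']):
             continue
         self.emit_document(source, item)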
Example #4
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    try:
        log.info("Index document: %r", document)
        data = document.to_index_dict()
        data['entities'] = generate_entities(document)
        data['title_latin'] = latinize_text(data.get('title'))
        data['summary_latin'] = latinize_text(data.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                       id=document.id)

        clear_children(document)
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)

        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)
Example #5
def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)

    if document.collection.casefile:
        # Make sure collection counts are always accurate.
        update_document(document, sync=sync)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
Example #6
def get_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        raise NotFound()
    readable = [c for c in document.collection_ids if authz.collection_read(c)]
    authz.require(len(readable))
    return document
Example #7
def index_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data,
             id=document.id)
    clear_children(document)

    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)

        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
Example #8
def index():
    sources_ids = match_ids('sources', authz.sources(authz.READ))
    q = Document.all().filter(Document.source_id.in_(sources_ids))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
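A hedged client-side sketch of calling such a listing endpoint with repeated content_hash parameters; the URL and the placeholder hashes are assumptions, only the query parameter name comes from the code above.

import requests

# Repeated content_hash parameters narrow the listing to specific files.
resp = requests.get(
    'https://aleph.example.org/api/1/documents',   # URL is an assumption
    params=[('content_hash', 'HASH_A'), ('content_hash', 'HASH_B')],
)
print(resp.json())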
Example #9
 def create_document(self, meta, type=None):
     # Initialise so the "document is None" check below also works when
     # meta.content_hash is empty.
     document = None
     if meta.content_hash:
         q = Document.all()
         if meta.foreign_id:
             q = q.filter(Document.foreign_id == meta.foreign_id)
         else:
             q = q.filter(Document.content_hash == meta.content_hash)
         q = q.filter(Document.source_id == self.source_id)
         document = q.first()
     if document is None:
         document = Document()
         document.source_id = self.source_id
     document.meta = meta
     document.type = type or self.DOCUMENT_TYPE
     db.session.add(document)
     db.session.flush()
     return document
Example #10
 def load_fixtures(self, file_name, process_documents=True):
     filepath = self.get_fixture_path(file_name)
     load_fixtures(db, loaders.load(filepath))
     db.session.commit()
     if process_documents:
         for doc_id, in Document.all_ids():
             analyze_document(doc_id)
         optimize_search()
Example #11
File: util.py Project: 01-/aleph
 def load_fixtures(self, file_name, process_documents=True):
     filepath = os.path.abspath(os.path.join(FIXTURES, file_name))
     load_fixtures(db, loaders.load(filepath))
     db.session.commit()
     if process_documents:
         for doc_id, in Document.all_ids():
             analyze_document(doc_id)
         optimize_search()
Example #12
def load_documents():
    graph = get_graph()
    tx = graph.begin()
    for i, document in enumerate(Document.all()):
        load_document(tx, document)
        if i > 0 and i % 1000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()
Example #13
 def create_document(self, meta, type=None):
     # Initialise so the "document is None" check below also works when
     # meta.content_hash is empty.
     document = None
     if meta.content_hash:
         q = Document.all()
         if meta.foreign_id:
             q = q.filter(Document.foreign_id == meta.foreign_id)
         else:
             q = q.filter(Document.content_hash == meta.content_hash)
         clause = Collection.id == self.collection_id
         q = q.filter(Document.collections.any(clause))
         document = q.first()
     if document is None:
         document = Document()
         document.collections = [Collection.by_id(self.collection_id)]
     document.meta = meta
     document.type = type or self.DOCUMENT_TYPE
     db.session.add(document)
     db.session.flush()
     return document
Example #14
def index():
    collection_ids = match_ids('collection', authz.collections(authz.READ))
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
Example #15
def analyze_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        cls().analyze(document, document.meta)
    index_document(document_id)
Example #16
def process_documents(collection_id=None, failed_only=False):
    """Re-ingest or re-index all documents. Can be filtered to cover only
    documents which failed to properly import last time, or those which
    are part of a particular collection."""
    q = Document.find_ids(collection_id=collection_id,
                          failed_only=failed_only)
    for idx, (doc_id,) in enumerate(q.yield_per(5000), 1):
        ingest.apply_async([doc_id], {'refresh': True}, priority=1)
        if idx % 10000 == 0:
            log.info("Process: %s documents...", idx)
Example #17
def load_documents():
    graph = get_graph()
    tx = graph.begin()
    for i, document in enumerate(Document.all()):
        log.info("Load doc [%s]: %r", document.id, document.meta)
        load_document(tx, document)
        if i > 0 and i % 1000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()
Example #18
    def test_load_csv_file(self):
        csv_path = self.get_fixture_path('experts.csv')
        crawler = DirectoryCrawler()
        crawler.execute(directory=csv_path)
        assert Document.all().count() == 1, Document.all().count()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 14, len(records)
        rec0 = records[0]
        assert str(rec0.id) in repr(rec0), repr(rec0)
        assert 'experts.csv' in rec0.document.meta.file_name, \
            rec0.document.meta
        assert 'nationality' in rec0.data, rec0.data
        assert 'name' in rec0.data, rec0.data

        doc = rec0.document
        assert 'experts' in repr(doc)

        doc.delete_records()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 0, len(records)
Example #19
    def test_load_csv_file(self):
        csv_path = self.get_fixture_path('experts.csv')
        document = Document.by_keys(collection_id=self.collection.id,
                                    foreign_id='experts.csv')
        document.file_name = 'experts.csv'
        db.session.commit()
        db.session.refresh(document)
        ingest_document(document, csv_path)
        assert Document.all().count() == 1, Document.all().count()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 14, len(records)
        rec0 = records[0]
        assert str(rec0.id) in repr(rec0), repr(rec0)
        assert 'nationality' in rec0.data, rec0.data
        assert 'name' in rec0.data, rec0.data

        doc = rec0.document
        doc.delete_records()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 0, len(records)
Example #20
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    try:
        for cls in get_analyzers():
            cls().analyze(document, document.meta)
    except Exception as ex:
        log.exception(ex)
    index_document(document_id)
Example #21
def index(foreign_id=None):
    """Index documents in the given source (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        q = q.filter(Document.source_id == source.id)
    for doc_id, in q:
        index_document.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
Example #22
def generate_collection_docs(collection):
    q = Document.by_collection(collection.id)
    q = q.order_by(Document.id.asc())
    for idx, document in enumerate(q.yield_per(BULK_PAGE)):
        try:
            log.info("Index [%s]: %s", document.id, document.name)
            yield from generate_document(document)
        except Exception:
            log.exception("Cannot index [%s]: %s", document.id, document.name)

        if idx % 1000 == 0:
            db.session.expunge_all()
Example #23
def _load_parent(collection_id, meta):
    """Determine the parent document for the document that is to be
    ingested."""
    parent_id = meta.get('parent_id')
    if parent_id is None:
        return
    parent = Document.by_id(parent_id, collection_id=collection_id)
    if parent is None:
        raise BadRequest(response=jsonify({
            'status': 'error',
            'message': 'Cannot load parent document'
        }, status=400))
    return parent_id
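The helper reads the parent from the client-supplied ingest metadata. A minimal sketch of such a payload; the values are placeholders and file_name is an assumed companion field.

meta = {
    'file_name': 'report.pdf',   # assumed companion field
    'parent_id': 1234,           # id of an existing document in the same collection
}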
Example #24
def index(foreign_id=None):
    """Index documents in the given collection (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        clause = Collection.id == collection.id
        q = q.filter(Document.collections.any(clause))
    for doc_id, in q:
        index_document_id.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
Example #25
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                   id=document.id)

    clear_records(document)
    bulk_op(generate_records(document))
Example #26
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        try:
            cls().analyze(document, document.meta)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.ANALYZE, component=cls.__name__,
                              document_id=document.id, meta=document.meta,
                              source_id=document.source_id, exception=ex)
    index_document(document_id)
Example #27
def index():
    try:
        authorized = authz.collections(authz.READ)
        collection_ids = [int(f) for f in request.args.getlist('collection')]
        collection_ids = collection_ids or authorized
        collection_ids = [c for c in collection_ids if c in authorized]
    except ValueError:
        raise BadRequest()
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
Example #28
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                   id=document.id)

    clear_records(document)
    bulk(get_es(), generate_records(document), stats_only=True,
         chunk_size=2000, request_timeout=60.0)
Example #29
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.

    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
Example #30
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)
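The regex construction shared by the two variants above can be illustrated in isolation. A minimal, self-contained sketch with made-up terms and texts; nothing here comes from the project.

import re
from collections import defaultdict

terms = ['acme corp', 'acme holding']        # stand-ins for entity.regex_terms
rex = re.compile('( |^)(%s)( |$)' % '|'.join(terms))

documents = defaultdict(int)
sample = [(1, 'acme corp signed the deal'), (2, 'no mention here')]
for document_id, text in sample:
    for match in rex.finditer(text):
        documents[document_id] += 1

print(dict(documents))                       # {1: 1}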
Example #31
def ingest_upload(collection_id):
    """
    ---
    post:
      summary: Upload a document to a collection
      description: Upload a document to a collection with id `collection_id`
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  type: string
                  format: binary
                  description: The document to upload
                meta:
                  $ref: '#/components/schemas/DocumentIngest'
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                properties:
                  id:
                    description: id of the uploaded document
                    type: integer
                  status:
                    type: string
                type: object
      tags:
      - Ingest
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    index = get_flag('index', default=True)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 role_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy(ns=collection.ns)
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync and index:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
        _notify(collection, proxy.id)
        return jsonify({'status': 'ok', 'id': proxy.id}, status=201)
    finally:
        shutil.rmtree(upload_dir)
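A hedged client sketch for the upload endpoint documented in the docstring above. The host, URL path, auth header and the fields inside meta are assumptions; the multipart layout (a file part plus a JSON meta part) follows the OpenAPI description.

import json
import requests

url = 'https://aleph.example.org/api/2/collections/42/ingest'   # path is an assumption
meta = {'file_name': 'report.pdf'}                               # field names are assumptions

with open('report.pdf', 'rb') as fh:
    resp = requests.post(
        url,
        files={'file': ('report.pdf', fh)},
        data={'meta': json.dumps(meta)},
        headers={'Authorization': 'ApiKey YOUR_API_KEY'},        # auth scheme is an assumption
    )
print(resp.status_code, resp.json())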