Example #1
def crawldir(directory, language=None, country=None, foreign_id=None):
    """Crawl the given directory."""
    if directory is None or not os.path.exists(directory):
        log.error("Invalid directory: %r", directory)
        return
    directory = os.path.abspath(os.path.normpath(directory))

    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(directory)
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        collection = Collection.create({
            'foreign_id': foreign_id,
            'label': directory,
            'managed': True
        })

    if language is not None:
        collection.languages = [language]
    if country is not None:
        collection.countries = [country]
    db.session.commit()
    update_collection(collection)

    log.info('Crawling %r to %r...', directory, collection.foreign_id)
    document = Document.by_keys(collection=collection,
                                foreign_id=directory)
    ingest_document(document, directory)
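
For context, a minimal invocation sketch for the helper above; the path, language and country values are purely illustrative:

# Hypothetical call site: crawl a local folder into a managed
# collection keyed as 'directory:<slugified-path>'.
crawldir('/data/inbox', language='en', country='us')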
Example #2
def crawldir(path, language=None, country=None, foreign_id=None):
    """Crawl the given directory."""
    path = decode_path(path)
    if path is None or not os.path.exists(path):
        log.error("Invalid path: %r", path)
        return
    path = os.path.abspath(os.path.normpath(path))
    path_name = os.path.basename(path)

    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        collection = Collection.create({
            'foreign_id': foreign_id,
            'label': path_name,
            'managed': True
        })

    if language is not None:
        collection.languages = [language]
    if country is not None:
        collection.countries = [country]
    db.session.commit()
    update_collection(collection)

    log.info('Crawling %r to %r...', path, collection.foreign_id)
    document = Document.by_keys(collection=collection,
                                foreign_id=path)
    document.file_name = path_name
    ingest_document(document, path)
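
Compared to Example #1, this variant first runs the incoming path through decode_path to guard against file system encoding issues, labels the collection with the directory's base name rather than the full path, and records that base name as the document's file_name before ingesting.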
Example #3
def ingest_upload(id):
    collection = obj_or_404(Collection.by_id(id))
    require(request.authz.can_write(collection.id))

    try:
        meta = json.loads(request.form.get('meta', '{}'))
    except Exception as ex:
        raise BadRequest(str(ex))
    validate_data(meta, DocumentSchema)

    documents = []
    for storage in request.files.values():
        sec_fn = os.path.join(upload_folder, secure_filename(storage.filename))
        storage.save(sec_fn)
        content_hash = checksum(sec_fn)
        document = Document.by_keys(collection=collection,
                                    content_hash=content_hash)
        document.mime_type = storage.mimetype
        document.file_name = storage.filename
        document.update(meta)
        ingest_document(document, sec_fn, role_id=request.authz.id)
        os.unlink(sec_fn)
        documents.append(document)

    return jsonify({
        'status': 'ok',
        'documents': [DocumentSchema().dump(d).data for d in documents]
    })
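
For reference, a client for an endpoint like this could be sketched as follows. The URL is hypothetical (the route registration is not shown here), but the 'meta' form field and the multipart file parts match what the handler reads from request.form and request.files:

import json
import requests

# Hypothetical endpoint URL; the actual route depends on how the
# blueprint is registered in the application.
url = 'http://localhost:5000/api/collections/23/ingest'
meta = {'title': 'Quarterly report'}
with open('report.pdf', 'rb') as fh:
    resp = requests.post(url,
                         data={'meta': json.dumps(meta)},
                         files={'file': fh})
resp.raise_for_status()
print(resp.json()['documents'])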
Example #4
def ingest_upload(collection_id):
    collection = obj_or_404(Collection.by_id(collection_id))
    request.authz.require(request.authz.collection_write(collection.id))
    log_event(request)
    crawler_run = make_textid()

    try:
        meta = json.loads(request.form.get('meta', '{}'))
    except Exception as ex:
        raise BadRequest(str(ex))

    documents = []
    for storage in request.files.values():
        sec_fn = os.path.join(upload_folder, secure_filename(storage.filename))
        storage.save(sec_fn)
        content_hash = checksum(sec_fn)
        document = Document.by_keys(collection=collection,
                                    content_hash=content_hash)
        document.crawler = 'user_upload:%s' % request.authz.role.id
        document.crawler_run = crawler_run
        document.mime_type = storage.mimetype
        document.file_name = storage.filename

        try:
            meta = json.loads(request.form.get('meta', '{}'))
            validate(meta, 'metadata.json#')
            document.meta.update(meta)
        except Exception as ex:
            raise BadRequest(str(ex))

        ingest_document(document, sec_fn, user_queue=True)
        os.unlink(sec_fn)
        documents.append(document)
    return jsonify({'status': 'ok', 'documents': documents})
Example #5
    def test_load_sample_directory(self):
        samples_path = self.get_fixture_path('samples')
        document = Document.by_keys(collection=self.collection,
                                    foreign_id='samples')
        db.session.commit()
        db.session.refresh(document)
        ingest_document(document, samples_path)
        assert Document.all().count() == 5, Document.all().count()
Example #6
    def test_load_pdf_file(self):
        pdf_path = self.get_fixture_path('demo.pdf')
        document = Document.by_keys(collection=self.collection,
                                    foreign_id='demo.pdf')
        db.session.commit()
        db.session.refresh(document)
        ingest_document(document, pdf_path)
        assert Document.all().count() == 1, Document.all().count()
Example #7
    def emit_file(self, document, file_path, move=False):
        if isinstance(document, CrawlerMetadata):
            doc = self.create_document(foreign_id=document.foreign_id,
                                       content_hash=document.content_hash)
            doc.meta.update(document.meta)
            document = doc
        ingest_document(document, file_path)
        self.increment_count()
Example #8
    def ingest_document(self,
                        document,
                        file_path=None,
                        role_id=None,
                        shallow=False):
        """Ingest a database-backed document.

        First retrieve its data and then call the actual ingestor.
        """
        # Work path will be used by storagelayer to cache a local
        # copy of data from an S3-based archive, and by ingestors
        # to perform processing and generate intermediary files.
        work_path = mkdtemp(prefix="aleph.ingest.")
        content_hash = document.content_hash
        if file_path is None and content_hash is not None:
            file_name = document.safe_file_name
            file_path = self.archive.load_file(content_hash,
                                               file_name=file_name,
                                               temp_path=work_path)

        if file_path is not None and not os.path.exists(file_path):
            # Probably indicative of file system encoding issues.
            log.error("Invalid path [%r]: %s", document, file_path)
            return

        try:
            if not len(document.languages):
                document.languages = document.collection.languages

            if not len(document.countries):
                document.countries = document.collection.countries

            result = DocumentResult(self,
                                    document,
                                    file_path=file_path,
                                    role_id=role_id)
            self.ingest(file_path, result=result)

            if not shallow and file_path is None:
                # When a directory is ingested, the data is not stored. Thus,
                # try to recurse on the database-backed known children.
                for child in Document.by_parent(document):
                    from aleph.ingest import ingest_document
                    ingest_document(child, None, role_id=role_id)
        finally:
            db.session.rollback()
            # Removing the temp_path given to storagelayer makes it redundant
            # to also call cleanup on the archive.
            remove_directory(work_path)
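
The tests in Examples #5, #6 and #13 show the typical calling pattern for this method via the module-level wrapper; roughly:

# Sketch of a typical call site, mirroring the test fixtures above;
# the collection and the path '/data/reports' are assumed to exist.
from aleph.ingest import ingest_document

document = Document.by_keys(collection=collection, foreign_id='reports')
db.session.commit()
ingest_document(document, '/data/reports')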
Example #9
def ingest_upload(id):
    collection = obj_or_404(Collection.by_id(id))
    require(request.authz.can_write(collection.id))
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp()
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename)
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.mime_type = storage.mimetype
            if storage.filename:
                document.file_name = os.path.basename(storage.filename)
            document.update(meta)
            ingest_document(document, path,
                            role_id=request.authz.id)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.update(meta)
            ingest_document(document, upload_dir,
                            role_id=request.authz.id)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.delay(parent_id)

    return jsonify({
        'status': 'ok',
        'documents': [DocumentSchema().dump(d).data for d in documents]
    })
Example #10
def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            ingest_document(document, path, role_id=request.authz.id)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            ingest_document(document,
                            None,
                            role_id=request.authz.id,
                            shallow=True)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
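
Note the contrast with Example #9: when the request carries no file parts, this version explicitly marks the new document as Document.SCHEMA_FOLDER and ingests it with file_path=None and shallow=True, so the ingestor (see Examples #8 and #14) does not recurse into database-backed children.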
Example #11
    def handle_child(self, parent, file_path, title=None, mime_type=None,
                     id=None, file_name=None):
        file_path = decode_path(file_path)
        assert id is not None, (parent, file_path)

        doc = Document.by_keys(parent_id=parent.document.id,
                               collection=parent.document.collection,
                               foreign_id=id)
        doc.title = title or doc.meta.get('title')
        doc.file_name = file_name or doc.meta.get('file_name')
        doc.mime_type = mime_type or doc.meta.get('mime_type')

        from aleph.ingest import ingest_document
        ingest_document(doc, file_path, role_id=parent.role_id)
        return DocumentResult(self, doc,
                              file_path=file_path,
                              role_id=parent.role_id)
Example #12
    def handle_child(self, parent, file_path, title=None, mime_type=None,
                     id=None, file_name=None):
        file_path = decode_path(file_path)
        file_name = decode_path(file_name) or os.path.basename(file_path)

        content_hash = None
        if not os.path.isdir(file_path):
            content_hash = checksum(file_path)

        document = Document.by_keys(parent_id=parent.document.id,
                                    collection=parent.document.collection,
                                    foreign_id=id, content_hash=content_hash)
        document.title = title or document.meta.get('title')
        document.file_name = file_name or document.meta.get('file_name')
        document.mime_type = mime_type or document.meta.get('mime_type')

        from aleph.ingest import ingest_document
        ingest_document(document, file_path, user_queue=parent.user_queue)
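
Relative to Example #11, this version also checksums regular files before the Document.by_keys lookup, so a re-ingested child with unchanged content resolves to the same content_hash-keyed document; it also derives a fallback file_name from the path instead of asserting that an id was passed.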
Example #13
    def test_load_csv_file(self):
        csv_path = self.get_fixture_path('experts.csv')
        document = Document.by_keys(collection=self.collection,
                                    foreign_id='experts.csv')
        db.session.commit()
        db.session.refresh(document)
        ingest_document(document, csv_path)
        assert Document.all().count() == 1, Document.all().count()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 14, len(records)
        rec0 = records[0]
        assert str(rec0.id) in repr(rec0), repr(rec0)
        assert 'nationality' in rec0.data, rec0.data
        assert 'name' in rec0.data, rec0.data

        doc = rec0.document
        doc.delete_records()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 0, len(records)
Example #14
    def ingest_document(self, document, file_path=None,
                        role_id=None, shallow=False):
        """Ingest a database-backed document.

        First retrieve its data and then call the actual ingestor.
        """
        content_hash = document.content_hash
        if file_path is None and content_hash is not None:
            file_path = self.archive.load_file(content_hash, file_name=document.safe_file_name)  # noqa

        if file_path is not None and not os.path.exists(file_path):
            # Probably indicative of file system encoding issues.
            log.error("Invalid path [%r]: %s", document, file_path)
            return

        try:
            if document.collection is not None:
                if not len(document.languages):
                    document.languages = document.collection.languages

                if not len(document.countries):
                    document.countries = document.collection.countries

            result = DocumentResult(self, document,
                                    file_path=file_path,
                                    role_id=role_id)
            self.ingest(file_path, result=result)

            if not shallow and file_path is None:
                # When a directory is ingested, the data is not stored. Thus,
                # try to recurse transparently.
                for child in Document.by_parent(document):
                    from aleph.ingest import ingest_document
                    ingest_document(child, None, role_id=role_id)
        finally:
            db.session.rollback()
            if content_hash is not None:
                self.archive.cleanup_file(content_hash)
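
Unlike the variant in Example #8, this version allocates no local work_path; when the file was loaded out of the archive, the finally block instead asks the archive itself to clean up the cached copy by content hash.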