Пример #1
0
 def create_temp_dir(self, *args, **kwargs):
     """Yield a freshly created temporary directory and delete it afterwards.

     Every positional and keyword argument is forwarded unchanged to
     ``tempfile.mkdtemp``. The yielded value is the new path after it has
     been passed through ``decode_path``. Once the generator is resumed or
     closed, the directory is handed to ``remove_directory``, even if the
     consumer raised.

     NOTE(review): presumably wrapped with ``contextlib.contextmanager``;
     the decorator is not visible from here -- confirm.
     """
     scratch = tempfile.mkdtemp(*args, **kwargs)
     try:
         decoded = decode_path(scratch)
         yield decoded
     finally:
         # Clean up using the raw path returned by mkdtemp, not the
         # decoded one handed to the consumer.
         remove_directory(scratch)
Пример #2
0
def ingest(document_id, file_path=None):
    """Process a given document by extracting its contents.

    This may include creating or updating child documents.

    Args:
        document_id: identifier looked up through ``Document.by_id``.
        file_path: optional local path of the source file. When ``None``,
            the file is loaded from the archive into the work path.

    Returns:
        None. A missing document is logged and the call returns early.

    Failures raised while ingesting are logged and the database session is
    rolled back. Failures raised while loading the file from the archive
    propagate to the caller. In both cases the work path is removed.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    try:
        # Loading happens inside the outer try so that a failing archive
        # does not leave the freshly created work path behind.
        if file_path is None:
            file_path = archive.load_file(document.content_hash,
                                          file_name=document.safe_file_name,
                                          temp_path=work_path)

        try:
            # Use one manager for both the result and the ingest call.
            manager = get_manager()
            result = DocumentResult(manager, document, file_path=file_path)
            manager.ingest(file_path, result=result, work_path=work_path)

            log.debug('Ingested [%s:%s]: %s', document.id, document.schema,
                      document.name)
            db.session.commit()
            process_document(document)
        except Exception:
            db.session.rollback()
            # Re-fetch the document after the rollback before reading
            # its attributes for the log message.
            document = Document.by_id(document_id)
            log.exception("Ingest failed [%s]: %s",
                          document.id, document.name)
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)
Пример #3
0
    def ingest_document(self,
                        document,
                        file_path=None,
                        role_id=None,
                        shallow=False):
        """Ingest a database-backed document.

        First retrieve its data and then call the actual ingestor.

        Args:
            document: the document to ingest; its ``content_hash``,
                ``safe_file_name``, ``languages``, ``countries`` and
                ``collection`` attributes are read.
            file_path: optional local path of the source file. When
                ``None`` and the document has a content hash, the file is
                loaded from ``self.archive`` into the work path.
            role_id: passed on to ``DocumentResult`` and to the ingest of
                child documents.
            shallow: when true, known child documents are not re-ingested.

        Returns:
            None. If the resolved path does not exist on disk, an error is
            logged and the call returns early.

        The work path is removed on every exit, including the early return
        and a failed archive load.
        """
        # Work path will be used by storagelayer to cache a local
        # copy of data from an S3-based archive, and by ingestors
        # to perform processing and generate intermediary files.
        work_path = mkdtemp(prefix="aleph.ingest.")
        try:
            content_hash = document.content_hash
            if file_path is None and content_hash is not None:
                file_name = document.safe_file_name
                file_path = self.archive.load_file(content_hash,
                                                   file_name=file_name,
                                                   temp_path=work_path)

            if file_path is not None and not os.path.exists(file_path):
                # Probably indicative of file system encoding issues.
                log.error("Invalid path [%r]: %s", document, file_path)
                return

            try:
                # Truthiness also covers a missing (None) value, which
                # would make len() raise.
                if not document.languages:
                    document.languages = document.collection.languages

                if not document.countries:
                    document.countries = document.collection.countries

                result = DocumentResult(self,
                                        document,
                                        file_path=file_path,
                                        role_id=role_id)
                self.ingest(file_path, result=result)

                if not shallow and file_path is None:
                    # When a directory is ingested, the data is not stored.
                    # Thus, try to recurse on the database-backed known
                    # children. The import is done here rather than at
                    # module level, as in the original, and only once.
                    from aleph.ingest import ingest_document
                    for child in Document.by_parent(document):
                        ingest_document(child, None, role_id=role_id)
            finally:
                db.session.rollback()
        finally:
            # Removing the temp_path given to storagelayer makes it
            # redundant to also call cleanup on the archive.
            remove_directory(work_path)
Пример #4
0
def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.

    This may include creating or updating child documents.

    Args:
        document_id: identifier looked up through ``Document.by_id``.
        file_path: optional local path of the source file. When ``None``,
            the file is loaded from the archive into the work path.
        refresh: when true, no ``Events.INGEST_DOCUMENT`` event is
            published for documents in casefile collections.

    Returns:
        None. A missing document is logged and the call returns early.

    On success the document status is set to ``Document.STATUS_SUCCESS``.
    On an ingest failure the session is rolled back and the status is set
    to ``Document.STATUS_FAIL``. In both cases the document is then tagged,
    indexed and refreshed. Failures raised while loading the file from the
    archive propagate to the caller. The work path is always removed.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    try:
        # Loading happens inside the outer try so that a failing archive
        # does not leave the freshly created work path behind.
        if file_path is None:
            file_path = archive.load_file(document.content_hash,
                                          file_name=document.safe_file_name,
                                          temp_path=work_path)

        try:
            # Use one manager for both the result and the ingest call.
            manager = get_manager()
            result = DocumentResult(manager, document, file_path=file_path)
            manager.ingest(file_path, result=result, work_path=work_path)

            document.status = Document.STATUS_SUCCESS
            log.debug('Ingested [%s:%s]: %s', document.id, document.schema,
                      document.name)

            if document.collection.casefile and not refresh:
                params = {
                    'collection': document.collection,
                    'document': document
                }
                publish(Events.INGEST_DOCUMENT,
                        actor_id=document.uploader_id,
                        params=params)

            db.session.commit()
        except Exception:
            db.session.rollback()
            # Re-fetch the document after the rollback before touching it.
            document = Document.by_id(document_id)
            log.exception("Ingest failed [%s]: %s",
                          document.id, document.name)
            document.status = Document.STATUS_FAIL
            db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

    extract_document_tags(document)
    # delete_entity(document.id, exclude=document.schema)
    index_document(document)
    refresh_entity(document)
Пример #5
0
 def extract_message(self, zipf, name):
     """Extract one message XML entry from the archive plus its attachments.

     Entries whose name does not contain ``message_`` or does not end in
     ``.xml`` are ignored. Otherwise the entry is extracted into an empty
     directory and registered as a child of the result of
     ``extract_hierarchy``. Every ``messageAttachment`` element found in
     the parsed XML is passed to ``extract_attachment``. The directory is
     removed when processing ends, whether or not it succeeded.
     """
     is_message_xml = 'message_' in name and name.endswith('.xml')
     if not is_message_xml:
         return

     parent = self.extract_hierarchy(name)
     scratch_dir = self.make_empty_directory()
     try:
         xml_path = self.extract_file(zipf, name, scratch_dir)
         child_id = os.path.join(self.result.id, name)
         message = self.manager.handle_child(
             parent,
             xml_path,
             id=child_id,
             mime_type=MIME,
         )
         try:
             tree = self.parse_xml(xml_path)
             attachments = tree.findall('.//messageAttachment')
             for attachment in attachments:
                 self.extract_attachment(zipf, message, attachment,
                                         scratch_dir)
         except TypeError:
             # this will be reported for the individual file.
             pass
     finally:
         remove_directory(scratch_dir)
Пример #6
0
 def close(self):
     """Flush the writer and remove the work directory.

     The work path is removed even when flushing raises, so a failed
     flush does not leave the directory behind. The flush error still
     propagates to the caller.
     """
     try:
         self.writer.flush()
     finally:
         remove_directory(self.work_path)
Пример #7
0
 def finalize(self, entity):
     """Emit the given entity, flush the writer and remove the work directory.

     Args:
         entity: the entity handed to ``self.emit_entity``.

     The work path is removed even when emitting or flushing raises. The
     error still propagates to the caller.
     """
     try:
         self.emit_entity(entity)
         self.writer.flush()
     finally:
         remove_directory(self.work_path)
Пример #8
0
 def cleanup(self):
     """Hand this object's work path to ``remove_directory``."""
     work_path = self.work_path
     remove_directory(work_path)