Пример #1
0
    def __init__(self, strike_id, job_exe_id, configuration):
        '''Constructor

        :param strike_id: The ID of the Strike process
        :type strike_id: int
        :param job_exe_id: The ID of the job execution
        :type job_exe_id: int
        :param configuration: The Strike configuration
        :type configuration: :class:`ingest.strike.configuration.strike_configuration.StrikeConfiguration`
        '''

        self.strike_id = strike_id
        self.job_exe_id = job_exe_id
        self.configuration = configuration
        # None until a mount is established (presumably by load_configuration
        # below — TODO confirm)
        self.mount = None

        # Work directory for this job execution, plus the relative names of
        # the sub-directories used while processing files
        work_dir = get_ingest_work_dir(job_exe_id)
        self.strike_dir = work_dir
        self.rel_deferred_dir = 'deferred'
        self.rel_duplicate_dir = 'duplicate'
        self.rel_ingest_dir = 'ingesting'
        self.deferred_dir = os.path.join(work_dir, self.rel_deferred_dir)
        self.duplicate_dir = os.path.join(work_dir, self.rel_duplicate_dir)
        self.ingest_dir = os.path.join(work_dir, self.rel_ingest_dir)

        self.load_configuration(configuration)
Пример #2
0
    def __init__(self, strike_id, job_exe_id, configuration):
        '''Constructor

        :param strike_id: The ID of the Strike process
        :type strike_id: int
        :param job_exe_id: The ID of the job execution
        :type job_exe_id: int
        :param configuration: The Strike configuration
        :type configuration: :class:`ingest.strike.configuration.strike_configuration.StrikeConfiguration`
        '''

        self.strike_id = strike_id
        self.job_exe_id = job_exe_id
        self.configuration = configuration
        # No mount yet; stays None until set elsewhere
        self.mount = None

        self.strike_dir = get_ingest_work_dir(job_exe_id)
        # Relative sub-directory names under the Strike work directory
        self.rel_deferred_dir = 'deferred'
        self.rel_duplicate_dir = 'duplicate'
        self.rel_ingest_dir = 'ingesting'
        join = os.path.join
        self.deferred_dir = join(self.strike_dir, self.rel_deferred_dir)
        self.duplicate_dir = join(self.strike_dir, self.rel_duplicate_dir)
        self.ingest_dir = join(self.strike_dir, self.rel_ingest_dir)

        self.load_configuration(configuration)
Пример #3
0
    def setUp(self):
        '''Builds the Strike fixtures shared by the tests in this case.'''
        django.setup()

        self.ingest = ingest_test_utils.create_ingest(file_name='my_file.txt')

        self.mount = 'host:/path'
        self.mount_on = os.path.join('my', 'test')
        self.workspace = storage_test_utils.create_workspace()
        # Minimal valid Strike configuration: ingest any .txt file into the
        # test workspace
        config_dict = {
            'version': '1.0',
            'mount': self.mount,
            'transfer_suffix': '_tmp',
            'files_to_ingest': [{
                'filename_regex': '.*txt',
                'workspace_path': 'foo',
                'workspace_name': self.workspace.name,
            }],
        }
        self.config = StrikeConfiguration(config_dict)
        self.job_exe = job_test_utils.create_job_exe()

        self.strike_proc = StrikeProcessor(1, self.job_exe.id, self.config)
        self.strike_dir = get_ingest_work_dir(self.job_exe.id)
Пример #4
0
    def cleanup_job_execution(self, job_exe):
        '''See :meth:`job.execution.job_exe_cleaner.JobExecutionCleaner.cleanup_job_execution`
        '''

        logger.info('Cleaning up a Strike job')

        work_dir = get_ingest_work_dir(job_exe.id)
        # Nothing to unmount if the work directory was never created
        if not os.path.exists(work_dir):
            return
        nfs_umount(work_dir)
Пример #5
0
    def cleanup_job_execution(self, job_exe):
        '''See :meth:`job.execution.job_exe_cleaner.JobExecutionCleaner.cleanup_job_execution`
        '''

        logger.info('Cleaning up a Strike job')

        work_dir = get_ingest_work_dir(job_exe.id)
        # Nothing to do if the work directory was never created
        if not os.path.exists(work_dir):
            return
        # Unmount first, then remove the directory itself
        nfs_umount(work_dir)
        logger.info('Deleting %s', work_dir)
        os.rmdir(work_dir)
Пример #6
0
    def setUp(self):
        '''Builds the Strike fixtures shared by the tests in this case.'''
        django.setup()

        self.ingest = ingest_test_utils.create_ingest(file_name='my_file.txt')

        self.mount = 'host:/path'
        self.mount_on = os.path.join('my', 'test')
        self.workspace = storage_test_utils.create_workspace()
        # Single ingest rule: any .txt file goes into the test workspace
        file_rule = {
            'filename_regex': '.*txt',
            'workspace_path': 'foo',
            'workspace_name': self.workspace.name,
        }
        self.config = StrikeConfiguration({
            'version': '1.0',
            'mount': self.mount,
            'transfer_suffix': '_tmp',
            'files_to_ingest': [file_rule],
        })
        self.job_exe = job_test_utils.create_job_exe()

        self.strike_proc = StrikeProcessor(1, self.job_exe.id, self.config)
        self.strike_dir = get_ingest_work_dir(self.job_exe.id)
Пример #7
0
def perform_ingest(ingest_id, mount):
    '''Performs the ingest for the given ingest ID

    Mounts the ingest work directory over NFS, stores the ingest file into its
    workspace, and atomically marks the ingest INGESTED while running any
    ingest trigger rules. On a duplicate file the ingest is marked DUPLICATE
    and the file moved aside; on any other error it is marked ERRORED and the
    file is left in place so the ingest can be attempted again.

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: long
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: str
    '''

    job_exe_id = None
    upload_work_dir = None
    try:
        ingest = Ingest.objects.select_related().get(id=ingest_id)
        job_exe_id = JobExecution.objects.get_latest([ingest.job])[ingest.job.id].id
        ingest_work_dir = get_ingest_work_dir(job_exe_id)
        dup_path = os.path.join(ingest_work_dir, 'duplicate', ingest.file_name)
        ingest_path = os.path.join(ingest_work_dir, ingest.ingest_path)
        upload_work_dir = os.path.join(os.path.dirname(ingest_path), 'upload', str(ingest_id))
        if not os.path.exists(ingest_work_dir):
            logger.info('Creating %s', ingest_work_dir)
            # 0o755 octal literal is valid on Python 2.6+ and Python 3
            # (the old 0755 form is a syntax error on Python 3)
            os.makedirs(ingest_work_dir, mode=0o755)
        nfs_mount(mount, ingest_work_dir, read_only=False)

        # Check condition of the ingest; None means there is nothing to do
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return

        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path, ingest.workspace.name)
        try:
            src_file = SourceFile.objects.store_file(upload_work_dir, ingest_path, ingest.get_data_type_tags(),
                                                     ingest.workspace, ingest.file_path)
            # Atomically store file, mark INGESTED, and run ingest trigger rules
            with transaction.atomic():
                # TODO: It's possible that the file will be successfully moved into the workspace but this database
                # transaction might fail. This will result in a file that is in a workspace but doesn't have database
                # entries. Attempts to re-ingest will result in duplicate file errors.
                logger.info('Marking file as INGESTED: %i', ingest_id)
                ingest.source_file = src_file
                ingest.status = 'INGESTED'
                ingest.ingest_ended = timezone.now()
                ingest.save()
                logger.debug('Checking ingest trigger rules')
                for ingest_rule in get_ingest_rules():
                    ingest_rule.process_ingest(ingest, src_file.id)

            # Delete ingest file
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            ingest.status = 'DUPLICATE'
            ingest.save()
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            ingest.status = 'ERRORED'
            ingest.save()
            raise  # File remains where it is so it can be processed again
    finally:
        try:
            if upload_work_dir and os.path.exists(upload_work_dir):
                logger.info('Deleting %s', upload_work_dir)
                shutil.rmtree(upload_work_dir)
        except Exception:
            # Swallow exception so error from main try block isn't covered up.
            # Narrowed from a bare except, which would also have swallowed
            # SystemExit/KeyboardInterrupt.
            logger.exception('Failed to delete upload work dir %s', upload_work_dir)

        if job_exe_id:
            try:
                cleanup_job_exe(job_exe_id)
            except Exception:
                # A cleanup failure inside finally must not mask an exception
                # from the main try block
                logger.exception('Job Execution %i: Error cleaning up', job_exe_id)
Пример #8
0
def perform_ingest(ingest_id, mount):
    '''Performs the ingest for the given ingest ID

    Mounts the ingest work directory over NFS, stores the ingest file into its
    workspace, and completes the ingest as INGESTED, DUPLICATE, or ERRORED.
    On an unexpected error the file is left in place so that the ingest can be
    attempted again.

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: long
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: str
    '''

    job_exe_id = None
    upload_work_dir = None
    try:
        # TODO: refactor to combine _get_ingest(), _get_job_exe_id(), and _set_ingesting_status() in one database
        # transaction with as few queries as possible, include retries
        ingest = _get_ingest(ingest_id)
        job_exe_id = _get_job_exe_id(ingest)
        create_job_exe_dir(job_exe_id)
        ingest_work_dir = get_ingest_work_dir(job_exe_id)
        dup_path = os.path.join(ingest_work_dir, 'duplicate', ingest.file_name)
        ingest_path = os.path.join(ingest_work_dir, ingest.ingest_path)
        upload_work_dir = os.path.join(os.path.dirname(ingest_path), 'upload', str(ingest_id))
        if not os.path.exists(ingest_work_dir):
            logger.info('Creating %s', ingest_work_dir)
            # 0o755 octal literal is valid on Python 2.6+ and Python 3
            # (the old 0755 form is a syntax error on Python 3)
            os.makedirs(ingest_work_dir, mode=0o755)
        nfs_mount(mount, ingest_work_dir, read_only=False)
        if not os.path.exists(upload_work_dir):
            logger.info('Creating %s', upload_work_dir)
            os.makedirs(upload_work_dir, mode=0o755)

        # Check condition of the ingest; None means there is nothing to do
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return

        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path, ingest.workspace.name)
        try:
            # TODO: future refactor: before copying file, grab existing source file (no lock) or create and save model
            # This guarantees that source file exists and can be used to check if file is duplicate
            # After this step, the source file should be marked as is_deleted so that it can't be used yet
            src_file = SourceFile.objects.store_file(upload_work_dir, ingest_path, ingest.get_data_type_tags(),
                                                     ingest.workspace, ingest.file_path)

            _complete_ingest(ingest, 'INGESTED', src_file)
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            # TODO: future refactor: pass source file model in so source files have duplicate ingests tied to them
            _complete_ingest(ingest, 'DUPLICATE', None)
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            # TODO: future refactor: pass source file model in so source files have errored ingests tied to them
            # TODO: change ERRORED to FAILED
            _complete_ingest(ingest, 'ERRORED', None)
            raise  # File remains where it is so it can be processed again
    finally:
        try:
            # Try to clean up the upload directory
            if upload_work_dir and os.path.exists(upload_work_dir):
                upload_dir = os.path.join(upload_work_dir, 'upload')
                workspace_work_dir = os.path.join(upload_work_dir, 'work')
                if os.path.exists(workspace_work_dir):
                    ScaleFile.objects.cleanup_upload_dir(upload_dir, workspace_work_dir, ingest.workspace)
                    logger.info('Deleting %s', workspace_work_dir)
                    os.rmdir(workspace_work_dir)
                if os.path.exists(upload_dir):
                    logger.info('Deleting %s', upload_dir)
                    # Delete everything in upload dir
                    shutil.rmtree(upload_dir)
                logger.info('Deleting %s', upload_work_dir)
                os.rmdir(upload_work_dir)
        except Exception:
            # Swallow exception so error from main try block isn't covered up.
            # Narrowed from a bare except, which would also have swallowed
            # SystemExit/KeyboardInterrupt.
            logger.exception('Failed to delete upload work dir %s', upload_work_dir)

    try:
        if job_exe_id:
            cleanup_job_exe(job_exe_id)
    except Exception:
        logger.exception('Job Execution %i: Error cleaning up', job_exe_id)