def __init__(self, strike_id, job_exe_id, configuration):
    '''Constructor

    :param strike_id: The ID of the Strike process
    :type strike_id: int
    :param job_exe_id: The ID of the job execution
    :type job_exe_id: int
    :param configuration: The Strike configuration
    :type configuration: :class:`ingest.strike.configuration.strike_configuration.StrikeConfiguration`
    '''

    self.strike_id = strike_id
    self.job_exe_id = job_exe_id
    self.configuration = configuration
    self.mount = None

    self.strike_dir = get_ingest_work_dir(job_exe_id)
    self.rel_deferred_dir = 'deferred'
    self.rel_duplicate_dir = 'duplicate'
    self.rel_ingest_dir = 'ingesting'
    self.deferred_dir = os.path.join(self.strike_dir, self.rel_deferred_dir)
    self.duplicate_dir = os.path.join(self.strike_dir, self.rel_duplicate_dir)
    self.ingest_dir = os.path.join(self.strike_dir, self.rel_ingest_dir)

    self.load_configuration(configuration)
def setUp(self):
    django.setup()

    self.ingest = ingest_test_utils.create_ingest(file_name='my_file.txt')
    self.mount = 'host:/path'
    self.mount_on = os.path.join('my', 'test')
    self.workspace = storage_test_utils.create_workspace()
    self.config = StrikeConfiguration({
        'version': '1.0',
        'mount': self.mount,
        'transfer_suffix': '_tmp',
        'files_to_ingest': [{
            'filename_regex': '.*txt',
            'workspace_path': 'foo',
            'workspace_name': self.workspace.name,
        }],
    })
    self.job_exe = job_test_utils.create_job_exe()

    self.strike_proc = StrikeProcessor(1, self.job_exe.id, self.config)
    self.strike_dir = get_ingest_work_dir(self.job_exe.id)
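# A hypothetical test method (not in the original suite) showing how the fixture
# above can verify the directory layout that the StrikeProcessor constructor
# derives from the ingest work directory:
def test_init_directory_layout(self):
    '''Tests that the processor builds its working directories under the strike dir'''
    self.assertEqual(self.strike_proc.strike_dir, self.strike_dir)
    self.assertEqual(self.strike_proc.deferred_dir, os.path.join(self.strike_dir, 'deferred'))
    self.assertEqual(self.strike_proc.duplicate_dir, os.path.join(self.strike_dir, 'duplicate'))
    self.assertEqual(self.strike_proc.ingest_dir, os.path.join(self.strike_dir, 'ingesting'))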
def cleanup_job_execution(self, job_exe):
    '''See :meth:`job.execution.job_exe_cleaner.JobExecutionCleaner.cleanup_job_execution`
    '''

    logger.info('Cleaning up a Strike job')

    ingest_work_dir = get_ingest_work_dir(job_exe.id)
    if os.path.exists(ingest_work_dir):
        nfs_umount(ingest_work_dir)
def cleanup_job_execution(self, job_exe):
    '''See :meth:`job.execution.job_exe_cleaner.JobExecutionCleaner.cleanup_job_execution`
    '''

    logger.info('Cleaning up a Strike job')

    ingest_work_dir = get_ingest_work_dir(job_exe.id)
    if os.path.exists(ingest_work_dir):
        nfs_umount(ingest_work_dir)
        logger.info('Deleting %s', ingest_work_dir)
        os.rmdir(ingest_work_dir)
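# A minimal sketch (assumed implementation; the real helper lives elsewhere in
# the codebase and may differ) of the get_ingest_work_dir() function used
# throughout these snippets. The root constant is hypothetical:
INGEST_WORK_ROOT = '/tmp/scale'  # hypothetical root for ingest work directories

def get_ingest_work_dir(job_exe_id):
    '''Returns the ingest work directory for the given job execution'''
    return os.path.join(INGEST_WORK_ROOT, 'ingest_work', str(job_exe_id))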
def perform_ingest(ingest_id, mount):
    '''Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: long
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: str
    '''

    job_exe_id = None
    upload_work_dir = None
    try:
        ingest = Ingest.objects.select_related().get(id=ingest_id)
        job_exe_id = JobExecution.objects.get_latest([ingest.job])[ingest.job.id].id
        ingest_work_dir = get_ingest_work_dir(job_exe_id)
        dup_path = os.path.join(ingest_work_dir, 'duplicate', ingest.file_name)
        ingest_path = os.path.join(ingest_work_dir, ingest.ingest_path)
        upload_work_dir = os.path.join(os.path.dirname(ingest_path), 'upload', str(ingest_id))

        if not os.path.exists(ingest_work_dir):
            logger.info('Creating %s', ingest_work_dir)
            os.makedirs(ingest_work_dir, mode=0755)
        nfs_mount(mount, ingest_work_dir, read_only=False)

        # Check condition of the ingest
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return

        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path, ingest.workspace.name)
        try:
            src_file = SourceFile.objects.store_file(upload_work_dir, ingest_path, ingest.get_data_type_tags(),
                                                     ingest.workspace, ingest.file_path)

            # Atomically store file, mark INGESTED, and run ingest trigger rules
            with transaction.atomic():
                # TODO: It's possible that the file will be successfully moved into the workspace but this database
                # transaction might fail. This will result in a file that is in a workspace but doesn't have database
                # entries. Attempts to re-ingest will result in duplicate file errors.
                logger.info('Marking file as INGESTED: %i', ingest_id)
                ingest.source_file = src_file
                ingest.status = 'INGESTED'
                ingest.ingest_ended = timezone.now()
                ingest.save()

                logger.debug('Checking ingest trigger rules')
                for ingest_rule in get_ingest_rules():
                    ingest_rule.process_ingest(ingest, src_file.id)

            # Delete ingest file
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            ingest.status = 'DUPLICATE'
            ingest.save()
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            ingest.status = 'ERRORED'
            ingest.save()
            raise  # File remains where it is so it can be processed again
    finally:
        try:
            if upload_work_dir and os.path.exists(upload_work_dir):
                logger.info('Deleting %s', upload_work_dir)
                shutil.rmtree(upload_work_dir)
        except:
            # Swallow exception so error from main try block isn't covered up
            logger.exception('Failed to delete upload work dir %s', upload_work_dir)

        if job_exe_id:
            cleanup_job_exe(job_exe_id)
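# Minimal sketches (assumed implementations; the real helpers are defined
# elsewhere in this module and may differ) of the file helpers called above.
# _move_ingest_file must create the 'duplicate' subdirectory on first use,
# since only the parent ingest work dir is guaranteed to exist:
def _delete_ingest_file(ingest_path):
    '''Deletes the given ingest file'''
    if os.path.exists(ingest_path):
        logger.info('Deleting %s', ingest_path)
        os.remove(ingest_path)


def _move_ingest_file(ingest_path, dup_path):
    '''Moves the given ingest file to the duplicate location'''
    if os.path.exists(ingest_path):
        dup_dir = os.path.dirname(dup_path)
        if not os.path.exists(dup_dir):
            logger.info('Creating %s', dup_dir)
            os.makedirs(dup_dir, mode=0755)
        logger.info('Moving %s to %s', ingest_path, dup_path)
        shutil.move(ingest_path, dup_path)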
def perform_ingest(ingest_id, mount):
    '''Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: long
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: str
    '''

    job_exe_id = None
    upload_work_dir = None
    try:
        # TODO: refactor to combine _get_ingest(), _get_job_exe_id(), and _set_ingesting_status() in one database
        # transaction with as few queries as possible, include retries
        ingest = _get_ingest(ingest_id)
        job_exe_id = _get_job_exe_id(ingest)

        create_job_exe_dir(job_exe_id)
        ingest_work_dir = get_ingest_work_dir(job_exe_id)
        dup_path = os.path.join(ingest_work_dir, 'duplicate', ingest.file_name)
        ingest_path = os.path.join(ingest_work_dir, ingest.ingest_path)
        upload_work_dir = os.path.join(os.path.dirname(ingest_path), 'upload', str(ingest_id))
        if not os.path.exists(ingest_work_dir):
            logger.info('Creating %s', ingest_work_dir)
            os.makedirs(ingest_work_dir, mode=0755)
        nfs_mount(mount, ingest_work_dir, read_only=False)
        if not os.path.exists(upload_work_dir):
            logger.info('Creating %s', upload_work_dir)
            os.makedirs(upload_work_dir, mode=0755)

        # Check condition of the ingest
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return

        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path, ingest.workspace.name)
        try:
            # TODO: future refactor: before copying file, grab existing source file (no lock) or create and save model
            # This guarantees that source file exists and can be used to check if file is duplicate
            # After this step, the source file should be marked as is_deleted so that it can't be used yet
            src_file = SourceFile.objects.store_file(upload_work_dir, ingest_path, ingest.get_data_type_tags(),
                                                     ingest.workspace, ingest.file_path)
            _complete_ingest(ingest, 'INGESTED', src_file)
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            # TODO: future refactor: pass source file model in so source files have duplicate ingests tied to them
            _complete_ingest(ingest, 'DUPLICATE', None)
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            # TODO: future refactor: pass source file model in so source files have errored ingests tied to them
            # TODO: change ERRORED to FAILED
            _complete_ingest(ingest, 'ERRORED', None)
            raise  # File remains where it is so it can be processed again
    finally:
        try:
            # Try to clean up the upload directory
            if upload_work_dir and os.path.exists(upload_work_dir):
                upload_dir = os.path.join(upload_work_dir, 'upload')
                workspace_work_dir = os.path.join(upload_work_dir, 'work')
                if os.path.exists(workspace_work_dir):
                    ScaleFile.objects.cleanup_upload_dir(upload_dir, workspace_work_dir, ingest.workspace)
                    logger.info('Deleting %s', workspace_work_dir)
                    os.rmdir(workspace_work_dir)
                if os.path.exists(upload_dir):
                    logger.info('Deleting %s', upload_dir)
                    # Delete everything in upload dir
                    shutil.rmtree(upload_dir)
                logger.info('Deleting %s', upload_work_dir)
                os.rmdir(upload_work_dir)
        except:
            # Swallow exception so error from main try block isn't covered up
            logger.exception('Failed to delete upload work dir %s', upload_work_dir)

        try:
            if job_exe_id:
                cleanup_job_exe(job_exe_id)
        except Exception:
            logger.exception('Job Execution %i: Error cleaning up', job_exe_id)
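# Hedged sketches of the helpers introduced by the refactored version above,
# reconstructed from the inline logic in the earlier version of perform_ingest();
# the real implementations may differ:
def _get_ingest(ingest_id):
    '''Loads the ingest model with its related fields'''
    return Ingest.objects.select_related().get(id=ingest_id)


def _get_job_exe_id(ingest):
    '''Returns the ID of the latest job execution for the ingest's job'''
    return JobExecution.objects.get_latest([ingest.job])[ingest.job.id].id


def _complete_ingest(ingest, status, src_file):
    '''Atomically marks the ingest with the given status and, on success,
    runs the ingest trigger rules (mirroring the inline transaction in the
    earlier version of perform_ingest())'''
    with transaction.atomic():
        logger.info('Marking ingest %i as %s', ingest.id, status)
        ingest.status = status
        if status == 'INGESTED':
            ingest.source_file = src_file
            ingest.ingest_ended = timezone.now()
        ingest.save()

        if status == 'INGESTED':
            logger.debug('Checking ingest trigger rules')
            for ingest_rule in get_ingest_rules():
                ingest_rule.process_ingest(ingest, src_file.id)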