def perform_ingest(ingest_id, mount):
    """Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: string
    """

    # TODO: refactor to combine _get_ingest(), _get_job_exe_id(), and _set_ingesting_status() in one database
    # transaction with as few queries as possible, include retries
    ingest = _get_ingest(ingest_id)
    job_exe_id = _get_job_exe_id(ingest)

    # Make sure the NFS mount point exists before mounting
    if not os.path.exists(SCALE_INGEST_MOUNT_PATH):
        logger.info('Creating %s', SCALE_INGEST_MOUNT_PATH)
        os.makedirs(SCALE_INGEST_MOUNT_PATH, mode=0o755)
    dup_path = os.path.join(SCALE_INGEST_MOUNT_PATH, 'duplicate', ingest.file_name)
    ingest_path = os.path.join(SCALE_INGEST_MOUNT_PATH, ingest.ingest_path)

    nfs_mount(mount, SCALE_INGEST_MOUNT_PATH, read_only=False)
    try:
        # Check condition of the ingest
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return
        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path, ingest.workspace.name)
        try:
            # TODO: future refactor: before copying file, grab existing source file (no lock) or create and save model
            # This guarantees that source file exists and can be used to check if file is duplicate
            # After this step, the source file should be marked as is_deleted so that it can't be used yet
            src_file = SourceFile.objects.store_file(ingest_path, ingest.get_data_type_tags(), ingest.workspace,
                                                     ingest.file_path)
            _complete_ingest(ingest, 'INGESTED', src_file)
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            # TODO: future refactor: pass source file model in so source files have duplicate ingests tied to them
            _complete_ingest(ingest, 'DUPLICATE', None)
            # Duplicates are parked in the dup directory instead of being deleted
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            # TODO: future refactor: pass source file model in so source files have errored ingests tied to them
            # TODO: change ERRORED to FAILED
            _complete_ingest(ingest, 'ERRORED', None)
            raise  # File remains where it is so it can be processed again
    finally:
        nfs_umount(SCALE_INGEST_MOUNT_PATH)

    # Best-effort cleanup of the job execution; a failure here is logged, not raised
    try:
        cleanup_job_exe(job_exe_id)
    except Exception:
        logger.exception('Job Execution %i: Error cleaning up', job_exe_id)
def move_files(self, work_dir, files_to_move):
    """See :meth:`storage.brokers.broker.Broker.move_files`
    """

    nfs_mount(self.mount, work_dir, False)
    try:
        for old_rel_path, new_rel_path in files_to_move:
            src_abs = os.path.join(work_dir, old_rel_path)
            dest_abs = os.path.join(work_dir, new_rel_path)

            # Create the destination directory tree if it isn't there yet
            dest_dir = os.path.dirname(dest_abs)
            if not os.path.exists(dest_dir):
                logger.info('Creating %s', dest_dir)
                os.makedirs(dest_dir, mode=0o755)

            logger.info('Moving %s to %s', src_abs, dest_abs)
            shutil.move(src_abs, dest_abs)
            # Normalize permissions on the moved file
            os.chmod(dest_abs, 0o644)
    finally:
        nfs_umount(work_dir)
def perform_ingest(ingest_id, mount):
    """Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: string
    """

    # TODO: refactor to combine _get_ingest(), _get_job_exe_id(), and _set_ingesting_status() in one database
    # transaction with as few queries as possible, include retries
    ingest = _get_ingest(ingest_id)
    job_exe_id = _get_job_exe_id(ingest)

    mount_point = SCALE_INGEST_MOUNT_PATH
    if not os.path.exists(mount_point):
        logger.info('Creating %s', mount_point)
        os.makedirs(mount_point, mode=0o755)

    duplicate_path = os.path.join(mount_point, 'duplicate', ingest.file_name)
    staged_path = os.path.join(mount_point, ingest.ingest_path)

    nfs_mount(mount, mount_point, read_only=False)
    try:
        # Check condition of the ingest
        ingest = _set_ingesting_status(ingest, staged_path, duplicate_path)
        if ingest is None:
            # Another process already handled this ingest
            return

        logger.info('Storing %s into %s on %s', staged_path, ingest.file_path, ingest.workspace.name)
        try:
            # TODO: future refactor: before copying file, grab existing source file (no lock) or create and save model
            # This guarantees that source file exists and can be used to check if file is duplicate
            # After this step, the source file should be marked as is_deleted so that it can't be used yet
            stored = SourceFile.objects.store_file(staged_path, ingest.get_data_type_tags(), ingest.workspace,
                                                   ingest.file_path)
            _complete_ingest(ingest, 'INGESTED', stored)
            _delete_ingest_file(staged_path)
            logger.info('Ingest successful: %s', staged_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            # TODO: future refactor: pass source file model in so source files have duplicate ingests tied to them
            _complete_ingest(ingest, 'DUPLICATE', None)
            _move_ingest_file(staged_path, duplicate_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            # TODO: future refactor: pass source file model in so source files have errored ingests tied to them
            # TODO: change ERRORED to FAILED
            _complete_ingest(ingest, 'ERRORED', None)
            raise  # File remains where it is so it can be processed again
    finally:
        nfs_umount(mount_point)

    try:
        cleanup_job_exe(job_exe_id)
    except Exception:
        logger.exception('Job Execution %i: Error cleaning up', job_exe_id)
def delete_files(self, work_dir, workspace_paths):
    """See :meth:`storage.brokers.broker.Broker.delete_files`
    """

    nfs_mount(self.mount, work_dir, False)
    try:
        for rel_path in workspace_paths:
            target = os.path.join(work_dir, rel_path)
            # Missing files are silently skipped (already gone)
            if os.path.exists(target):
                logger.info('Deleting %s', target)
                os.remove(target)
    finally:
        nfs_umount(work_dir)
def delete_files(self, work_dir, workspace_paths):
    """See :meth:`storage.brokers.broker.Broker.delete_files`
    """

    nfs_mount(self.mount, work_dir, False)
    try:
        for workspace_path in workspace_paths:
            abs_path = os.path.join(work_dir, workspace_path)
            if not os.path.exists(abs_path):
                # Nothing to do for files that are already gone
                continue
            logger.info('Deleting %s', abs_path)
            os.remove(abs_path)
    finally:
        nfs_umount(work_dir)
def mount_and_process_dir(self):
    """Mounts NFS and processes the current files in the Strike directory
    """

    try:
        # Create the local mount point if it doesn't exist yet
        if not os.path.exists(self.strike_dir):
            logger.info('Creating %s', self.strike_dir)
            os.makedirs(self.strike_dir, mode=0o755)
        nfs_mount(self.mount, self.strike_dir, read_only=False)
        self._init_dirs()
        self._process_dir()
    except Exception:
        # Swallow and log so one bad pass doesn't kill the Strike processor loop
        logger.exception('Strike processor encountered error.')
    finally:
        nfs_umount(self.strike_dir)
def upload_files(self, upload_dir, work_dir, files_to_upload):
    """See :meth:`storage.brokers.broker.Broker.upload_files`

    :param upload_dir: Absolute directory containing the local source files
    :param work_dir: Absolute directory where the workspace is mounted
    :param files_to_upload: List of (src_path, workspace_path) pairs, both relative
    """

    nfs_mount(self.mount, work_dir, False)
    try:
        for file_to_upload in files_to_upload:
            src_path = file_to_upload[0]
            workspace_path = file_to_upload[1]
            full_src_path = os.path.join(upload_dir, src_path)
            full_workspace_path = os.path.join(work_dir, workspace_path)

            # Create the destination directory tree if needed
            full_workspace_dir = os.path.dirname(full_workspace_path)
            if not os.path.exists(full_workspace_dir):
                logger.info('Creating %s', full_workspace_dir)
                os.makedirs(full_workspace_dir, mode=0o755)

            self._copy_file(full_src_path, full_workspace_path)
            # Normalize permissions on the uploaded file, matching move_files()
            os.chmod(full_workspace_path, 0o644)
    finally:
        nfs_umount(work_dir)
def upload_files(self, upload_dir, work_dir, files_to_upload):
    """See :meth:`storage.brokers.broker.Broker.upload_files`

    :param upload_dir: Absolute directory containing the local source files
    :param work_dir: Absolute directory where the workspace is mounted
    :param files_to_upload: List of (src_path, workspace_path) pairs, both relative
    """

    nfs_mount(self.mount, work_dir, False)
    try:
        for file_to_upload in files_to_upload:
            src_path = file_to_upload[0]
            workspace_path = file_to_upload[1]
            full_src_path = os.path.join(upload_dir, src_path)
            full_workspace_path = os.path.join(work_dir, workspace_path)

            # Create the destination directory tree if needed
            full_workspace_dir = os.path.dirname(full_workspace_path)
            if not os.path.exists(full_workspace_dir):
                logger.info('Creating %s', full_workspace_dir)
                os.makedirs(full_workspace_dir, mode=0o755)

            self._copy_file(full_src_path, full_workspace_path)
            # Normalize permissions on the uploaded file
            os.chmod(full_workspace_path, 0o644)
    finally:
        nfs_umount(work_dir)
def move_files(self, work_dir, files_to_move):
    """See :meth:`storage.brokers.broker.Broker.move_files`

    :param work_dir: Absolute directory where the workspace is mounted
    :param files_to_move: List of (old_workspace_path, new_workspace_path) pairs, both relative
    """

    nfs_mount(self.mount, work_dir, False)
    try:
        for file_to_move in files_to_move:
            old_workspace_path = file_to_move[0]
            new_workspace_path = file_to_move[1]
            full_old_workspace_path = os.path.join(work_dir, old_workspace_path)
            full_new_workspace_path = os.path.join(work_dir, new_workspace_path)

            # Create the destination directory tree if needed
            full_new_workspace_dir = os.path.dirname(full_new_workspace_path)
            if not os.path.exists(full_new_workspace_dir):
                logger.info('Creating %s', full_new_workspace_dir)
                os.makedirs(full_new_workspace_dir, mode=0o755)

            logger.info('Moving %s to %s', full_old_workspace_path, full_new_workspace_path)
            shutil.move(full_old_workspace_path, full_new_workspace_path)
            # Normalize permissions on the moved file, consistent with the other
            # move_files implementation in this file
            os.chmod(full_new_workspace_path, 0o644)
    finally:
        nfs_umount(work_dir)
def setup_download_dir(self, download_dir, work_dir):
    """See :meth:`storage.brokers.broker.Broker.setup_download_dir`
    """

    # Downloads only read from the workspace, so mount read-only
    nfs_mount(self.mount, work_dir, True)
def perform_ingest(ingest_id, mount):
    """Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: long
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: str
    """

    job_exe_id = None
    upload_work_dir = None
    try:
        ingest = Ingest.objects.select_related().get(id=ingest_id)
        job_exe_id = JobExecution.objects.get_latest([ingest.job])[ingest.job.id].id
        ingest_work_dir = get_ingest_work_dir(job_exe_id)
        dup_path = os.path.join(ingest_work_dir, 'duplicate', ingest.file_name)
        ingest_path = os.path.join(ingest_work_dir, ingest.ingest_path)
        upload_work_dir = os.path.join(os.path.dirname(ingest_path), 'upload', str(ingest_id))
        if not os.path.exists(ingest_work_dir):
            logger.info('Creating %s', ingest_work_dir)
            os.makedirs(ingest_work_dir, mode=0o755)
        nfs_mount(mount, ingest_work_dir, read_only=False)

        # Check condition of the ingest
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return

        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path, ingest.workspace.name)
        try:
            src_file = SourceFile.objects.store_file(upload_work_dir, ingest_path, ingest.get_data_type_tags(),
                                                     ingest.workspace, ingest.file_path)

            # Atomically store file, mark INGESTED, and run ingest trigger rules
            with transaction.atomic():
                # TODO: It's possible that the file will be successfully moved into the workspace but this database
                # transaction might fail. This will result in a file that is in a workspace but doesn't have database
                # entries. Attempts to re-ingest will result in duplicate file errors.
                logger.info('Marking file as INGESTED: %i', ingest_id)
                ingest.source_file = src_file
                ingest.status = 'INGESTED'
                ingest.ingest_ended = timezone.now()
                ingest.save()
                logger.debug('Checking ingest trigger rules')
                for ingest_rule in get_ingest_rules():
                    ingest_rule.process_ingest(ingest, src_file.id)

            # Delete ingest file
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            ingest.status = 'DUPLICATE'
            ingest.save()
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            ingest.status = 'ERRORED'
            ingest.save()
            raise  # File remains where it is so it can be processed again
    finally:
        try:
            if upload_work_dir and os.path.exists(upload_work_dir):
                logger.info('Deleting %s', upload_work_dir)
                shutil.rmtree(upload_work_dir)
        except Exception:
            # Swallow exception so error from main try block isn't covered up
            # (was a bare except; narrowed so KeyboardInterrupt/SystemExit still propagate)
            logger.exception('Failed to delete upload work dir %s', upload_work_dir)
        if job_exe_id:
            # Guard cleanup so a failure here doesn't mask an error from the main try block
            try:
                cleanup_job_exe(job_exe_id)
            except Exception:
                logger.exception('Job Execution %i: Error cleaning up', job_exe_id)
def perform_ingest(ingest_id, mount):
    """Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: long
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: str
    """

    job_exe_id = None
    upload_work_dir = None
    try:
        # TODO: refactor to combine _get_ingest(), _get_job_exe_id(), and _set_ingesting_status() in one database
        # transaction with as few queries as possible, include retries
        ingest = _get_ingest(ingest_id)
        job_exe_id = _get_job_exe_id(ingest)
        create_job_exe_dir(job_exe_id)
        ingest_work_dir = get_ingest_work_dir(job_exe_id)
        dup_path = os.path.join(ingest_work_dir, 'duplicate', ingest.file_name)
        ingest_path = os.path.join(ingest_work_dir, ingest.ingest_path)
        upload_work_dir = os.path.join(os.path.dirname(ingest_path), 'upload', str(ingest_id))
        if not os.path.exists(ingest_work_dir):
            logger.info('Creating %s', ingest_work_dir)
            os.makedirs(ingest_work_dir, mode=0o755)
        nfs_mount(mount, ingest_work_dir, read_only=False)
        if not os.path.exists(upload_work_dir):
            logger.info('Creating %s', upload_work_dir)
            os.makedirs(upload_work_dir, mode=0o755)

        # Check condition of the ingest
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return

        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path, ingest.workspace.name)
        try:
            # TODO: future refactor: before copying file, grab existing source file (no lock) or create and save model
            # This guarantees that source file exists and can be used to check if file is duplicate
            # After this step, the source file should be marked as is_deleted so that it can't be used yet
            src_file = SourceFile.objects.store_file(upload_work_dir, ingest_path, ingest.get_data_type_tags(),
                                                     ingest.workspace, ingest.file_path)
            _complete_ingest(ingest, 'INGESTED', src_file)
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            # TODO: future refactor: pass source file model in so source files have duplicate ingests tied to them
            _complete_ingest(ingest, 'DUPLICATE', None)
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            # TODO: future refactor: pass source file model in so source files have errored ingests tied to them
            # TODO: change ERRORED to FAILED
            _complete_ingest(ingest, 'ERRORED', None)
            raise  # File remains where it is so it can be processed again
    finally:
        try:
            # Try to clean up the upload directory
            if upload_work_dir and os.path.exists(upload_work_dir):
                upload_dir = os.path.join(upload_work_dir, 'upload')
                workspace_work_dir = os.path.join(upload_work_dir, 'work')
                if os.path.exists(workspace_work_dir):
                    # NOTE(review): if _get_ingest() raised, `ingest` is unbound here; the
                    # resulting NameError is logged and swallowed below — consider guarding
                    ScaleFile.objects.cleanup_upload_dir(upload_dir, workspace_work_dir, ingest.workspace)
                    logger.info('Deleting %s', workspace_work_dir)
                    os.rmdir(workspace_work_dir)
                if os.path.exists(upload_dir):
                    logger.info('Deleting %s', upload_dir)
                    # Delete everything in upload dir
                    shutil.rmtree(upload_dir)
                logger.info('Deleting %s', upload_work_dir)
                os.rmdir(upload_work_dir)
        except Exception:
            # Swallow exception so error from main try block isn't covered up
            # (was a bare except; narrowed so KeyboardInterrupt/SystemExit still propagate)
            logger.exception('Failed to delete upload work dir %s', upload_work_dir)

        try:
            if job_exe_id:
                cleanup_job_exe(job_exe_id)
        except Exception:
            logger.exception('Job Execution %i: Error cleaning up', job_exe_id)
def setup_download_dir(self, download_dir, work_dir):
    """See :meth:`storage.brokers.broker.Broker.setup_download_dir`
    """

    # Mount the workspace read-only; downloads never write to it
    nfs_mount(self.mount, work_dir, True)