def _get_source_file(file_name):
    """Looks up the source file model for the given file name, building a new (un-saved) one if none exists

    A newly built model has its file name set and is flagged as deleted; it is not saved here.

    :param file_name: The name of the source file
    :type file_name: string
    :returns: The source file model
    :rtype: :class:`source.models.SourceFile`
    """

    try:
        return SourceFile.objects.get(file_name=file_name)
    except SourceFile.DoesNotExist:
        # No record with this name yet, so hand back a fresh un-saved model
        new_file = SourceFile()
        new_file.file_name = file_name
        new_file.is_deleted = True
        return new_file
def perform_ingest(ingest_id):
    """Performs the ingest for the given ingest ID

    Nothing is done when the ingest is already marked INGESTED or DUPLICATE, or when it is not in the INGESTING
    status after being started. Otherwise the source file is copied into the new workspace, moved to the new file
    path, or registered in place, depending on whether ``new_workspace`` / ``new_file_path`` are set on the ingest.
    If any exception occurs the ingest is completed as ERRORED and the exception is re-raised; on success it is
    completed as INGESTED.

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    """

    # NOTE(review): a second definition of perform_ingest appears later in this file and rebinds the name, so this
    # definition is shadowed at import time. One of the two should be removed.

    ingest = _get_ingest(ingest_id)
    file_name = ingest.file_name

    if ingest.status in ['INGESTED', 'DUPLICATE']:
        logger.warning('%s already marked %s, nothing to do', file_name, ingest.status)
        return

    _start_ingest(ingest)
    if ingest.status != 'INGESTING':
        return

    try:
        source_file = ingest.source_file
        if source_file.is_deleted:
            # Source file still marked as deleted, so we must copy/move/register the file
            source_file.set_basic_fields(file_name, ingest.file_size, ingest.media_type, ingest.get_data_type_tags())
            source_file.update_uuid(file_name)  # Add a stable identifier based on the file name
            source_file.workspace = ingest.workspace
            source_file.file_path = ingest.file_path
            source_file.is_deleted = False
            source_file.is_parsed = False
            source_file.deleted = None
            source_file.parsed = None

            if ingest.new_workspace:
                # We need a local path to copy the file, try to get a direct path from the broker, if that fails we
                # must download the file and copy from there
                # TODO: a future refactor should make the brokers work off of file objects instead of paths so the
                # extra download is not necessary
                paths = ingest.workspace.get_file_system_paths([source_file])
                if paths:
                    local_path = paths[0]
                else:
                    local_path = os.path.join('/tmp', file_name)
                    # Third argument matches the FileDownload call in the later definition of this function
                    # NOTE(review): the downloaded copy under /tmp is not removed here - confirm that is intended
                    file_download = FileDownload(source_file, local_path, False)
                    ScaleFile.objects.download_files([file_download])
                # The download above (if any) used the old path; from here on the model points at the destination
                source_file.file_path = ingest.new_file_path if ingest.new_file_path else ingest.file_path
                logger.info('Copying %s in workspace %s to %s in workspace %s', ingest.file_path,
                            ingest.workspace.name, source_file.file_path, ingest.new_workspace.name)
                file_upload = FileUpload(source_file, local_path)
                ScaleFile.objects.upload_files(ingest.new_workspace, [file_upload])
            elif ingest.new_file_path:
                logger.info('Moving %s to %s in workspace %s', ingest.file_path, ingest.new_file_path,
                            ingest.workspace.name)
                file_move = FileMove(source_file, ingest.new_file_path)
                ScaleFile.objects.move_files([file_move])
            else:
                logger.info('Registering %s in workspace %s', ingest.file_path, ingest.workspace.name)
                _save_source_file(source_file)

        if ingest.new_workspace:
            # Copied file to new workspace, so delete file in old workspace (if workspace provides local path to do
            # so)
            file_with_old_path = SourceFile()
            file_with_old_path.file_name = file_name
            file_with_old_path.file_path = ingest.file_path
            paths = ingest.workspace.get_file_system_paths([file_with_old_path])
            if paths:
                _delete_file(paths[0])

    except Exception:
        # Record the failure on the ingest, then let the caller see the original exception
        _complete_ingest(ingest, 'ERRORED')
        raise

    _complete_ingest(ingest, 'INGESTED')
    logger.info('Ingest successful for %s', file_name)
def perform_ingest(ingest_id):
    """Carries out the ingest identified by the given ID

    Returns early when the ingest is already marked INGESTED or DUPLICATE, or when it is not INGESTING after being
    started. Otherwise the source file is copied to the new workspace, moved to the new path, or registered where
    it is. A failure completes the ingest as ERRORED and re-raises; success completes it as INGESTED.

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    """

    ingest = _get_ingest(ingest_id)
    file_name = ingest.file_name

    if ingest.status in ['INGESTED', 'DUPLICATE']:
        logger.warning('%s already marked %s, nothing to do', file_name, ingest.status)
        return

    _start_ingest(ingest)
    if ingest.status != 'INGESTING':
        return

    try:
        src = ingest.source_file
        if src.is_deleted:
            # Still flagged as deleted, meaning the copy/move/register step has not been done yet
            src.set_basic_fields(file_name, ingest.file_size, ingest.media_type, ingest.get_data_type_tags())
            # Stable identifier derived from the file name
            src.update_uuid(file_name)
            src.workspace = ingest.workspace
            src.file_path = ingest.file_path
            src.is_deleted = False
            src.is_parsed = False
            src.deleted = None
            src.parsed = None

            if ingest.new_workspace:
                # Copying needs a local path: prefer a direct path from the broker and fall back to downloading
                # TODO: a future refactor should make the brokers work off of file objects instead of paths so the
                # extra download is not necessary
                direct_paths = ingest.workspace.get_file_system_paths([src])
                if not direct_paths:
                    local_path = os.path.join('/tmp', file_name)
                    download = FileDownload(src, local_path, False)
                    ScaleFile.objects.download_files([download])
                else:
                    local_path = direct_paths[0]

                # Point the model at its destination path before uploading
                if ingest.new_file_path:
                    src.file_path = ingest.new_file_path
                else:
                    src.file_path = ingest.file_path
                logger.info('Copying %s in workspace %s to %s in workspace %s', ingest.file_path,
                            ingest.workspace.name, src.file_path, ingest.new_workspace.name)
                upload = FileUpload(src, local_path)
                ScaleFile.objects.upload_files(ingest.new_workspace, [upload])
            elif ingest.new_file_path:
                logger.info('Moving %s to %s in workspace %s', ingest.file_path, ingest.new_file_path,
                            ingest.workspace.name)
                move = FileMove(src, ingest.new_file_path)
                ScaleFile.objects.move_files([move])
            else:
                logger.info('Registering %s in workspace %s', ingest.file_path, ingest.workspace.name)
                _save_source_file(src)

        if ingest.new_workspace:
            # The file now lives in the new workspace; remove the old copy when the old workspace exposes a local
            # path for it
            stale_file = SourceFile()
            stale_file.file_name = file_name
            stale_file.file_path = ingest.file_path
            stale_paths = ingest.workspace.get_file_system_paths([stale_file])
            if stale_paths:
                _delete_file(stale_paths[0])
    except Exception:
        _complete_ingest(ingest, 'ERRORED')
        raise
    else:
        _complete_ingest(ingest, 'INGESTED')
        logger.info('Ingest successful for %s', file_name)