def test_success(self, mock_getsize):
    """Tests calling ScaleFileManager.upload_files() successfully"""

    def new_getsize(path):
        return 100
    mock_getsize.side_effect = new_getsize

    workspace = storage_test_utils.create_workspace()

    file_1 = ScaleFile()
    file_1.set_basic_fields('file.txt', 100, None)  # Scale should auto-detect text/plain
    remote_path_1 = 'my/remote/path/file.txt'
    local_path_1 = 'my/local/path/file.txt'
    file_1.file_path = remote_path_1

    file_2 = ScaleFile()
    file_2.set_basic_fields('file.json', 100, 'application/json')
    remote_path_2 = 'my/remote/path/2/file.json'
    local_path_2 = 'my/local/path/2/file.json'
    file_2.file_path = remote_path_2

    workspace.upload_files = MagicMock()

    files = [FileUpload(file_1, local_path_1), FileUpload(file_2, local_path_2)]
    models = ScaleFile.objects.upload_files(workspace, files)

    workspace.upload_files.assert_called_once_with([FileUpload(file_1, local_path_1),
                                                    FileUpload(file_2, local_path_2)])

    self.assertEqual('file.txt', models[0].file_name)
    self.assertEqual(remote_path_1, models[0].file_path)
    self.assertEqual('text/plain', models[0].media_type)
    self.assertEqual(workspace.id, models[0].workspace_id)
    self.assertEqual('file.json', models[1].file_name)
    self.assertEqual(remote_path_2, models[1].file_path)
    self.assertEqual('application/json', models[1].media_type)
    self.assertEqual(workspace.id, models[1].workspace_id)
def test_successfully(self, mock_copy, mock_chmod, mock_exists, mock_makedirs):
    """Tests calling HostBroker.upload_files() successfully"""

    def new_exists(path):
        return False
    mock_exists.side_effect = new_exists

    volume_path = os.path.join('the', 'volume', 'path')
    file_name_1 = 'my_file.txt'
    file_name_2 = 'my_file.json'
    local_path_file_1 = os.path.join('my_dir_1', file_name_1)
    local_path_file_2 = os.path.join('my_dir_2', file_name_2)
    workspace_path_file_1 = os.path.join('my_wrk_dir_1', file_name_1)
    workspace_path_file_2 = os.path.join('my_wrk_dir_2', file_name_2)
    full_workspace_path_file_1 = os.path.join(volume_path, workspace_path_file_1)
    full_workspace_path_file_2 = os.path.join(volume_path, workspace_path_file_2)

    file_1 = storage_test_utils.create_file(file_path=workspace_path_file_1)
    file_2 = storage_test_utils.create_file(file_path=workspace_path_file_2)
    file_1_up = FileUpload(file_1, local_path_file_1)
    file_2_up = FileUpload(file_2, local_path_file_2)

    # Call method to test
    self.broker.upload_files(volume_path, [file_1_up, file_2_up])

    # Check results
    two_calls = [call(os.path.dirname(full_workspace_path_file_1), mode=0755),
                 call(os.path.dirname(full_workspace_path_file_2), mode=0755)]
    mock_makedirs.assert_has_calls(two_calls)
    two_calls = [call(local_path_file_1, full_workspace_path_file_1),
                 call(local_path_file_2, full_workspace_path_file_2)]
    mock_copy.assert_has_calls(two_calls)
    two_calls = [call(full_workspace_path_file_1, 0644),
                 call(full_workspace_path_file_2, 0644)]
    mock_chmod.assert_has_calls(two_calls)
def test_upload_files(self, mock_client_class):
    """Tests uploading files successfully"""

    s3_object_1 = MagicMock()
    s3_object_2 = MagicMock()
    mock_client = MagicMock(S3Client)
    mock_client.get_object.side_effect = [s3_object_1, s3_object_2]
    mock_client_class.return_value.__enter__ = Mock(return_value=mock_client)

    file_name_1 = 'my_file.txt'
    file_name_2 = 'my_file.json'
    local_path_file_1 = os.path.join('my_dir_1', file_name_1)
    local_path_file_2 = os.path.join('my_dir_2', file_name_2)
    workspace_path_file_1 = os.path.join('my_wrk_dir_1', file_name_1)
    workspace_path_file_2 = os.path.join('my_wrk_dir_2', file_name_2)

    file_1 = storage_test_utils.create_file(file_path=workspace_path_file_1, media_type='text/plain')
    file_2 = storage_test_utils.create_file(file_path=workspace_path_file_2, media_type='application/json')
    file_1_up = FileUpload(file_1, local_path_file_1)
    file_2_up = FileUpload(file_2, local_path_file_2)

    # Call method to test
    mo = mock_open()
    with patch('__builtin__.open', mo, create=True):
        self.broker.upload_files(None, [file_1_up, file_2_up])

    # Check results
    self.assertTrue(s3_object_1.upload_file.called)
    self.assertTrue(s3_object_2.upload_file.called)
    self.assertEqual(s3_object_1.upload_file.call_args[0][1]['ContentType'], 'text/plain')
    self.assertEqual(s3_object_2.upload_file.call_args[0][1]['ContentType'], 'application/json')
def handle(self, *args, **options):
    """See :meth:`django.core.management.base.BaseCommand.handle`.

    This method starts the file upload process.
    """

    file_id = options.get('file_id')
    local_path = options.get('local_path')  # option name assumed; the original referenced local_path without assigning it
    remote_path = options.get('remote_path')
    workspace_name = options.get('workspace')

    logger.info('Command starting: scale_upload_file')
    logger.info(' - Workspace: %s', workspace_name)

    # Validate the file paths
    file_name = os.path.basename(local_path)
    if not os.path.exists(local_path):
        logger.error('Local file does not exist: %s', local_path)
        sys.exit(1)

    # Attempt to fetch the workspace model
    try:
        workspace = Workspace.objects.get(name=workspace_name)
    except Workspace.DoesNotExist:
        logger.exception('Workspace does not exist: %s', workspace_name)
        sys.exit(1)

    # Attempt to set up a file model
    try:
        scale_file = ScaleFile.objects.get(file_name=file_name)
    except ScaleFile.DoesNotExist:
        scale_file = ScaleFile()
        scale_file.update_uuid(file_name)
    scale_file.file_path = remote_path

    try:
        ScaleFile.objects.upload_files(workspace, [FileUpload(scale_file, local_path)])
    except Exception:
        logger.exception('Unknown error occurred, exit code 1 returning')
        sys.exit(1)

    logger.info('Command completed: scale_upload_file')
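# Illustrative only: a minimal sketch of invoking the command above from code via Django's
# call_command. The option names ('local_path', 'remote_path', 'workspace') mirror the values read in
# handle() but are assumptions; check the command's add_arguments() for the actual flag names.
from django.core.management import call_command

call_command('scale_upload_file',
             local_path='/tmp/my_file.txt',                 # assumed option name for the local file
             remote_path='my/remote/path/my_file.txt',
             workspace='my-workspace')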
def store_file(self, local_path, data_types, workspace, remote_path):
    """Stores the given local source file in the workspace

    :param local_path: The absolute local path of the source file to store
    :type local_path: str
    :param data_types: The data type tags of the source file
    :type data_types: list of str
    :param workspace: The workspace to use for storing the source file
    :type workspace: :class:`storage.models.Workspace`
    :param remote_path: The relative path for storing the source file
    :type remote_path: str
    :returns: The model of the saved source file
    :rtype: :class:`source.models.SourceFile`
    """

    file_name = os.path.basename(local_path)

    # Check for a duplicate file, else create a new file
    # TODO: fix race condition with many files with the same name?
    try:
        src_file = SourceFile.objects.get(file_name=file_name)
        # Duplicate files that are deleted should be stored again
        if not src_file.is_deleted:
            raise DuplicateFile('\'%s\' already exists' % file_name)
    except SourceFile.DoesNotExist:
        src_file = SourceFile()  # New file

    # Add a stable identifier based on the file name
    src_file.update_uuid(file_name)

    # Add tags and store the new/updated source file
    for tag in data_types:
        src_file.add_data_type_tag(tag)
    src_file.file_path = remote_path

    return ScaleFile.objects.upload_files(workspace, [FileUpload(src_file, local_path)])[0]
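# Illustrative only: a minimal sketch of calling store_file() above, assuming 'manager' is whatever
# object defines it (e.g. the source file manager) and that the workspace name and paths shown exist.
workspace = Workspace.objects.get(name='raw-ingest')                # assumed workspace name
src_file = manager.store_file('/tmp/landsat_scene.tif',             # absolute local path
                              ['landsat', 'geotiff'],               # data type tags
                              workspace,
                              'landsat/2017/01/landsat_scene.tif')  # relative remote path
logger.info('Stored source file %d at %s', src_file.id, src_file.file_path)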
def upload_files(self, file_entries, input_file_ids, job_exe, workspace):
    """Uploads the given local product files into the workspace.

    :param file_entries: List of files to upload
    :type file_entries: list[:class:`product.types.ProductFileMetadata`]
    :param input_file_ids: List of identifiers for files used to produce the given file entries
    :type input_file_ids: list of int
    :param job_exe: The job_exe model with the related job and job_type fields
    :type job_exe: :class:`job.models.JobExecution`
    :param workspace: The workspace to use for storing the product files
    :type workspace: :class:`storage.models.Workspace`
    :returns: The list of the saved product models
    :rtype: list of :class:`storage.models.ScaleFile`
    """

    # Build a list of UUIDs for the input files
    input_files = ScaleFile.objects.filter(pk__in=input_file_ids).values('uuid', 'id').order_by('uuid')
    input_file_uuids = [f['uuid'] for f in input_files]

    # Get property names and values as strings
    properties = job_exe.job.get_job_data().get_all_properties()

    # Product UUID will be based in part on input data (UUIDs of input files and name/value pairs of input
    # properties)
    input_strings = input_file_uuids
    input_strings.extend(properties)

    # Determine if any input files are non-operational products
    input_products = ScaleFile.objects.filter(id__in=[f['id'] for f in input_files], file_type='PRODUCT')
    input_products_operational = all([f.is_operational for f in input_products])

    source_started = job_exe.job.source_started
    source_ended = job_exe.job.source_ended
    source_sensor_class = job_exe.job.source_sensor_class
    source_sensor = job_exe.job.source_sensor
    source_collection = job_exe.job.source_collection
    source_task = job_exe.job.source_task
    if not source_started:
        # Compute the overall start and stop times for all file_entries
        source_files = FileAncestryLink.objects.get_source_ancestors([f['id'] for f in input_files])
        start_times = [f.data_started for f in source_files]
        end_times = [f.data_ended for f in source_files]
        start_times.sort()
        end_times.sort(reverse=True)
        if start_times:
            source_started = start_times[0]
        if end_times:
            source_ended = end_times[0]

    products_to_save = []
    for entry in file_entries:
        product = ProductFile.create()
        product.job_exe = job_exe
        product.job = job_exe.job
        product.job_type = job_exe.job.job_type
        product.is_operational = input_products_operational and job_exe.job.job_type.is_operational
        file_name = os.path.basename(entry.local_path)
        file_size = os.path.getsize(entry.local_path)
        product.set_basic_fields(file_name, file_size, entry.media_type)
        product.file_path = entry.remote_path
        product.job_output = entry.output_name

        # Add a stable identifier based on the job type, input files, input properties, and file name
        # This is designed to remain stable across re-processing the same type of job on the same inputs
        product.update_uuid(job_exe.job.job_type.id, file_name, *input_strings)

        # Add temporal info to product if available
        if entry.data_start:
            product.data_started = parse_datetime(entry.data_start)
        if entry.data_end:
            product.data_ended = parse_datetime(entry.data_end)

        if entry.geojson:
            geom, props = geo_utils.parse_geo_json(entry.geojson)
            product.geometry = geom
            if props:
                product.meta_data = props
            product.center_point = geo_utils.get_center_point(geom)

        # Add recipe info to product if available
        job_recipe = Recipe.objects.get_recipe_for_job(job_exe.job_id)
        if job_recipe:
            product.recipe_id = job_recipe.recipe.id
            product.recipe_type = job_recipe.recipe.recipe_type
            product.recipe_node = job_recipe.node_name

        # Add batch info to product if available
        try:
            from batch.models import BatchJob
            product.batch_id = BatchJob.objects.get(job_id=job_exe.job_id).batch_id
        except BatchJob.DoesNotExist:
            product.batch_id = None

        # Allow override, if set via side-car metadata, otherwise take the derived values from above
        product.source_started = entry.source_started if entry.source_started else source_started
        product.source_ended = entry.source_ended if entry.source_ended else source_ended

        # Supplemental source metadata
        product.source_sensor_class = entry.source_sensor_class if entry.source_sensor_class else source_sensor_class
        product.source_sensor = entry.source_sensor if entry.source_sensor else source_sensor
        product.source_collection = entry.source_collection if entry.source_collection else source_collection
        product.source_task = entry.source_task if entry.source_task else source_task

        # Update product model with details derived from the job_type
        product.meta_data['url'] = product.url
        product.meta_data['job_name'] = job_exe.job_type.name
        product.meta_data['job_version'] = job_exe.job_type.get_job_version()
        product.meta_data['package_version'] = job_exe.job_type.get_package_version()

        products_to_save.append(FileUpload(product, entry.local_path))

    return ScaleFile.objects.upload_files(workspace, products_to_save)
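# Illustrative only: a sketch of how a caller might build file_entries for the ProductFileMetadata-based
# upload_files() above. The constructor arguments shown are assumptions based on the attributes the
# method reads (output_name, local_path, media_type, remote_path, data_start/data_end, geojson), and
# 'input_file_ids', 'job_exe', and 'workspace' are placeholders; the real signature may differ.
entry = ProductFileMetadata(output_name='OUTPUT_IMAGE',
                            local_path='/scale/output/image.tif',
                            media_type='image/tiff',
                            remote_path='products/image.tif')
entry.data_start = '2017-01-01T00:00:00Z'   # optional temporal metadata
entry.data_end = '2017-01-01T00:10:00Z'
products = ProductFile.objects.upload_files([entry], input_file_ids, job_exe, workspace)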
def test_successfully(self, mock_copy, mock_chmod, mock_exists, mock_makedirs):
    """Tests calling NfsBroker.upload_files() successfully"""

    def new_exists(path):
        return False
    mock_exists.side_effect = new_exists

    volume_path = os.path.join('the', 'volume', 'path')
    file_name_1 = 'my_file.txt'
    file_name_2 = 'my_file.json'
    local_path_file_1 = os.path.join('my_dir_1', file_name_1)
    local_path_file_2 = os.path.join('my_dir_2', file_name_2)
    workspace_path_file_1 = os.path.join('my_wrk_dir_1', file_name_1)
    workspace_path_file_2 = os.path.join('my_wrk_dir_2', file_name_2)
    full_workspace_path_file_1 = os.path.join(volume_path, workspace_path_file_1)
    full_workspace_path_file_2 = os.path.join(volume_path, workspace_path_file_2)

    file_1 = storage_test_utils.create_file(file_path=workspace_path_file_1)
    file_2 = storage_test_utils.create_file(file_path=workspace_path_file_2)
    file_1_up = FileUpload(file_1, local_path_file_1)
    file_2_up = FileUpload(file_2, local_path_file_2)

    # Call method to test
    mountstats_data = """16 36 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
17 36 0:16 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw
18 36 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=32977500k,nr_inodes=8244375,mode=755
19 17 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
20 18 0:17 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw
21 18 0:11 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000
22 36 0:18 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,mode=755
23 17 0:19 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:8 - tmpfs tmpfs rw,mode=755
24 23 0:20 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
25 17 0:21 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:19 - pstore pstore rw
26 23 0:22 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpuset
27 23 0:23 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpu,cpuacct
28 23 0:24 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory
29 23 0:25 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices
30 23 0:26 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,freezer
31 23 0:27 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,net_cls,net_prio
32 23 0:28 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,blkio
33 23 0:29 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,perf_event
34 23 0:30 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,hugetlb
35 17 0:31 / /sys/kernel/config rw,relatime shared:20 - configfs configfs rw
36 0 253:0 / / rw,relatime shared:1 - xfs /dev/mapper/vg_root-lv_root rw,attr2,inode64,noquota
14 36 0:14 / /users rw,relatime shared:22 - autofs systemd-1 rw,fd=29,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
39 16 0:34 / /proc/sys/fs/binfmt_misc rw,relatime shared:25 - autofs systemd-1 rw,fd=37,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
41 18 0:13 / /dev/mqueue rw,relatime shared:26 - mqueue mqueue rw
40 17 0:6 / /sys/kernel/debug rw,relatime shared:27 - debugfs debugfs rw
42 18 0:35 / /dev/hugepages rw,relatime shared:28 - hugetlbfs hugetlbfs rw
43 36 0:36 / /var/lib/nfs/rpc_pipefs rw,relatime shared:29 - rpc_pipefs sunrpc rw
44 16 0:37 / /proc/fs/nfsd rw,relatime shared:30 - nfsd nfsd rw
45 36 8:2 / /boot rw,relatime shared:31 - xfs /dev/sda2 rw,attr2,inode64,noquota
46 14 0:40 / /users rw,relatime shared:32 - nfs4 users:/users rw,vers=4.0,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=14,retrans=2,sec=sys,local_lock=none
49 39 0:38 / /proc/sys/fs/binfmt_misc rw,relatime shared:35 - binfmt_misc binfmt_misc rw
48 38 0:42 / %s rw,relatime shared:34 - nfs4 fserver:/exports/my_dir_1 rw,vers=4.0,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=14,retrans=2,sec=sys,local_lock=none
""" % (os.path.abspath(volume_path),)

    mo = mock_open(read_data=mountstats_data)
    # need to patch readlines() since only read() is patched in mock_open
    mo.return_value.readlines.return_value = mo.return_value.read.return_value.split('\n')
    with patch('__builtin__.open', mo, create=True) as pmo:
        self.broker.upload_files(volume_path, [file_1_up, file_2_up])

    # Check results
    two_calls = [call(os.path.dirname(full_workspace_path_file_1), mode=0755),
                 call(os.path.dirname(full_workspace_path_file_2), mode=0755)]
    mock_makedirs.assert_has_calls(two_calls)
    two_calls = [call(local_path_file_1, full_workspace_path_file_1),
                 call(local_path_file_2, full_workspace_path_file_2)]
    mock_copy.assert_has_calls(two_calls)
    two_calls = [call(full_workspace_path_file_1, 0644),
                 call(full_workspace_path_file_2, 0644)]
    mock_chmod.assert_has_calls(two_calls)
def upload_files(self, file_entries, input_file_ids, job_exe, workspace):
    """Uploads the given local product files into the workspace.

    :param file_entries: List of files where each file is a tuple of (absolute local path,
        workspace path for storing the file, media_type, output_name)
    :type file_entries: list of tuple(str, str, str, str)
    :param input_file_ids: List of identifiers for files used to produce the given file entries
    :type input_file_ids: list of int
    :param job_exe: The job_exe model with the related job and job_type fields
    :type job_exe: :class:`job.models.JobExecution`
    :param workspace: The workspace to use for storing the product files
    :type workspace: :class:`storage.models.Workspace`
    :returns: The list of the saved product models
    :rtype: list of :class:`storage.models.ScaleFile`
    """

    # Build a list of UUIDs for the input files
    input_files = ScaleFile.objects.filter(pk__in=input_file_ids).values('uuid', 'id').order_by('uuid')
    input_file_uuids = [f['uuid'] for f in input_files]

    # Get property names and values as strings
    properties = job_exe.job.get_job_data().get_all_properties()

    # Product UUID will be based in part on input data (UUIDs of input files and name/value pairs of input
    # properties)
    input_strings = input_file_uuids
    input_strings.extend(properties)

    # Determine if any input files are non-operational products
    input_products = ScaleFile.objects.filter(id__in=[f['id'] for f in input_files], file_type='PRODUCT')
    input_products_operational = all([f.is_operational for f in input_products])

    # Compute the overall start and stop times for all file_entries
    source_files = FileAncestryLink.objects.get_source_ancestors([f['id'] for f in input_files])
    start_times = [f.data_started for f in source_files]
    end_times = [f.data_ended for f in source_files]
    start_times.sort()
    end_times.sort(reverse=True)

    products_to_save = []
    for entry in file_entries:
        local_path = entry[0]
        remote_path = entry[1]
        media_type = entry[2]
        output_name = entry[3]

        product = ProductFile.create()
        product.job_exe = job_exe
        product.job = job_exe.job
        product.job_type = job_exe.job.job_type
        product.is_operational = input_products_operational and job_exe.job.job_type.is_operational
        file_name = os.path.basename(local_path)
        file_size = os.path.getsize(local_path)
        product.set_basic_fields(file_name, file_size, media_type)
        product.file_path = remote_path
        product.job_output = output_name

        # Add a stable identifier based on the job type, input files, input properties, and file name
        # This is designed to remain stable across re-processing the same type of job on the same inputs
        product.update_uuid(job_exe.job.job_type.id, file_name, *input_strings)

        # Add geospatial info to product if available
        if len(entry) > 4:
            geo_metadata = entry[4]
            target_date = None
            if 'data_started' in geo_metadata:
                product.data_started = parse_datetime(geo_metadata['data_started'])
                target_date = product.data_started
            if 'data_ended' in geo_metadata:
                product.data_ended = parse_datetime(geo_metadata['data_ended'])
                if target_date is None:
                    target_date = product.data_ended
            if target_date is None:
                target_date = product.created
            if 'geo_json' in geo_metadata:
                geom, props = geo_utils.parse_geo_json(geo_metadata['geo_json'])
                product.geometry = geom
                if props:
                    product.meta_data = props
                product.center_point = geo_utils.get_center_point(geom)

        # Add recipe info to product if available
        job_recipe = Recipe.objects.get_recipe_for_job(job_exe.job_id)
        if job_recipe:
            product.recipe_id = job_recipe.recipe.id
            product.recipe_type = job_recipe.recipe.recipe_type
            product.recipe_job = job_recipe.job_name

        # Add batch info to product if available
        try:
            from batch.models import BatchJob
            product.batch_id = BatchJob.objects.get(job_id=job_exe.job_id).batch_id
        except BatchJob.DoesNotExist:
            product.batch_id = None

        # Add start and stop times if available
        if start_times:
            product.source_started = start_times[0]
        if end_times:
            product.source_ended = end_times[0]

        products_to_save.append(FileUpload(product, local_path))

    return ScaleFile.objects.upload_files(workspace, products_to_save)
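# Illustrative only: a sketch of the tuple-based file_entries accepted by the upload_files() variant
# above -- (local path, workspace path, media type, output name) plus an optional fifth geo_metadata
# dict with 'data_started'/'data_ended'/'geo_json' keys. The paths, names, and the manager reference
# ('ProductFile.objects') are placeholders, and 'input_file_ids', 'job_exe', and 'workspace' are
# assumed to already exist.
geo_metadata = {
    'data_started': '2017-01-01T00:00:00Z',
    'data_ended': '2017-01-01T00:10:00Z',
    'geo_json': {'type': 'Point', 'coordinates': [44.0, 33.0]},
}
file_entries = [
    ('/scale/output/image.tif', 'products/image.tif', 'image/tiff', 'OUTPUT_IMAGE', geo_metadata),
    ('/scale/output/report.json', 'products/report.json', 'application/json', 'OUTPUT_REPORT'),
]
products = ProductFile.objects.upload_files(file_entries, input_file_ids, job_exe, workspace)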
def move_files(file_ids, new_workspace=None, new_file_path=None):
    """Moves the given files to a different workspace/uri

    :param file_ids: List of ids of ScaleFile objects to move; should all be from the same workspace
    :type file_ids: [int]
    :param new_workspace: New workspace to move files to
    :type new_workspace: `storage.models.Workspace`
    :param new_file_path: New path for files
    :type new_file_path: string
    """

    try:
        messages = []
        files = ScaleFile.objects.all()
        files = files.select_related('workspace')
        files = files.defer('workspace__json_config')
        files = files.filter(id__in=file_ids).only('id', 'file_name', 'file_path', 'workspace')
        old_files = []
        old_workspace = files[0].workspace

        if new_workspace:
            # We need a local path to copy the file; try to get a direct path from the broker. If that fails, we
            # must download the file and copy from there.
            # TODO: a future refactor should make the brokers work off of file objects instead of paths so the extra
            # download is not necessary
            paths = old_workspace.get_file_system_paths(files)
            local_paths = []
            if paths:
                local_paths = paths
            else:
                file_downloads = []
                for file in files:
                    local_path = os.path.join('/tmp', file.file_name)
                    file_downloads.append(FileDownload(file, local_path, False))
                    local_paths.append(local_path)
                ScaleFile.objects.download_files(file_downloads)

            uploads = []
            for file, path in zip(files, local_paths):
                old_path = file.file_path
                old_files.append(ScaleFile(file_name=file.file_name, file_path=file.file_path))
                file.file_path = new_file_path if new_file_path else file.file_path
                logger.info('Copying %s in workspace %s to %s in workspace %s', old_path, file.workspace.name,
                            file.file_path, new_workspace.name)
                file_upload = FileUpload(file, path)
                uploads.append(file_upload)
                message = create_move_file_message(file_id=file.id)
                messages.append(message)
            ScaleFile.objects.upload_files(new_workspace, uploads)
        elif new_file_path:
            moves = []
            for file in files:
                logger.info('Moving %s to %s in workspace %s', file.file_path, new_file_path, file.workspace.name)
                moves.append(FileMove(file, new_file_path))
                message = create_move_file_message(file_id=file.id)
                messages.append(message)
            ScaleFile.objects.move_files(moves)
        else:
            logger.info('No new workspace or file path. Doing nothing')

        CommandMessageManager().send_messages(messages)

        if new_workspace:
            # Copied files to the new workspace, so delete the files in the old workspace (if the workspace provides
            # a local path to do so)
            old_workspace.delete_files(old_files, update_model=False)
    except ScaleError as err:
        err.log()
        sys.exit(err.exit_code)
    except Exception as ex:
        exit_code = GENERAL_FAIL_EXIT_CODE
        err = get_error_by_exception(ex.__class__.__name__)
        if err:
            err.log()
            exit_code = err.exit_code
        else:
            logger.exception('Error performing move_files steps')
        sys.exit(exit_code)
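# Illustrative only: a minimal sketch of calling move_files() above. The workspace name and file IDs
# are placeholders. Passing new_workspace copies the files and deletes them from the old workspace,
# while passing only new_file_path moves them within their current workspace.
target_workspace = Workspace.objects.get(name='archive')  # assumed workspace name
move_files(file_ids=[1234, 1235],
           new_workspace=target_workspace,
           new_file_path='archive/2017/01')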
def perform_ingest(ingest_id):
    """Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    """

    ingest = _get_ingest(ingest_id)
    file_name = ingest.file_name

    if ingest.status in ['INGESTED', 'DUPLICATE']:
        logger.warning('%s already marked %s, nothing to do', file_name, ingest.status)
        return

    _start_ingest(ingest)
    if ingest.status != 'INGESTING':
        return

    try:
        source_file = ingest.source_file
        if source_file.is_deleted:
            # Source file still marked as deleted, so we must copy/move/register the file
            source_file.set_basic_fields(file_name, ingest.file_size, ingest.media_type,
                                         ingest.get_data_type_tags())
            source_file.update_uuid(file_name)  # Add a stable identifier based on the file name
            source_file.workspace = ingest.workspace
            source_file.file_path = ingest.file_path
            source_file.is_deleted = False
            source_file.is_parsed = False
            source_file.deleted = None
            source_file.parsed = None

        if ingest.new_workspace:
            # We need a local path to copy the file; try to get a direct path from the broker. If that fails, we
            # must download the file and copy from there.
            # TODO: a future refactor should make the brokers work off of file objects instead of paths so the extra
            # download is not necessary
            paths = ingest.workspace.get_file_system_paths([source_file])
            if paths:
                local_path = paths[0]
            else:
                local_path = os.path.join('/tmp', file_name)
                file_download = FileDownload(source_file, local_path, False)
                ScaleFile.objects.download_files([file_download])

            source_file.file_path = ingest.new_file_path if ingest.new_file_path else ingest.file_path
            logger.info('Copying %s in workspace %s to %s in workspace %s', ingest.file_path,
                        ingest.workspace.name, source_file.file_path, ingest.new_workspace.name)
            file_upload = FileUpload(source_file, local_path)
            ScaleFile.objects.upload_files(ingest.new_workspace, [file_upload])
        elif ingest.new_file_path:
            logger.info('Moving %s to %s in workspace %s', ingest.file_path, ingest.new_file_path,
                        ingest.workspace.name)
            file_move = FileMove(source_file, ingest.new_file_path)
            ScaleFile.objects.move_files([file_move])
        else:
            logger.info('Registering %s in workspace %s', ingest.file_path, ingest.workspace.name)

        _save_source_file(source_file)

        if ingest.new_workspace:
            # Copied file to new workspace, so delete the file in the old workspace (if the workspace provides a
            # local path to do so)
            file_with_old_path = SourceFile.create()
            file_with_old_path.file_name = file_name
            file_with_old_path.file_path = ingest.file_path
            paths = ingest.workspace.get_file_system_paths([file_with_old_path])
            if paths:
                _delete_file(paths[0])
    except Exception:
        _complete_ingest(ingest, 'ERRORED')
        raise

    _complete_ingest(ingest, 'INGESTED')
    logger.info('Ingest successful for %s', file_name)
def upload_files(self, file_entries, input_file_ids, job_exe, workspace):
    """Uploads the given local product files into the workspace.

    :param file_entries: List of files where each file is a tuple of (absolute local path,
        workspace path for storing the file, media_type)
    :type file_entries: list of tuple(str, str, str)
    :param input_file_ids: List of identifiers for files used to produce the given file entries
    :type input_file_ids: list of int
    :param job_exe: The job_exe model with the related job and job_type fields
    :type job_exe: :class:`job.models.JobExecution`
    :param workspace: The workspace to use for storing the product files
    :type workspace: :class:`storage.models.Workspace`
    :returns: The list of the saved product models
    :rtype: list of :class:`product.models.ProductFile`
    """

    # Build a list of UUIDs for the input files
    input_files = ScaleFile.objects.filter(pk__in=input_file_ids).values('uuid', 'id').order_by('uuid')
    input_file_uuids = [f['uuid'] for f in input_files]

    # Determine if any input files are non-operational products
    input_products = ProductFile.objects.filter(file__in=[f['id'] for f in input_files])
    input_products_operational = all([f.is_operational for f in input_products])

    products_to_save = []
    for entry in file_entries:
        local_path = entry[0]
        remote_path = entry[1]
        media_type = entry[2]

        product = ProductFile()
        product.job_exe = job_exe
        product.job = job_exe.job
        product.job_type = job_exe.job.job_type
        product.is_operational = input_products_operational and job_exe.job.job_type.is_operational
        product.media_type = media_type
        product.file_path = remote_path

        # Add a stable identifier based on the job type, input files, and file name
        # This is designed to remain stable across re-processing the same type of job on the same inputs
        file_name = os.path.basename(local_path)
        product.update_uuid(job_exe.job.job_type.id, file_name, *input_file_uuids)

        # Add geospatial info to product if available
        if len(entry) > 3:
            geo_metadata = entry[3]
            target_date = None
            if 'data_started' in geo_metadata:
                product.data_started = parse_datetime(geo_metadata['data_started'])
                target_date = product.data_started
            if 'data_ended' in geo_metadata:
                product.data_ended = parse_datetime(geo_metadata['data_ended'])
                if target_date is None:
                    target_date = product.data_ended
            if target_date is None:
                target_date = product.created
            if 'geo_json' in geo_metadata:
                geom, props = geo_utils.parse_geo_json(geo_metadata['geo_json'])
                product.geometry = geom
                product.meta_data = props
                product.center_point = geo_utils.get_center_point(geom)

        products_to_save.append(FileUpload(product, local_path))

    return ScaleFile.objects.upload_files(workspace, products_to_save)
def _generate_input_metadata(self, job_exe):
    """Generates the input metadata file for the job execution

    :param job_exe: The job execution model with related job and job_type fields
    :type job_exe: :class:`job.models.JobExecution`
    """

    job_interface = job_exe.job_type.get_job_interface()
    if not job_interface.needs_input_metadata():
        return

    # Generate input metadata dict
    input_metadata = {}
    config = job_exe.get_execution_configuration()
    if 'input_files' in config.get_dict():
        input_metadata['JOB'] = {}
        input_data = job_exe.job.get_input_data()
        for i in input_data.values.keys():
            if type(input_data.values[i]) is JsonValue:
                input_metadata['JOB'][i] = input_data.values[i].value
            elif type(input_data.values[i]) is FileValue:
                input_metadata['JOB'][i] = [ScaleFile.objects.get(pk=f)._get_url()
                                            for f in input_data.values[i].file_ids]
    if job_exe.recipe_id and job_exe.recipe.has_input():
        input_metadata['RECIPE'] = {}
        input_data = job_exe.recipe.get_input_data()
        for i in input_data.values.keys():
            if type(input_data.values[i]) is JsonValue:
                input_metadata['RECIPE'][i] = input_data.values[i].value
            elif type(input_data.values[i]) is FileValue:
                input_metadata['RECIPE'][i] = [ScaleFile.objects.get(pk=f)._get_url()
                                               for f in input_data.values[i].file_ids]

    workspace_names = config.get_input_workspace_names()
    workspace_models = {w.name: w for w in Workspace.objects.get_workspaces(names=workspace_names)}

    input_metadata_id = None
    if input_metadata:
        file_name = '%d-input_metadata.json' % job_exe.job.id
        local_path = os.path.join(SCALE_JOB_EXE_INPUT_PATH, 'tmp', file_name)
        with open(local_path, 'w') as metadata_file:
            json.dump(input_metadata, metadata_file)

        try:
            scale_file = ScaleFile.objects.get(file_name=file_name)
        except ScaleFile.DoesNotExist:
            scale_file = ScaleFile()
            scale_file.update_uuid(file_name)
        remote_path = self._calculate_remote_path(job_exe)
        scale_file.file_path = remote_path

        for workspace in workspace_models.values():
            try:
                if not input_metadata_id:
                    ScaleFile.objects.upload_files(workspace, [FileUpload(scale_file, local_path)])
                    input_metadata_id = ScaleFile.objects.get(file_name=file_name).id
                    data = job_exe.job.get_job_data()
                    data.add_file_input('INPUT_METADATA_MANIFEST', input_metadata_id)
                    job_exe.job.input = data.get_dict()
                    job_exe.job.save()
            except Exception:
                continue
        if not input_metadata_id:
            logger.exception('Error uploading input_metadata manifest for job_exe %d' % job_exe.job.id)
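# Illustrative only: the shape of the input_metadata dict that _generate_input_metadata() above writes
# to '<job id>-input_metadata.json'. Each key maps an input name to its JSON value (for JsonValue
# inputs) or to a list of file URLs (for FileValue inputs); the names and URL below are placeholders.
example_input_metadata = {
    'JOB': {
        'THRESHOLD': 0.75,                                  # a JsonValue input
        'INPUT_IMAGE': ['http://example.com/files/1234'],   # a FileValue input (list of file URLs)
    },
    'RECIPE': {
        'REGION': 'north',
    },
}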