import os
import shutil
import unittest

# Import path assumed: these tests exercise pybagit's BagIt class.
from pybagit.bagit import BagIt


class ManifestTest(unittest.TestCase):

    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))

    def test_set_hash_md5(self):
        self.bag.set_hash_encoding('md5')
        self.assertEqual(self.bag.hash_encoding, 'md5')

    def test_set_hash_sha1(self):
        self.bag.set_hash_encoding('sha1')
        self.assertEqual(self.bag.hash_encoding, 'sha1')

    def test_sha1(self):
        self.bag.set_hash_encoding('sha1')
        self.bag.update()
        self.assertEqual(
            self.bag.manifest_contents[os.path.join('data', 'subdir', 'subsubdir', 'angry.jpg')],
            'c5913ae67aa40398f1182e52d2fa2c2e4c08f696')

    def test_md5(self):
        self.bag.set_hash_encoding('md5')
        self.bag.update()
        self.assertEqual(
            self.bag.manifest_contents[os.path.join('data', 'subdir', 'subsubdir', 'angry.jpg')],
            '5f294603675cb6c0f83cef9316bb5be7')

    def test_sha1_manifest(self):
        self.bag.set_hash_encoding('sha1')
        self.bag.update()
        self.assertEqual(os.path.basename(self.bag.manifest_file), 'manifest-sha1.txt')

    def test_md5_manifest(self):
        self.bag.set_hash_encoding('md5')
        self.bag.update()
        self.assertEqual(os.path.basename(self.bag.manifest_file), 'manifest-md5.txt')
class UpdateTest(unittest.TestCase):

    def setUp(self):
        self.bag = BagIt(os.path.join(os.getcwd(), 'test', 'testbag'))
        self.invalid_bag = BagIt(os.path.join(os.getcwd(), 'test', 'invalid_bag'))

    def tearDown(self):
        if os.path.exists(os.path.join(os.getcwd(), 'test', 'invalid_bag')):
            shutil.rmtree(os.path.join(os.getcwd(), 'test', 'invalid_bag'))

    def test_full_update(self):
        self.bag.update(full=True)
        self.assertEqual(len(self.bag.bag_errors), 0)

    def test_partial_update(self):
        self.bag.update(full=False)
        self.assertEqual(len(self.bag.bag_errors), 0)

    def test_is_valid(self):
        self.bag.update()
        self.assertTrue(self.bag.is_valid())

    def test_not_valid(self):
        # Deleting the manifest must make validation fail.
        os.remove(self.invalid_bag.manifest_file)
        self.invalid_bag.validate()
        self.assertFalse(self.invalid_bag.is_valid())
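# A minimal usage sketch of the BagIt API exercised by the tests above and the
# packaging tasks below. The '/tmp/my_bag' and 'report.pdf' paths are
# hypothetical; the call sequence sticks to the methods and attributes that
# appear in this file (data_directory, set_hash_encoding, update, validate,
# is_valid, package).

bag = BagIt('/tmp/my_bag')
shutil.copyfile('report.pdf', os.path.join(bag.data_directory, 'report.pdf'))
bag.set_hash_encoding('sha1')  # checksums go to manifest-sha1.txt
bag.update()                   # recompute checksums for everything under data/
errors = bag.validate()
if bag.is_valid():
    bag.package('/tmp', method='zip')  # write the zipped bag into /tmp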
def run(self, rp_id):
    rp_query = ResultsPackage.objects.filter(uuid=rp_id)
    rp_query.update(status=task_status.PROCESSING, celery_task_id=self.request.id)
    rp = rp_query.first()
    mode = rp.packaging_mode
    package_path = get_package_path(rp_id)

    # Annotate each output with whether it is an endpoint: a resource (or
    # resource list) that is not consumed by any run job in this workflow run.
    output_objs = Output.objects.filter(
        run_job__workflow_run=rp.workflow_run
    ).select_related(
        "resource", "resource__resource_type", "resource_list", "run_job"
    ).prefetch_related(
        "resource_list__resources"
    ).annotate(
        is_endpoint=Case(
            When(
                condition=(
                    Q(resource__isnull=False)
                    & (
                        Q(resource__inputs__isnull=True)
                        | ~Q(resource__inputs__run_job__workflow_run=rp.workflow_run)
                    )
                ) | (
                    Q(resource_list__isnull=False)
                    & (
                        Q(resource_list__inputs__isnull=True)
                        | ~Q(resource_list__inputs__run_job__workflow_run=rp.workflow_run)
                    )
                ),
                then=Value(True),
            ),
            default=Value(False),
            output_field=BooleanField(),
        )
    )

    if len(output_objs) > 0:
        percentage_increment = 70.00 / len(output_objs)
    else:
        percentage_increment = 0
    completed = 0.0

    with TemporaryDirectory() as td:
        tmp_dir = os.path.join(td, rp_id)  # because rp_id will be the name of the packaged zip
        bag = BagIt(tmp_dir)
        job_namefinder = self._NameFinder()
        res_namefinder = self._NameFinder()

        for output in output_objs:
            if mode == 0:
                # Mode 0: only endpoint resources, in one subdirectory per output.
                if output.is_endpoint is False:
                    continue
                j_name = job_namefinder.find(output.run_job.workflow_job_id,
                                             output.run_job.job_name)
                opt_name = output.output_port_type_name
                op_dir = os.path.join(tmp_dir, "{0} - {1}".format(j_name, opt_name))
                rj_status = output.run_job.status
                if rj_status == task_status.FINISHED:
                    if output.resource is not None:
                        filepath = output.resource.resource_file.path
                        ext = os.path.splitext(filepath)[1]
                        res_name = res_namefinder.find(output.resource_id,
                                                       output.resource.name)
                        # [TODO]: or find the modified resource name if the resource_uuid still exists?
                        result_filename = "{0}{1}".format(res_name, ext)
                        if not os.path.exists(op_dir):
                            os.makedirs(op_dir)
                        shutil.copyfile(filepath, os.path.join(op_dir, result_filename))
                    elif output.resource_list is not None:
                        res_name = res_namefinder.find(output.resource_list_id,
                                                       output.resource_list.name)
                        # [TODO]: or find the modified resource name if the resource_uuid still exists?
                        result_foldername = "{0}.list".format(res_name)
                        result_folder = os.path.join(op_dir, result_foldername)
                        if not os.path.exists(result_folder):
                            os.makedirs(result_folder)
                        # Zero-pad the index so the list's files sort correctly.
                        cnt = output.resource_list.resources.count()
                        zfills = len(str(cnt))
                        for idx, r in enumerate(output.resource_list.resources.all()):
                            filepath = r.resource_file.path
                            ext = os.path.splitext(filepath)[1]
                            new_filename = "{0}{1}".format(str(idx).zfill(zfills), ext)
                            shutil.copyfile(filepath, os.path.join(result_folder, new_filename))
            elif mode == 1:
                # Mode 1: every output, in one subdirectory per resource.
                res_name = res_namefinder.find(output.resource_id, output.resource.name)
                # [TODO]: or find the modified resource name if the resource_uuid still exists?
                res_dir = os.path.join(tmp_dir, res_name)
                j_name = job_namefinder.find(output.run_job.workflow_job_id,
                                             output.run_job.job_name)
                opt_name = output.output_port_type_name
                rj_status = output.run_job.status
                if rj_status == task_status.FINISHED:
                    if output.resource is not None:
                        filepath = output.resource.resource_file.path
                        ext = os.path.splitext(filepath)[1]
                        result_filename = "{0} - {1}{2}".format(j_name, opt_name, ext)
                        if not os.path.exists(res_dir):
                            os.makedirs(res_dir)
                        shutil.copyfile(filepath, os.path.join(res_dir, result_filename))
                    elif output.resource_list is not None:
                        result_foldername = "{0} - {1}.list".format(j_name, opt_name)
                        result_folder = os.path.join(res_dir, result_foldername)
                        if not os.path.exists(result_folder):
                            os.makedirs(result_folder)
                        cnt = output.resource_list.resources.count()
                        zfills = len(str(cnt))
                        for idx, r in enumerate(output.resource_list.resources.all()):
                            filepath = r.resource_file.path
                            ext = os.path.splitext(filepath)[1]
                            new_filename = "{0}{1}".format(str(idx).zfill(zfills), ext)
                            shutil.copyfile(filepath, os.path.join(result_folder, new_filename))
                elif rj_status == task_status.FAILED:
                    # Record the failure in place of a resource file.
                    result_filename = "{0} - {1} - ERROR.txt".format(j_name, opt_name)
                    if not os.path.exists(res_dir):
                        os.makedirs(res_dir)
                    with open(os.path.join(res_dir, result_filename), "w") as f:
                        f.write("Error Summary: ")
                        f.write(output.run_job.error_summary)
                        f.write("\n\nError Details:\n")
                        f.write(output.run_job.error_details)
            elif mode == 2:
                raise NotImplementedError()  # [TODO]
            else:
                raise ValueError("mode {0} is not supported".format(mode))

            completed += percentage_increment
            rp_query.update(percent_completed=int(completed))

        # print([os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(tmp_dir)) for f in fn])  # DEBUG
        bag.update()
        errors = bag.validate()
        if not bag.is_valid():
            rp_query.update(
                status=task_status.FAILED,
                error_summary="The bag failed validation.",
                error_details=str(errors),
            )
            return False  # do not fall through and mark the package FINISHED
        target_dir_name = os.path.dirname(package_path)
        if not os.path.isdir(target_dir_name):
            os.makedirs(target_dir_name)
        bag.package(target_dir_name, method="zip")

    rp_query.update(status=task_status.FINISHED, percent_completed=100)
    expiry_time = rp_query.values_list("expiry_time", flat=True)[0]
    if expiry_time:
        async_task = registry.tasks["rodan.core.expire_package"].apply_async(
            (rp_id, ), eta=expiry_time, queue="celery")
        expire_task_id = async_task.task_id
    else:
        expire_task_id = None
    rp_query.update(celery_task_id=expire_task_id)
    return True
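# The task above relies on a self._NameFinder helper that is not defined in
# this excerpt. A plausible sketch of its contract, assuming find() returns a
# stable display name per object id and deduplicates repeated names with a
# numeric suffix; this is an illustration, not the project's implementation.
class _NameFinder(object):

    def __init__(self):
        self._assigned = {}  # object id -> display name already handed out
        self._uses = {}      # base name -> number of times it has been requested

    def find(self, obj_id, name):
        # The same id always maps to the same name, so repeated outputs of
        # one run job land in the same directory.
        if obj_id in self._assigned:
            return self._assigned[obj_id]
        n = self._uses.get(name, 0) + 1
        self._uses[name] = n
        display = name if n == 1 else "{0} ({1})".format(name, n)
        self._assigned[obj_id] = display
        return display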
def run(self, package_id, *args, **kwargs):
    resultspackage = ResultsPackage.objects.get(pk=package_id)
    if resultspackage.status == RunJobStatus.CANCELLED:
        return
    resultspackage.status = ResultsPackageStatus.PROCESSING
    resultspackage.save()

    runjobs = resultspackage.workflow_run.run_jobs.select_related('page', 'job').all()
    if not resultspackage.pages.exists():
        pages = set()
        for runjob in runjobs:
            pages.add(runjob.page)
    else:
        pages = resultspackage.pages.all()
    jobs = resultspackage.jobs.all()
    self.package_path = resultspackage.package_path

    # The chunks are intervals used to update the percent_completed field.
    if len(pages) > 0:
        page_chunk = 70.00 / len(pages)
    completed = 0.0

    bag = BagIt(resultspackage.bag_path)
    for page in pages:
        page_dir = os.path.join(bag.data_directory, page.name)
        os.makedirs(page_dir)
        page_runjobs = runjobs.filter(page=page)
        if not jobs:
            # If no jobs were provided, build the list of jobs from the available runjobs.
            jobs = []
            if len(page_runjobs) > 0:
                runjob_chunk = page_chunk / len(page_runjobs)
            for runjob in page_runjobs:
                _add_result_to_bag(page_dir, runjob, bag)
                completed += runjob_chunk
                _ensure_db_state(resultspackage)
                _update_progress(resultspackage, completed)
                if runjob.workflow_job.job not in jobs:
                    jobs.append(runjob.workflow_job.job)
        else:
            if len(jobs) > 0:
                job_chunk = page_chunk / len(jobs)
            for job in jobs:
                matching_runjobs = page_runjobs.filter(workflow_job__job=job)
                if len(matching_runjobs) > 0:
                    runjob_chunk = job_chunk / len(matching_runjobs)
                for runjob in matching_runjobs:
                    _add_result_to_bag(page_dir, runjob, bag)
                    completed += runjob_chunk
                    _ensure_db_state(resultspackage)
                    _update_progress(resultspackage, completed)

    bag.update()
    errors = bag.validate()
    if not bag.is_valid():
        _ensure_db_state(resultspackage)
        resultspackage.status = ResultsPackageStatus.FAILED
        resultspackage.save()
        raise BagNotValidError("The bag failed validation.\n" + str(errors))

    bag.package(resultspackage.package_path, method='zip')
    resultspackage.download_url = resultspackage.file_url
    resultspackage.percent_completed = 100
    resultspackage.status = ResultsPackageStatus.COMPLETE
    # If pages and jobs were not provided, populate these fields now
    # since we have figured them out.
    resultspackage.pages = pages
    resultspackage.jobs = jobs
    _ensure_db_state(resultspackage)
    resultspackage.save()
    shutil.rmtree(resultspackage.bag_path)
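# run(package_id) above calls _add_result_to_bag, _ensure_db_state, and
# _update_progress without defining them in this excerpt. A hedged sketch of
# what the two bookkeeping helpers likely do, assuming only the model fields
# already used in the task; PackageCancelledError is a hypothetical name, not
# the project's actual exception.
class PackageCancelledError(Exception):
    pass


def _update_progress(resultspackage, completed):
    # Persist the running percentage so clients polling the package see progress.
    resultspackage.percent_completed = int(completed)
    resultspackage.save()


def _ensure_db_state(resultspackage):
    # Re-fetch the row so packaging stops if the package was cancelled
    # while files were still being copied into the bag.
    current = ResultsPackage.objects.get(pk=resultspackage.pk)
    if current.status == RunJobStatus.CANCELLED:
        raise PackageCancelledError("packaging was cancelled")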