def check_dataset(self, dataset, sandbox_path, host):
    """ Copy a dataset file, check the MD5, and return a report.

    :return: log_level, message
    """
    dest_filename = os.path.join(
        sandbox_path,
        'ds_{}_{}'.format(dataset.id, dataset.name))
    # Prefer the locally-stored file; fall back to the external path.
    source_filename = (dataset.dataset_file.path
                       if dataset.dataset_file
                       else dataset.external_absolute_path())
    if not os.path.exists(source_filename):
        return logging.ERROR, 'Dataset file missing: {!r}'.format(
            source_filename)
    shutil.copyfile(source_filename, dest_filename)
    # Hash the copy (not the source) so we verify what actually landed
    # in the sandbox.
    with open(dest_filename, "rb") as copied:
        actual_md5 = compute_md5(copied)
    if actual_md5 == dataset.MD5_checksum:
        return logging.DEBUG, 'MD5 matched for dataset id {}.'.format(
            dataset.id)
    message = 'MD5 check failed on {}, dataset id {}: {!r} expected {}, but was {}.'.format(
        host, dataset.id, source_filename, dataset.MD5_checksum, actual_md5)
    return logging.ERROR, message
def check_dataset(self, dataset, sandbox_path, host):
    """ Copy a dataset file, check the MD5, and return a report.

    :return: log_level, message
    """
    if dataset.dataset_file:
        source = dataset.dataset_file.path
    else:
        source = dataset.external_absolute_path()
    dest = os.path.join(sandbox_path,
                        'ds_{}_{}'.format(dataset.id, dataset.name))
    if not os.path.exists(source):
        return logging.ERROR, 'Dataset file missing: {!r}'.format(source)
    shutil.copyfile(source, dest)
    # Verify the sandbox copy against the recorded checksum.
    with open(dest, "rb") as handle:
        checksum = compute_md5(handle)
    if checksum != dataset.MD5_checksum:
        return logging.ERROR, (
            'MD5 check failed on {}, dataset id {}: {!r} expected {}, '
            'but was {}.'.format(host,
                                 dataset.id,
                                 source,
                                 dataset.MD5_checksum,
                                 checksum))
    return logging.DEBUG, 'MD5 matched for dataset id {}.'.format(dataset.id)
def test_execute_pipeline_dataset(self):
    """ Test the integrity of a Dataset output by a PipelineStep in
    the middle of a Pipeline.

    NOTE: 2017-10-03: this tests fails because Method.submit_code has been
    removed for docker support.
    """
    # Figure out the MD5 of the output file created when the complement method
    # is run on Alice's data to check against the result of the run.
    tmpdir = tempfile.mkdtemp(dir=file_access_utils.sandbox_base_path())
    # BUG FIX: wrap the sandbox work in try/finally so the temp directory is
    # removed even when the slurm submission or an assertion-relevant step
    # raises; previously any failure leaked tmpdir.
    try:
        file_access_utils.configure_sandbox_permissions(tmpdir)
        outfile = os.path.join(tmpdir, "output")
        stdout_path = os.path.join(tmpdir, "stdout.txt")
        stderr_path = os.path.join(tmpdir, "stderr.txt")
        self.method_complement.install(tmpdir)

        # Set up the dummy scheduler.
        slurm_sched_class = DummySlurmScheduler
        slurm_sched_class.slurm_is_alive()

        complement_job_handle = self.method_complement.submit_code(
            tmpdir,
            [self.dataset_labdata.dataset_file.file.name],
            [outfile],
            stdout_path,
            stderr_path,
            slurm_sched_class=slurm_sched_class
        )

        # Poll the dummy scheduler until the job reports completion.
        is_done = False
        while not is_done:
            time.sleep(settings.DEFAULT_SLURM_CHECK_INTERVAL)
            accounting_info = DummySlurmScheduler.get_accounting_info(
                [complement_job_handle])
            if len(accounting_info) > 0:
                curr_state = accounting_info[complement_job_handle.job_id]["state"]
                is_done = curr_state == DummySlurmScheduler.COMPLETED
        slurm_sched_class.shutdown()

        # BUG FIX: open the output in binary mode and close it deterministically.
        # The old code did compute_md5(open(outfile)): text mode (inconsistent
        # with every other MD5 computation in this module) and a leaked handle.
        with open(outfile, "rb") as out_handle:
            labdata_compd_md5 = file_access_utils.compute_md5(out_handle)
    finally:
        shutil.rmtree(tmpdir)

    run = self.comp_run
    runstep = run.runsteps.first()
    execrecord = runstep.execrecord
    dataset = execrecord.execrecordouts.first().dataset
    ds = runstep.outputs.first()

    self.assertEqual(dataset.MD5_checksum, labdata_compd_md5)
    self.assertEqual(dataset, ds)
    self.assertEqual(hasattr(dataset, "usurps"), False)
    self.assertEqual(dataset.has_data(), True)
    self.assertEqual(dataset.num_rows(), 10)
    self.assertEqual(dataset.is_raw(), False)
    self.assertEqual(dataset.get_cdt(), self.cdt_record)
    self.assertEqual(dataset.structure.compounddatatype, self.cdt_record)
    self.assertEqual(dataset.structure.num_rows, 10)
    self.assertEqual(dataset.is_OK(), True)
def compute_md5(self):
    """Computes the MD5 checksum of the CodeResourceRevision."""
    # Open inside the try so the finally-close runs even if open() raises,
    # matching the original contract.
    try:
        self.content_file.open()
        return file_access_utils.compute_md5(self.content_file.file)
    finally:
        self.content_file.close()
def compute_md5(self):
    """Computes the MD5 checksum of the Dataset.

    Return None if the file could not be accessed.
    """
    handle = self.get_open_file_handle("rb")
    if handle is None:
        # File is unreachable; report and signal failure with None.
        self.logger.warning('cannot access file handle')
        return None
    with handle:
        return file_access_utils.compute_md5(handle.file)
def is_md5_changed(self, dataset, found_file, changed_files):
    """Report whether found_file's MD5 differs from the dataset's record.

    Prints and records each changed file once in changed_files.
    """
    with open(found_file, "rb") as handle:
        current_md5 = file_access_utils.compute_md5(handle)
    previous_md5 = dataset.MD5_checksum
    has_changed = current_md5 != previous_md5
    # Only announce each file the first time we see it change.
    if has_changed and found_file not in changed_files:
        print('MD5 changed:', previous_md5, 'to', current_md5, found_file)
        changed_files.add(found_file)
    return has_changed
def build(self):
    """Build the fixture: a container family/app, an input dataset, and a run."""
    fixture_user = User.objects.first()
    assert fixture_user is not None

    input_path = os.path.abspath(os.path.join(
        __file__,
        '../../../../../samplecode/singularity/host_input/example_names.csv'))
    container_path = os.path.abspath(os.path.join(
        __file__,
        '../../../../../samplecode/singularity/python2-alpine-trimmed.simg'))

    family = ContainerFamily.objects.create(name='fixture family',
                                            user=fixture_user)
    # Record the real image's MD5 on the fixture container row.
    with open(container_path, "rb") as container_file:
        container_md5 = compute_md5(container_file)
    container = family.containers.create(tag='vFixture',
                                         user=fixture_user,
                                         file='Containers/kive-default.simg',
                                         md5=container_md5)

    app = container.apps.create()
    names_arg = app.arguments.create(type=ContainerArgument.INPUT,
                                     name='names_csv',
                                     position=1)
    app.arguments.create(type=ContainerArgument.OUTPUT,
                         name='greetings_csv',
                         position=2)

    names_dataset = Dataset.create_dataset(input_path,
                                           name='names.csv',
                                           user=fixture_user)
    run = app.runs.create(name='fixture run', user=fixture_user)
    run.sandbox_path = ""  # blank this out as it won't be accessible in testing anyway
    run.slurm_job_id = None  # this also would cause tests to fail on a fresh system
    run.save(schedule=False)  # scheduling would overwrite sandbox_path
    run.datasets.create(argument=names_arg, dataset=names_dataset)

    upload_path = os.path.join(settings.MEDIA_ROOT, Container.UPLOAD_DIR)
    readme_path = os.path.join(upload_path, 'README.md')
    os.makedirs(upload_path)
    with open(readme_path, 'w') as readme:
        readme.write('Just a placeholder to create the folder for containers.')
def set_md5(self, file_path=None, file_handle=None):
    """Set the MD5 hash from a file.

    Closes the file after the MD5 is computed, but only if this method
    opened it.

    :param str file_path: Path to file to calculate MD5 for. Defaults to
        dataset_file.path, and not used if file_handle supplied.
    :param file file_handle: file handle of file to calculate MD5. File
        must be seeked to the beginning. If file_handle empty, then uses
        file_path.
    """
    we_opened_it = file_handle is None
    if we_opened_it:
        file_handle = io.open(
            self.dataset_file.path if file_path is None else file_path,
            "rb")
    try:
        self.MD5_checksum = file_access_utils.compute_md5(file_handle)
    finally:
        # Caller-supplied handles are left open for the caller to manage.
        if we_opened_it:
            file_handle.close()