Пример #1
0
    def check_dataset(self, dataset, sandbox_path, host):
        """ Copy a dataset's file into the sandbox and verify its MD5.

        :return: log_level, message
        """
        target = os.path.join(sandbox_path,
                              'ds_{}_{}'.format(dataset.id, dataset.name))

        # Prefer the locally stored file; fall back to the external path.
        source = (dataset.dataset_file.path
                  if dataset.dataset_file
                  else dataset.external_absolute_path())

        if not os.path.exists(source):
            return logging.ERROR, 'Dataset file missing: {!r}'.format(source)

        shutil.copyfile(source, target)
        with open(target, "rb") as handle:
            actual_md5 = compute_md5(handle)

        if actual_md5 == dataset.MD5_checksum:
            return logging.DEBUG, 'MD5 matched for dataset id {}.'.format(
                dataset.id)
        message = 'MD5 check failed on {}, dataset id {}: {!r} expected {}, but was {}.'.format(
            host, dataset.id, source, dataset.MD5_checksum, actual_md5)
        return logging.ERROR, message
Пример #2
0
    def check_dataset(self, dataset, sandbox_path, host):
        """ Copy a dataset file into a sandbox, re-hash it, and report.

        :return: log_level, message
        """
        sandbox_copy = os.path.join(
            sandbox_path,
            'ds_{}_{}'.format(dataset.id, dataset.name))

        if dataset.dataset_file:
            original = dataset.dataset_file.path
        else:
            original = dataset.external_absolute_path()

        if not os.path.exists(original):
            return (logging.ERROR,
                    'Dataset file missing: {!r}'.format(original))

        shutil.copyfile(original, sandbox_copy)
        with open(sandbox_copy, "rb") as copy_file:
            copy_md5 = compute_md5(copy_file)

        # Compare the fresh hash against the stored checksum.
        if copy_md5 != dataset.MD5_checksum:
            return (logging.ERROR,
                    'MD5 check failed on {}, dataset id {}: {!r} expected {}, but was {}.'.format(
                        host,
                        dataset.id,
                        original,
                        dataset.MD5_checksum,
                        copy_md5))
        return (logging.DEBUG,
                'MD5 matched for dataset id {}.'.format(dataset.id))
Пример #3
0
    def test_execute_pipeline_dataset(self):
        """
        Test the integrity of a Dataset output by a PipelineStep in
        the middle of a Pipeline.
        NOTE: 2017-10-03: this tests fails because Method.submit_code has been removed
        for docker support.
        """
        # Figure out the MD5 of the output file created when the complement method
        # is run on Alice's data to check against the result of the run.
        tmpdir = tempfile.mkdtemp(dir=file_access_utils.sandbox_base_path())
        file_access_utils.configure_sandbox_permissions(tmpdir)
        outfile = os.path.join(tmpdir, "output")
        stdout_path = os.path.join(tmpdir, "stdout.txt")
        stderr_path = os.path.join(tmpdir, "stderr.txt")

        self.method_complement.install(tmpdir)

        # Set up the dummy scheduler.
        slurm_sched_class = DummySlurmScheduler
        slurm_sched_class.slurm_is_alive()

        complement_job_handle = self.method_complement.submit_code(
            tmpdir,
            [self.dataset_labdata.dataset_file.file.name],
            [outfile],
            stdout_path,
            stderr_path,
            slurm_sched_class=slurm_sched_class
        )

        # Poll the dummy scheduler until it reports the job as COMPLETED.
        is_done = False
        while not is_done:
            time.sleep(settings.DEFAULT_SLURM_CHECK_INTERVAL)
            accounting_info = DummySlurmScheduler.get_accounting_info([complement_job_handle])
            if len(accounting_info) > 0:
                curr_state = accounting_info[complement_job_handle.job_id]["state"]
                is_done = curr_state == DummySlurmScheduler.COMPLETED

        slurm_sched_class.shutdown()

        # Open in binary mode (matching every other compute_md5 call site) and
        # use a context manager; the original bare text-mode open() leaked the
        # file handle and could yield a different hash on platforms that
        # translate newlines in text mode.
        with open(outfile, "rb") as out_handle:
            labdata_compd_md5 = file_access_utils.compute_md5(out_handle)
        shutil.rmtree(tmpdir)

        # Pull the dataset produced by the first step of the recorded run.
        run = self.comp_run
        runstep = run.runsteps.first()
        execrecord = runstep.execrecord
        dataset = execrecord.execrecordouts.first().dataset
        ds = runstep.outputs.first()

        # The run's output must match the independently computed MD5 and
        # carry the expected structure/metadata.
        self.assertEqual(dataset.MD5_checksum, labdata_compd_md5)
        self.assertEqual(dataset, ds)
        self.assertEqual(hasattr(dataset, "usurps"), False)
        self.assertEqual(dataset.has_data(), True)
        self.assertEqual(dataset.num_rows(), 10)
        self.assertEqual(dataset.is_raw(), False)
        self.assertEqual(dataset.get_cdt(), self.cdt_record)
        self.assertEqual(dataset.structure.compounddatatype, self.cdt_record)
        self.assertEqual(dataset.structure.num_rows, 10)
        self.assertEqual(dataset.is_OK(), True)
Пример #4
0
    def compute_md5(self):
        """Compute and return the MD5 checksum of the CodeResourceRevision."""
        try:
            # Open inside the try so the file is closed even if open() raises.
            self.content_file.open()
            return file_access_utils.compute_md5(self.content_file.file)
        finally:
            self.content_file.close()
Пример #5
0
 def compute_md5(self):
     """Compute the MD5 checksum of the Dataset's file.

     Return None if the file could not be accessed.
     """
     handle = self.get_open_file_handle("rb")
     if handle is None:
         # Best effort: log and bail out rather than raising.
         self.logger.warning('cannot access file handle')
         return None
     with handle:
         return file_access_utils.compute_md5(handle.file)
Пример #6
0
 def is_md5_changed(self, dataset, found_file, changed_files):
     """Return True when found_file's MD5 differs from the dataset's stored one.

     A newly detected change is printed once and recorded in changed_files.
     """
     with open(found_file, "rb") as handle:
         current_md5 = file_access_utils.compute_md5(handle)
     has_changed = current_md5 != dataset.MD5_checksum
     if has_changed and found_file not in changed_files:
         print('MD5 changed:',
               dataset.MD5_checksum,
               'to',
               current_md5,
               found_file)
         changed_files.add(found_file)
     return has_changed
Пример #7
0
    def build(self):
        """Create fixture data: a container family/app and one run with an input dataset.

        Side effects: creates ORM rows (ContainerFamily, Container, app,
        arguments, Dataset, run) owned by the first existing user, and creates
        the container upload directory under MEDIA_ROOT with a README placeholder.
        """
        # Any pre-existing user will do as the owner of all fixture objects.
        user = User.objects.first()
        assert user is not None
        # Paths are resolved relative to this source file's location.
        # NOTE(review): assumes the samplecode tree sits five levels up — confirm.
        input_path = os.path.abspath(
            os.path.join(
                __file__,
                '../../../../../samplecode/singularity/host_input/example_names.csv'
            ))
        family = ContainerFamily.objects.create(name='fixture family',
                                                user=user)
        container_path = os.path.abspath(
            os.path.join(
                __file__,
                '../../../../../samplecode/singularity/python2-alpine-trimmed.simg'
            ))
        # Hash the real image so the fixture container row carries a valid MD5,
        # even though its `file` field points at a different stored name.
        with open(container_path, "rb") as f:
            container_md5 = compute_md5(f)
        container = family.containers.create(
            tag='vFixture',
            user=user,
            file='Containers/kive-default.simg',
            md5=container_md5)
        app = container.apps.create()
        # One input argument (position 1) and one output argument (position 2).
        arg1 = app.arguments.create(type=ContainerArgument.INPUT,
                                    name='names_csv',
                                    position=1)
        app.arguments.create(type=ContainerArgument.OUTPUT,
                             name='greetings_csv',
                             position=2)
        dataset = Dataset.create_dataset(input_path,
                                         name='names.csv',
                                         user=user)
        run = app.runs.create(name='fixture run', user=user)
        run.sandbox_path = ""  # blank this out as it won't be accessible in testing anyway
        run.slurm_job_id = None  # this also would cause tests to fail on a fresh system
        run.save(schedule=False)  # scheduling would overwrite sandbox_path
        # Attach the input dataset to the run's input argument.
        run.datasets.create(argument=arg1, dataset=dataset)

        # Ensure the container upload folder exists; the README is only there
        # so the directory is non-empty.
        upload_path = os.path.join(settings.MEDIA_ROOT, Container.UPLOAD_DIR)
        readme_path = os.path.join(upload_path, 'README.md')
        os.makedirs(upload_path)
        with open(readme_path, 'w') as f:
            f.write('Just a placeholder to create the folder for containers.')
Пример #8
0
    def set_md5(self, file_path=None, file_handle=None):
        """Set the MD5 hash from a file.

        Closes the file after the MD5 is computed, but only when this
        method opened it itself.
        :param str file_path:  Path to file to calculate MD5 for.
            Defaults to dataset_file.path, and not used if file_handle supplied.
        :param file file_handle: file handle of file to calculate MD5.  File
            must be seeked to the beginning.
            If file_handle empty, then uses file_path.
        """
        we_opened_it = file_handle is None
        if we_opened_it:
            # No handle supplied: open the requested path (or the dataset's
            # own file) in binary mode.
            file_handle = io.open(
                self.dataset_file.path if file_path is None else file_path,
                "rb")
        try:
            self.MD5_checksum = file_access_utils.compute_md5(file_handle)
        finally:
            if we_opened_it:
                file_handle.close()