Example No. 1
def test_relative_path_list(tmp_path):
    file_list = tmp_path / "list.txt"
    CharSink.to_file(file_list).write("\n".join(["fred/bob.txt", "foo.txt"]))
    params = Parameters.from_mapping({"file_list": str(file_list)})
    assert list(
        params.path_list_from_file(
            "file_list", resolve_relative_to=Path("/hello/world"))) == [
                Path("/hello/world/fred/bob.txt"),
                Path("/hello/world/foo.txt")
            ]
Example No. 2
def test_relative_path_map(tmp_path):
    file_map = tmp_path / "map.txt"
    CharSink.to_file(file_map).write("\n".join(
        ["one\tfred/bob.txt", "two\tfoo.txt"]))
    params = Parameters.from_mapping({"file_map": str(file_map)})
    assert dict(
        params.path_map_from_file(
            "file_map", resolve_relative_to=Path("/hello/world"))) == {
                "one": Path("/hello/world/fred/bob.txt"),
                "two": Path("/hello/world/foo.txt")
            }
Example No. 3
def test_to_file_open(self):
    tmp_dir = Path(tempfile.mkdtemp())
    file_path = tmp_dir / "test.txt"
    with CharSink.to_file(file_path).open() as out:
        out.write("hello\n\nworld\n")
    source = CharSource.from_file(file_path)
    self.assertEqual("hello\n\nworld\n", source.read_all())
    shutil.rmtree(str(tmp_dir))
Example No. 4
def test_to_file_write_string_arg(self):
    tmp_dir = Path(tempfile.mkdtemp())
    file_path = tmp_dir / "test.txt"
    sink = CharSink.to_file(str(file_path))
    sink.write("hello\n\nworld\n")
    source = CharSource.from_file(str(file_path))
    self.assertEqual("hello\n\nworld\n", source.read_all())
    shutil.rmtree(str(tmp_dir))
Example No. 5
    def test_writing_to_yaml(self):
        params = Parameters.from_mapping({
            "hello": "world",
            "moo": {
                "nested_dict": {
                    "lalala": "fooo",
                    "meep": 2,
                    "list": [1, 2, 3]
                }
            },
            "some_path": Path("/hello/world"),
            "path_list": [Path("/meep/lalala"), Path("/moo/cow")],
        })
        string_buffer = CharSink.to_string()
        YAMLParametersWriter().write(params, string_buffer)
        self.assertEqual(TestParameters.WRITING_REFERENCE,
                         string_buffer.last_string_written)

        with self.assertRaisesRegex(
                RuntimeError,
                "bytes and bytearrays are not legal parameter values"):
            YAMLParametersWriter().write(
                Parameters.from_mapping({"illegal": b"bytes"}),
                CharSink.to_nowhere())

        with self.assertRaisesRegex(
                RuntimeError,
                "bytes and bytearrays are not legal parameter values"):
            YAMLParametersWriter().write(
                Parameters.from_mapping({"illegal": bytearray()}),
                CharSink.to_nowhere())

        with self.assertRaisesRegex(
                RuntimeError,
                "Don't know how to serialize out .* as a parameter value"):
            YAMLParametersWriter().write(
                Parameters.from_mapping({"illegal": Parameters}),
                CharSink.to_nowhere())
Example No. 6
    def test_read_write_doc_id_to_file_map(self):
        mapping = ImmutableDict.of([("foo", Path("/home/foo")),
                                    ("bar", Path("/home/bar"))])
        string_sink = CharSink.to_string()
        write_doc_id_to_file_map(mapping, string_sink)
        # note the reordering because it alphabetizes the docids
        self.assertEqual("bar\t/home/bar\nfoo\t/home/foo\n",
                         string_sink.last_string_written)

        reloaded_map = read_doc_id_to_file_map(
            CharSource.from_string(string_sink.last_string_written))

        self.assertEqual(mapping, reloaded_map)
Example No. 7
def _split_into_even_slices(input_source: KeyValueSource[str, bytes],
                            params: Parameters):
    output_directory = params.creatable_directory("output_dir")
    slices = params.positive_integer("num_slices")
    random_seed = params.optional_positive_integer("random_seed")
    slice_paths = [
        output_directory / "{!s}.zip".format(i) for i in range(slices)
    ]
    CharSink.to_file(output_directory / "_slices.txt").write("\n".join(
        str(x) for x in slice_paths))
    output_sinks = [
        KeyValueSink.zip_bytes_sink(slice_path) for slice_path in slice_paths
    ]
    # this is the magic incantation for handling variable-length lists of context managers
    with ExitStack() as exit_stack:
        for output_sink in output_sinks:
            exit_stack.enter_context(output_sink)
        # guarantee deterministic iteration order
        input_keys = sorted(list(input_source.keys()))  # type: ignore
        if random_seed:
            random.seed(random_seed)
            random.shuffle(input_keys)
        for (i, k) in enumerate(input_keys):
            output_sinks[i % slices].put(k, input_source[k])
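
The comment in the example above calls ExitStack the "magic incantation" for handling a variable-length list of context managers. Below is a minimal standalone sketch of that pattern using only the Python standard library; the file names and data are hypothetical and independent of KeyValueSink.

from contextlib import ExitStack

# Hypothetical output paths; in the example above these are the slice zip files.
paths = ["0.txt", "1.txt", "2.txt"]
with ExitStack() as stack:
    # enter_context registers each file so that all of them are closed when
    # the with-block exits, even if an exception is raised partway through.
    sinks = [stack.enter_context(open(p, "w", encoding="utf-8")) for p in paths]
    for i, line in enumerate(["a", "b", "c", "d"]):
        sinks[i % len(sinks)].write(line + "\n")
# every file registered with the ExitStack is now closed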
Example No. 8
    def write_shell_script_to(
        self,
        entry_point_name: str,
        parameters: Union[Path, Parameters],
        *,
        working_directory: Path,
        script_path: Path,
        params_path: Optional[Path],
        stdout_file: Optional[Path] = None,
        ckpt_path: Optional[Path] = None,
        override_conda_config: Optional[CondaConfiguration] = None,
    ) -> None:
        if isinstance(parameters, Path):
            if params_path:
                raise RuntimeError(
                    "Cannot specify params_path and provide a path for parameters"
                )
            params_path = parameters
        elif isinstance(parameters, Parameters):
            if not params_path:
                raise RuntimeError(
                    "Params path must be specified when providing a parameters object"
                )
            YAMLParametersWriter().write(parameters,
                                         CharSink.to_file(params_path))
        else:
            raise RuntimeError(
                f"Parameters must be either Parameters or path to a param file, "
                f"but got {parameters}")

        if not stdout_file:
            stdout_file = working_directory / "___stdout.log"

        script_path.write_text(
            self.generate_shell_script(
                entry_point_name=entry_point_name,
                param_file=params_path,
                stdout_file=stdout_file,
                working_directory=working_directory,
                ckpt_path=ckpt_path,
                override_conda_config=override_conda_config,
            ),
            encoding="utf-8",
        )
        # Mark the generated script as executable.
        script_path.chmod(script_path.stat().st_mode | stat.S_IEXEC)
Example No. 9
def test_null_sink(self):
    sink = CharSink.to_nowhere()
    sink.write("foo")
    with sink.open() as out:
        out.write("meep")
Example No. 10
def test_string_sink(self):
    string_sink = CharSink.to_string()
    string_sink.write("hello world")
    self.assertEqual("hello world", string_sink.last_string_written)
Example No. 11
    def _run_python_in_container(
        self,
        job_name: Locator,
        python_module_or_path_on_docker: Union[str, Path],
        python_args_or_parameters: Union[Parameters, str],
        container: Container,
        *,
        depends_on,
        docker_args: str = "",
        python_executable_path_in_docker: Path = PYTHON_EXECUTABLE_DOCKER_PATH,
        input_files: Union[Iterable[Union[Path, str]], Path,
                           str] = immutableset(),
        output_files: Union[Iterable[Union[Path, str]], Path,
                            str] = immutableset(),
        docker_mount_root: Path = DOCKER_MOUNT_ROOT,
        resource_request: Optional[ResourceRequest] = None,
        category: Optional[str] = None,
        pre_docker_bash: Union[Iterable[str], str] = "",
        post_docker_bash: Union[Iterable[str], str] = "",
        job_is_stageable: bool = False,
        job_bypass_staging: bool = False,
        times_to_retry_job: int = 0,
        job_profiles: Iterable[PegasusProfile] = immutableset(),
    ) -> DependencyNode:
        """
        Automatically converts a python job into a container request
        """
        # Ensure the input and output files are iterables of Path or str
        if isinstance(input_files, (Path, str)):
            input_files = immutableset([input_files])
        if isinstance(output_files, (Path, str)):
            output_files = immutableset([output_files])
        # A set to keep track of all the file names that will be created or copied into
        # the mounted directory. We use this to raise an error if a duplicate name would appear.
        params_file_name = "____params.params"
        params_file = None
        file_names = {params_file_name}
        job_dir = self.directory_for(job_name)
        # Define the root mount point for scratch mount
        scratch_root = DOCKERMOUNT_SCRATCH_PATH_ROOT / self.name / str(
            job_name)
        # Define the self-needed docker args
        modified_docker_args = (
            f"--rm -v {scratch_root}:{docker_mount_root} " + docker_args)

        # Build paths mappings for docker
        mapping_input_files = []
        for i_file in input_files:
            if i_file.name in file_names:
                raise RuntimeError(
                    f"Unable to create container job {job_name} with multiple files with name {i_file.name}"
                )
            file_names.add(i_file.name)
            mapping_input_files.append((
                str(i_file.absolute()),
                PegasusContainerFile(
                    name=i_file.name,
                    nas=i_file,
                    scratch=scratch_root / i_file.name,
                    docker=docker_mount_root / i_file.name,
                ),
            ))
        converted_input_files = immutabledict(mapping_input_files)

        mapping_output_files = []
        for o_file in output_files:
            if o_file.name in file_names:
                raise RuntimeError(
                    f"Unable to create container job {job_name} with multiple files with name {o_file.name}"
                )
            file_names.add(o_file.name)
            mapping_output_files.append((
                str(o_file.absolute()),
                PegasusContainerFile(
                    name=o_file.name,
                    nas=o_file,
                    scratch=scratch_root / o_file.name,
                    docker=docker_mount_root / o_file.name,
                ),
            ))
        converted_output_files = immutabledict(mapping_output_files)

        # Process the Python Parameters or Args for any file paths which need to change
        if isinstance(python_args_or_parameters, Parameters):
            mutable_params = dict(python_args_or_parameters.as_mapping())
            for key, value in python_args_or_parameters.as_mapping().items():
                if isinstance(value, Path):
                    if str(value.absolute()) in converted_input_files:
                        mutable_params[key] = str(converted_input_files[str(
                            value.absolute())].docker.absolute())
                    elif str(value.absolute()) in converted_output_files:
                        mutable_params[key] = str(converted_output_files[str(
                            value.absolute())].docker.absolute())
            modified_params = Parameters.from_mapping(mutable_params)
            params_path = job_dir / params_file_name
            YAMLParametersWriter().write(modified_params,
                                         CharSink.to_file(params_path))
            params_file = PegasusContainerFile(
                name=params_file_name,
                nas=params_path,
                scratch=scratch_root / params_file_name,
                docker=docker_mount_root / params_file_name,
            )
            python_args = params_file.docker
        elif isinstance(python_args_or_parameters, str):
            python_args_tok = []
            for tok in python_args_or_parameters.split(" "):
                if tok in converted_input_files:
                    python_args_tok.append(
                        str(converted_input_files[tok].docker.absolute()))
                elif tok in converted_output_files:
                    python_args_tok.append(
                        str(converted_output_files[tok].docker.absolute()))
                else:
                    python_args_tok.append(tok)
            python_args = " ".join(python_args_tok)
        else:
            raise RuntimeError(
                f"Cannot handle python_args_or_parameters of type {type(python_args_or_parameters)}. Data: {python_args_or_parameters}"
            )

        # Combine any user requested pre-docker bash with automatic
        # Movement of files from NAS locations to /scratch dir locations
        pre_job_bash = "\n".join(
            chain(
                [
                    f"mkdir -p {scratch_root}",
                    f"cp {str(params_file.nas.absolute())} {str(params_file.scratch.absolute())}"
                    if params_file else "",
                ],
                [
                    f"cp {str(i_file.nas.absolute())} {str(i_file.scratch.absolute())}"
                    for i_file in converted_input_files.values()
                ],
                pre_docker_bash,
            ))

        # Combine any user requested post-docker bash with automatic
        # Movement of files from /scratch locations to NAS locations
        post_job_bash = "\n".join(
            chain(
                [
                    f"cp {str(o_file.scratch.absolute())} {str(o_file.nas.absolute())}"
                    for o_file in converted_output_files.values()
                ],
                post_docker_bash,
            ))

        # Generate the command to run the python job
        python_start = (f"-m {python_module_or_path_on_docker}" if isinstance(
            python_module_or_path_on_docker, str) else
                        str(python_module_or_path_on_docker))
        docker_run_command = (
            f"{python_executable_path_in_docker} {python_start} {python_args}")

        return self.run_container(
            job_name,
            container.name,
            modified_docker_args,
            docker_run_command,
            container.image,
            depends_on=depends_on,
            job_is_stageable=job_is_stageable,
            job_bypass_staging=job_bypass_staging,
            times_to_retry_job=times_to_retry_job,
            job_profiles=job_profiles,
            pre_job_bash=pre_job_bash,
            post_job_bash=post_job_bash,
            category=category,
            resource_request=resource_request,
        )
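
The docstring above says this method "automatically converts a python job into a container request"; the heart of that conversion is rewriting host-side paths in the job's arguments to their container-side equivalents, which is what converted_input_files and converted_output_files are built for. A minimal, library-independent sketch of that substitution for string arguments (all paths and names here are hypothetical):

from pathlib import Path

# Hypothetical host-path -> container-path mapping, playing the role of
# converted_input_files / converted_output_files in the example above.
host_to_docker = {
    "/nas/data/input.txt": Path("/docker_mount/input.txt"),
    "/nas/data/out.txt": Path("/docker_mount/out.txt"),
}

def remap_args(arg_string: str) -> str:
    # Replace any whitespace-separated token that matches a known host path
    # with its container-side path; all other tokens pass through unchanged.
    return " ".join(
        str(host_to_docker[tok]) if tok in host_to_docker else tok
        for tok in arg_string.split(" ")
    )

print(remap_args("--input /nas/data/input.txt --output /nas/data/out.txt"))
# --input /docker_mount/input.txt --output /docker_mount/out.txt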
Example No. 12
    def _run_python_job(
        self,
        job_name: Locator,
        python_module_or_path: Any,
        args_or_params: Union[Parameters, Dict[str, Any], str],
        *,
        depends_on,
        resource_request: Optional[ResourceRequest] = None,
        override_conda_config: Optional[CondaConfiguration] = None,
        category: Optional[str] = None,
        use_pypy: bool = False,
        container: Optional[Container] = None,
        pre_job_bash: str = "",
        post_job_bash: str = "",
        job_is_stageable: bool = False,
        job_bypass_staging: bool = False,
        times_to_retry_job: int = 0,
        job_profiles: Iterable[PegasusProfile] = immutableset(),
        treat_params_as_cmd_args: bool = False,
        input_file_paths: Union[Iterable[Union[Path, str]], Path,
                                str] = immutableset(),
        output_file_paths: Union[Iterable[Union[Path, str]], Path,
                                 str] = immutableset(),
    ) -> DependencyNode:
        """
        Internal function to schedule a python job for centralized logic.
        """
        job_dir = self.directory_for(job_name)
        ckpt_name = job_name / "___ckpt"
        checkpoint_path = job_dir / "___ckpt"
        signature_args = None
        depends_on = _canonicalize_depends_on(depends_on)

        if isinstance(python_module_or_path, (str, Path)):
            computed_module_or_path = python_module_or_path
        else:
            computed_module_or_path = fully_qualified_name(
                python_module_or_path)

        if not isinstance(args_or_params, str):
            # allow users to specify the parameters as a dict for convenience
            if not isinstance(args_or_params, Parameters):
                args_or_params = Parameters.from_mapping(args_or_params)

            params_sink = CharSink.to_string()
            YAMLParametersWriter().write(args_or_params, params_sink)
            signature_args = params_sink.last_string_written

        signature = (
            computed_module_or_path,
            signature_args if signature_args else args_or_params,
        )
        if signature in self._signature_to_job:
            logging.info("Job %s recognized as a duplicate", job_name)
            return self._signature_to_job[signature]

        if container:
            return self._run_python_in_container(
                job_name,
                computed_module_or_path,
                args_or_params,
                container,
                depends_on=depends_on,
                input_files=input_file_paths,
                output_files=output_file_paths,
                resource_request=resource_request,
                category=category,
                pre_docker_bash=pre_job_bash,
                post_docker_bash=post_job_bash,
                job_is_stageable=job_is_stageable,
                job_bypass_staging=job_bypass_staging,
                times_to_retry_job=times_to_retry_job,
                job_profiles=job_profiles,
            )

        script_path = job_dir / "___run.sh"
        stdout_path = job_dir / "___stdout.log"

        self._conda_script_generator.write_shell_script_to(
            entry_point_name=computed_module_or_path,
            parameters=args_or_params,
            working_directory=job_dir,
            script_path=script_path,
            params_path=job_dir / "____params.params",
            stdout_file=stdout_path,
            ckpt_path=checkpoint_path,
            override_conda_config=override_conda_config,
            python="pypy3" if use_pypy else "python",
            pre_job=pre_job_bash,
            post_job=post_job_bash,
            treat_params_as_cmd_args=treat_params_as_cmd_args,
        )

        script_executable = Transformation(
            self._job_name_for(job_name),
            namespace=self._namespace,
            version="4.0",
            site=self._default_site,
            pfn=script_path,
            is_stageable=job_is_stageable,
            bypass_staging=job_bypass_staging,
            arch=Arch.X86_64,
            os_type=OS.LINUX,
            container=container,
        )

        self._transformation_catalog.add_transformations(script_executable)
        resource_request = self.set_resource_request(resource_request)

        job = Job(script_executable)
        dependency_node = self._update_job_settings(
            category,
            checkpoint_path,
            ckpt_name,
            depends_on,
            job,
            job_name,
            job_profiles,
            resource_request,
            times_to_retry_job,
        )
        self._signature_to_job[signature] = dependency_node

        logging.info("Scheduled Python job %s", job_name)
        return dependency_node
Example No. 13
    def write_shell_script_to(
        self,
        entry_point_name: Union[str, Path],
        parameters: Union[Path, Parameters, str],
        *,
        working_directory: Path,
        script_path: Path,
        params_path: Optional[Path],
        stdout_file: Optional[Path] = None,
        ckpt_path: Optional[Path] = None,
        override_conda_config: Optional[CondaConfiguration] = None,
        python: str = "python",
        treat_params_as_cmd_args: bool = False,
        pre_job: str = "",
        post_job: str = "",
    ) -> None:
        if isinstance(parameters, Path):
            if params_path:
                raise RuntimeError(
                    "Cannot specify params_path and provide a path for parameters"
                )
            params_path = parameters
        elif isinstance(parameters, Parameters):
            if not params_path:
                raise RuntimeError(
                    "Params path must be specified when providing a parameters object"
                )
            YAMLParametersWriter().write(parameters, CharSink.to_file(params_path))
        elif isinstance(parameters, str):
            if not treat_params_as_cmd_args:
                raise RuntimeError(
                    "Parameters can only be a str when the parameters are being treated as command line args"
                )
        else:
            raise RuntimeError(
                f"Parameters must be either Parameters, path to a param file, "
                f"or a string if treat_params_as_cmd_args is True, "
                f"but got {parameters}"
            )

        if not stdout_file:
            stdout_file = working_directory / "___stdout.log"

        script_path.write_text(
            self.generate_shell_script(
                entry_point_name=entry_point_name
                if isinstance(entry_point_name, str)
                else None,
                python_path=str(entry_point_name.absolute()).strip()
                if isinstance(entry_point_name, Path)
                else None,
                param_file=params_path if not treat_params_as_cmd_args else None,
                cmd_args=parameters.strip() if treat_params_as_cmd_args else None,
                stdout_file=stdout_file,
                working_directory=working_directory,
                ckpt_path=ckpt_path,
                override_conda_config=override_conda_config,
                python=python,
                pre_job=pre_job,
                post_job=post_job,
            ),
            encoding="utf-8",
        )
        # Mark the generated script as executable.
        script_path.chmod(script_path.stat().st_mode | stat.S_IEXEC)
Example No. 14
    def run_python_on_parameters(
        self,
        job_name: Locator,
        python_module: Any,
        parameters: Union[Parameters, Dict[str, Any]],
        *,
        depends_on,
        resource_request: Optional[ResourceRequest] = None,
        override_conda_config: Optional[CondaConfiguration] = None,
        category: Optional[str] = None,
    ) -> DependencyNode:
        """
        Schedule a job to run the given *python_module* on the given *parameters*.

        If this job requires other jobs to be executed first,
        include them in *depends_on*.

        This method returns a `DependencyNode` which can be used in *depends_on*
        for future jobs.
        """
        job_dir = self.directory_for(job_name)
        ckpt_name = job_name / "___ckpt"
        checkpoint_path = job_dir / "___ckpt"

        depends_on = _canonicalize_depends_on(depends_on)
        if isinstance(python_module, str):
            fully_qualified_module_name = python_module
        else:
            fully_qualified_module_name = fully_qualified_name(python_module)

        # allow users to specify the parameters as a dict for convenience
        if not isinstance(parameters, Parameters):
            parameters = Parameters.from_mapping(parameters)

        # If we've already scheduled this identical job,
        # then don't schedule it again.
        params_sink = CharSink.to_string()
        YAMLParametersWriter().write(parameters, params_sink)
        signature = (fully_qualified_module_name,
                     params_sink.last_string_written)
        if signature in self._signature_to_job:
            logging.info("Job %s recognized as a duplicate", job_name)
            return self._signature_to_job[signature]

        script_path = job_dir / "___run.sh"
        stdout_path = parameters.string(
            "logfile", default=str((job_dir / "___stdout.log").absolute()))
        self._conda_script_generator.write_shell_script_to(
            entry_point_name=fully_qualified_module_name,
            parameters=parameters,
            working_directory=job_dir,
            script_path=script_path,
            params_path=job_dir / "____params.params",
            stdout_file=stdout_path,
            ckpt_path=checkpoint_path,
            override_conda_config=override_conda_config,
        )
        script_executable = Executable(
            namespace=self._namespace,
            name=str(job_name).replace("/", "_"),
            version="4.0",
            os="linux",
            arch="x86_64",
        )
        script_executable.addPFN(
            path_to_pfn(script_path, site=self._default_site))
        if not self._job_graph.hasExecutable(script_executable):
            self._job_graph.addExecutable(script_executable)
        job = Job(script_executable)
        self._job_graph.addJob(job)
        for parent_dependency in depends_on:
            if parent_dependency.job:
                self._job_graph.depends(job, parent_dependency.job)
            for out_file in parent_dependency.output_files:
                job.uses(out_file, link=Link.INPUT)

        if resource_request is not None:
            resource_request = self.default_resource_request.unify(
                resource_request)
        else:
            resource_request = self.default_resource_request

        if category:
            job.profile(Namespace.DAGMAN, "category", category)
        resource_request.apply_to_job(job,
                                      job_name=self._job_name_for(job_name))

        # Handle Output Files
        # This is currently only handled as the checkpoint file
        # See: https://github.com/isi-vista/vista-pegasus-wrapper/issues/25
        checkpoint_pegasus_file = path_to_pegasus_file(checkpoint_path,
                                                       site=self._default_site,
                                                       name=f"{ckpt_name}")

        if checkpoint_pegasus_file not in self._added_files:
            self._job_graph.addFile(checkpoint_pegasus_file)
            self._added_files.add(checkpoint_pegasus_file)

        # If the checkpoint file already exists, we want to add it to the replica catalog
        # so that we don't run the job corresponding to the checkpoint file again
        if checkpoint_path.exists():
            with self._replica_catalog.open("a+") as handle:
                handle.write(
                    f"{ckpt_name} file://{checkpoint_path} site={self._default_site}\n"
                )

        job.uses(checkpoint_pegasus_file, link=Link.OUTPUT, transfer=True)

        dependency_node = DependencyNode.from_job(
            job, output_files=[checkpoint_pegasus_file])
        self._signature_to_job[signature] = dependency_node

        logging.info("Scheduled Python job %s", job_name)
        return dependency_node
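
Like Example No. 12, this method deduplicates work by serializing the parameters to an in-memory string with CharSink.to_string() and keying scheduled jobs on (module name, serialized parameters). A small sketch of just that signature computation; the import paths are assumptions and may differ in your installation of vistautils:

# Import paths are assumptions; adjust them to match your installation.
from vistautils.io_utils import CharSink
from vistautils.parameters import Parameters, YAMLParametersWriter

def job_signature(fully_qualified_module_name: str, parameters: Parameters):
    # Serialize the parameters to a string, exactly as the example above does,
    # and pair it with the module name: two jobs with the same module and
    # identical serialized parameters yield the same signature.
    params_sink = CharSink.to_string()
    YAMLParametersWriter().write(parameters, params_sink)
    return (fully_qualified_module_name, params_sink.last_string_written)

sig_a = job_signature("my_pkg.train", Parameters.from_mapping({"epochs": 3}))
sig_b = job_signature("my_pkg.train", Parameters.from_mapping({"epochs": 3}))
assert sig_a == sig_b  # a duplicate job can be detected and scheduled only once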
Example No. 15
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
    write_doc_id_to_file_map(self.id_to_file, CharSink.to_file(self._path / "_index"))
Example No. 16
def put(self, key: str, value: str) -> None:
    out_file = self._path / key
    CharSink.to_file(out_file).write(value)
    self.id_to_file[key] = out_file