示例#1
0
def _download(urls: List[str], sha256: Optional[str],
              max_retries: int) -> bytes:
    if not urls:
        raise ValueError("No URLs to download")

    # Cache hit.
    if sha256 and cache_path(f"downloads/{sha256}").is_file():
        with open(str(cache_path(f"downloads/{sha256}")), "rb") as f:
            return f.read()

    # A retry loop, and loop over all urls provided.
    last_exception = None
    wait_time = 10
    for _ in range(max(max_retries, 1)):
        for url in urls:
            try:
                return _do_download_attempt(url, sha256)
            except TooManyRequests as e:
                last_exception = e
                logger.info(
                    "Download attempt failed with Too Many Requests error. "
                    "Watiting %.1f seconds",
                    wait_time,
                )
                sleep(wait_time)
                wait_time *= 1.5
            except DownloadFailed as e:
                logger.info("Download attempt failed: %s", truncate(e))
                last_exception = e
    raise last_exception
示例#2
0
def download_and_unpack_database(db: str, sha256: str) -> Path:
    """Download the given database, unpack it to the local filesystem, and
    return the path.

    The archive is unpacked into ``state_transition_dataset/<sha256>`` under
    the cache path. A ``.installed`` marker file is created once unpacking has
    finished, so later calls skip the download.

    :param db: Location of the bz2-compressed tar archive, passed to
        ``download()``.
    :param sha256: Checksum passed to ``download()``; also names the local
        directory.
    :return: The path of the single entry unpacked from the archive.
    :raises SystemExit: If the unpacked directory does not contain exactly one
        entry besides the marker file.
    """
    local_dir = cache_path(f"state_transition_dataset/{sha256}")
    installed_marker = local_dir / ".installed"
    with _DB_DOWNLOAD_LOCK, InterProcessLock(
        transient_cache_path(".state_transition_database_download.LOCK")
    ):
        if not installed_marker.is_file():
            tar_data = io.BytesIO(download(db, sha256))

            local_dir.mkdir(parents=True, exist_ok=True)
            logger.info("Unpacking database to %s ...", local_dir)
            # NOTE(review): extractall() trusts the member paths inside the
            # archive; presumably download() has verified the content against
            # sha256 -- confirm.
            with tarfile.open(fileobj=tar_data, mode="r:bz2") as arc:
                arc.extractall(str(local_dir))

            # Written last so that an interrupted unpack is retried.
            installed_marker.touch()

    unpacked = [f for f in local_dir.iterdir() if f.name != ".installed"]
    if len(unpacked) != 1:
        print(
            f"fatal: Archive {db} expected to contain one file, contains: {len(unpacked)}",
            file=sys.stderr,
        )
        # The condition is reported as fatal, so stop here rather than fall
        # through to an IndexError (no entries) or an arbitrary pick (several).
        sys.exit(1)

    return unpacked[0]
def test_download_cache_hit(mocker):
    """A second download with the same checksum is served from the cache."""
    payload = b"Hello, world"
    payload_sha256 = "4ae7c3b6ac0beff671efa8cf57386151c06e58ca53a78d83f36107316cec125f"
    cache_file = cache_path(f"downloads/{payload_sha256}")

    # Start from a cold cache in case an earlier run left the file behind.
    if cache_file.is_file():
        cache_file.unlink()

    def fake_get_url_data(*args):
        return payload

    # Replace the network fetch, then spy on the replacement to count calls.
    mocker.patch.object(download, "_get_url_data", fake_get_url_data)
    mocker.spy(download, "_get_url_data")

    # Cache miss: the fetch runs once and the result lands in the cache.
    first_result = download.download("example", sha256=payload_sha256)
    assert first_result == payload
    download._get_url_data.assert_called_once_with("example")
    assert cache_file.is_file()

    # Cache hit: same contents, no additional fetch.
    second_result = download.download("example", sha256=payload_sha256)
    assert second_result == payload
    assert download._get_url_data.call_count == 1
示例#4
0
def download_llvm_files() -> Path:
    """Download and unpack the LLVM data pack.

    The unpacked location is remembered in the module-level
    ``_LLVM_UNPACKED_LOCATION`` so that repeated calls return without taking
    any lock. A ``.unpacked`` marker file inside the directory records that
    unpacking completed.

    :return: The directory containing the unpacked LLVM files.
    """
    global _LLVM_UNPACKED_LOCATION

    unpacked_location = site_data_path("llvm-v0")
    # Fast path for repeated calls.
    if _LLVM_UNPACKED_LOCATION == unpacked_location:
        return unpacked_location

    marker = unpacked_location / ".unpacked"
    with _LLVM_DOWNLOAD_LOCK:
        # Only take the inter-process lock when the marker is missing.
        if not marker.is_file():
            with InterProcessLock(cache_path(".llvm-v0-install.LOCK")):
                # Now that the lock is acquired, repeat the check: another
                # process may have finished unpacking while we waited.
                if not marker.is_file():
                    _download_llvm_files(unpacked_location)
                    # Create the marker file to indicate that the directory
                    # is unpacked and ready to go.
                    marker.touch()

        # Record the location on every successful path, including the one
        # where another process did the unpacking, so that later calls hit
        # the lock-free fast path above.
        _LLVM_UNPACKED_LOCATION = unpacked_location

    return unpacked_location
示例#5
0
def make_working_dir():
    """Create and return a fresh working directory for a service.

    The directory is placed under the ``service`` cache path and is named from
    the current ISO-format timestamp plus 32 random bits in hex. A ``logs``
    subdirectory is created inside it. Removing the directory is the caller's
    responsibility.
    """
    parent = cache_path("service")
    stamp = datetime.now().isoformat()
    suffix = f"{random.getrandbits(32):08x}"
    working_dir = Path(parent / f"{stamp}-{suffix}")

    # exist_ok=False: a name collision raises rather than reusing a directory.
    logs_dir = working_dir / "logs"
    logs_dir.mkdir(parents=True, exist_ok=False)
    return working_dir
示例#6
0
def get_storage_paths() -> List[Path]:
    """Return the filesystem locations that CompilerGym stores files in.

    :return: The cache, transient cache, and site data roots, de-duplicated
        and sorted.
    """
    # A set collapses roots that resolve to the same location.
    storage_roots = {runfiles_path.cache_path(".")}
    storage_roots.add(runfiles_path.transient_cache_path("."))
    storage_roots.add(runfiles_path.site_data_path("."))
    return sorted(storage_roots)
示例#7
0
def download(url: str, sha256: Optional[str] = None) -> bytes:
    """Fetch a file and return its bytes, using a checksum-keyed local cache.

    When :code:`sha256` is given, a previously cached copy at
    :code:`$cache_path/downloads/$sha256` is returned if present. Otherwise
    the file is downloaded, its checksum is validated, and the contents are
    written to that cache location. See :func:`compiler_gym.cache_path`.

    :param url: The URL of the file to download.
    :param sha256: The expected sha256 checksum of the file.
    :return: The contents of the downloaded file.
    :raises OSError: If the downloaded content does not match the expected
        :code:`sha256` checksum.
    """
    # Serve from the cache when a file with this checksum already exists.
    if sha256:
        cached_file = cache_path(f"downloads/{sha256}")
        if cached_file.is_file():
            return cached_file.read_bytes()

    logging.info(f"Downloading {url} ...")
    content = _download(url)

    if sha256:
        # Reject content whose digest differs from the expected one.
        actual_sha256 = hashlib.sha256(content).hexdigest()
        if actual_sha256 != sha256:
            raise OSError(f"Checksum of downloaded dataset does not match:\n"
                          f"Url: {url}\n"
                          f"Expected: {sha256}\n"
                          f"Actual:   {actual_sha256}")

        # Store the validated content under its checksum.
        cache_path("downloads").mkdir(parents=True, exist_ok=True)
        cache_path(f"downloads/{sha256}").write_bytes(content)

    logging.info(f"Downloaded {url}")
    return content
示例#8
0
    def __init__(self):
        """Create the working directory tree for this instance.

        A uniquely named directory is created under the transient cache path
        with a ``logs`` subdirectory. Its ``disk`` entry is a plain directory,
        unless ``is_in_memory()`` reports the working directory as in-memory.
        In that case a second unique directory is created under the cache path
        and symlinked as ``disk``. Every directory created here is recorded in
        ``_directories_to_remove``.
        """
        self.path = _create_timestamped_unique_service_dir(
            transient_cache_path("."))
        logs_dir = self.path / "logs"
        logs_dir.mkdir()

        self._directories_to_remove = [self.path]

        disk_entry = self.path / "disk"
        if not is_in_memory(self.path):
            disk_entry.mkdir()
            return

        # Back the "disk" entry with a directory under the cache path.
        backing_dir = _create_timestamped_unique_service_dir(cache_path("."))
        self._directories_to_remove.append(backing_dir)
        os.symlink(backing_dir, disk_entry)
示例#9
0
def download(urls: Union[str, List[str]],
             sha256: Optional[str] = None,
             max_retries: int = 5) -> bytes:
    """Download a file and return its contents.

    The work is delegated to :func:`_download` while holding an inter-process
    lock. With a :code:`sha256`, the lock file is specific to that checksum
    (:code:`downloads/.$sha256.lock` under the cache path); without one, a
    single shared lock file (:code:`downloads/.lock`) is used.

    :param urls: Either a single URL of the file to download, or a list of
        URLs to download.

    :param sha256: The expected sha256 checksum of the file.

    :param max_retries: The retry budget forwarded to :func:`_download`.

    :return: The contents of the downloaded file.

    :raises IOError: If the download fails, or if the downloaded content does
        not match the expected :code:`sha256` checksum.
    """
    # Accept a bare URL as shorthand for a one-element list.
    url_list = urls if isinstance(urls, list) else [urls]

    # Only a single process may download a given resource at a time, so that
    # simultaneous processes do not fetch the same file redundantly. Without
    # an ID for the resource, fall back to one global lock.
    if sha256:
        lock_file = cache_path(f"downloads/.{sha256}.lock")
        checksum = sha256
    else:
        lock_file = cache_path("downloads/.lock")
        checksum = None

    with fasteners.InterProcessLock(lock_file):
        return _download(url_list, checksum, max_retries)
示例#10
0
def _do_download_attempt(url: str, sha256: Optional[str]) -> bytes:
    """Make one download attempt from ``url`` and return the content.

    When ``sha256`` is given, the content's digest must equal it, and the
    validated content is then written to ``downloads/<sha256>`` under the
    cache path.

    :param url: The URL to fetch.
    :param sha256: Expected checksum, or ``None`` to skip validation and
        caching.
    :return: The downloaded bytes.
    :raises DownloadFailed: If the checksum does not match.
    """
    logger.info("Downloading %s ...", url)
    content = _get_url_data(url)

    if sha256:
        # Compare the digest of what was received against the expectation.
        actual_sha256 = hashlib.sha256(content).hexdigest()
        if actual_sha256 != sha256:
            raise DownloadFailed(f"Checksum of download does not match:\n"
                                 f"Url: {url}\n"
                                 f"Expected: {sha256}\n"
                                 f"Actual:   {actual_sha256}")

        # Store the validated content under its checksum.
        destination = cache_path(f"downloads/{sha256}")
        destination.parent.mkdir(parents=True, exist_ok=True)
        with atomic_file_write(destination, fileobj=True) as f:
            f.write(content)

    logger.debug(f"Downloaded {url}")
    return content
示例#11
0
def _download(url: str) -> bytes:
    """Issue a GET request for ``url`` and return the response body.

    :param url: The URL to fetch.
    :return: The raw response content.
    :raises OSError: If the response status code is anything other than 200.
    """
    response = requests.get(url)
    try:
        if response.status_code == 200:
            return response.content
        raise OSError(f"GET returned status code {response.status_code}: {url}")
    finally:
        # Release the connection on both the success and the error path.
        response.close()


# Only a single process may download at a time. The idea here is to prevent
# overloading the NIC when, for example, you launch a bunch of simultaneous
# learning processes which all require the same dataset.
@fasteners.interprocess_locked(cache_path("downloads/LOCK"))
def download(url: str, sha256: Optional[str] = None) -> bytes:
    """Download a file and return its contents.

    If :code:`sha256` is provided and the download succeeds, the file contents are cached locally
    in :code:`$cache_path/downloads/$sha256`. See :func:`compiler_gym.cache_path`.

    An inter-process lock (the :code:`downloads/LOCK` file under the cache path) ensures that
    only a single call to this function may execute at a time.

    :param url: The URL of the file to download.
    :param sha256: The expected sha256 checksum of the file.
    :return: The contents of the downloaded file.
    :raises OSError: If the download fails, or if the downloaded content does not match the
        expected :code:`sha256` checksum.
    """
示例#12
0
def make_benchmark(
    inputs: Union[str, Path, ClangInvocation, List[Union[str, Path, ClangInvocation]]],
    copt: Optional[List[str]] = None,
    system_includes: bool = True,
    timeout: int = 600,
) -> Benchmark:
    """Create a benchmark for use by LLVM environments.

    This function takes one or more inputs and uses them to create a benchmark
    that can be passed to :meth:`compiler_gym.envs.LlvmEnv.reset`.

    For single-source C/C++ programs, you can pass the path of the source file:

    >>> benchmark = make_benchmark('my_app.c')
    >>> env = gym.make("llvm-v0")
    >>> env.reset(benchmark=benchmark)

    The clang invocation used is roughly equivalent to:

    .. code-block::

        $ clang my_app.c -O0 -c -emit-llvm -o benchmark.bc

    Additional compile-time arguments to clang can be provided using the
    :code:`copt` argument:

    >>> benchmark = make_benchmark('/path/to/my_app.cpp', copt=['-O2'])

    If you need more fine-grained control over the options, you can directly
    construct a :class:`ClangInvocation <compiler_gym.envs.llvm.ClangInvocation>`
    to pass a list of arguments to clang:

    >>> benchmark = make_benchmark(
    ...     ClangInvocation(['/path/to/my_app.c'], timeout=10)
    ... )

    For multi-file programs, pass a list of inputs that will be compiled
    separately and then linked to a single module:

    >>> benchmark = make_benchmark([
    ...     'main.c',
    ...     'lib.cpp',
    ...     'lib2.bc',
    ... ])

    If you already have prepared bitcode files, those can be linked and used
    directly:

    >>> benchmark = make_benchmark([
    ...     'bitcode1.bc',
    ...     'bitcode2.bc',
    ... ])

    .. note::
        LLVM bitcode compatibility is
        `not guaranteed <https://llvm.org/docs/DeveloperPolicy.html#ir-backwards-compatibility>`_,
        so you must ensure that any precompiled bitcodes are compatible with the
        LLVM version used by CompilerGym, which can be queried using
        :func:`LlvmEnv.compiler_version <compiler_gym.envs.CompilerEnv.compiler_version>`.

    :param inputs: An input, or list of inputs.
    :param copt: A list of command line options to pass to clang when compiling
        source files.
    :param system_includes: Whether to include the system standard libraries
        during compilation jobs. This requires a system toolchain. See
        :func:`get_system_includes`.
    :param timeout: The maximum number of seconds to allow clang to run before
        terminating.
    :return: A :code:`Benchmark` message.
    :raises FileNotFoundError: If any input sources are not found.
    :raises TypeError: If the inputs are of unsupported types.
    :raises ValueError: If no inputs are given, or if an input path has an
        unrecognized file extension.
    :raises OSError: If a compilation job fails.
    :raises TimeoutExpired: If a compilation job exceeds :code:`timeout` seconds.
    """
    copt = copt or []

    bitcodes: List[Path] = []
    clang_jobs: List[ClangInvocation] = []

    def _add_path(path: Path):
        """Classify a path as a ready bitcode or a source file to compile."""
        # NOTE(cummins): There is some discussion about the best way to create
        # a bitcode that is unoptimized yet does not hinder downstream
        # optimization opportunities. Here we are using a configuration based
        # on -O0, yet there is a suggestion that an optimized configuration
        # can produce better results if the optimizations themselves are
        # explicitly disabled, as in: ["-Oz", "-Xclang", "-disable-llvm-optzns"]
        # See: https://lists.llvm.org/pipermail/llvm-dev/2018-August/thread.html#125365
        DEFAULT_COPT = [
            "-O",
            "-Xclang",
            "-disable-O0-optnone",
            "-Xclang",
            "-disable-llvm-passes",
        ]

        if not path.is_file():
            raise FileNotFoundError(path)

        if path.suffix == ".bc":
            bitcodes.append(path)
        elif path.suffix in {".c", ".cxx", ".cpp", ".cc"}:
            clang_jobs.append(
                ClangInvocation(
                    [str(path)] + DEFAULT_COPT + copt,
                    system_includes=system_includes,
                    timeout=timeout,
                )
            )
        else:
            raise ValueError(f"Unrecognized file type: {path.name}")

    # Determine from inputs the list of pre-compiled bitcodes and the clang
    # invocations required to compile the bitcodes. A single input is handled
    # as a one-element list.
    if isinstance(inputs, (str, Path, ClangInvocation)):
        items = [inputs]
    else:
        items = inputs

    for item in items:
        if isinstance(item, (str, Path)):
            _add_path(Path(item))
        elif isinstance(item, ClangInvocation):
            clang_jobs.append(item)
        else:
            raise TypeError(f"Invalid input type: {type(item).__name__}")

    if not bitcodes and not clang_jobs:
        raise ValueError("No inputs")

    # Shortcut if we only have a single pre-compiled bitcode.
    if len(bitcodes) == 1 and not clang_jobs:
        bitcode = bitcodes[0]
        return Benchmark(
            uri=f"file:///{bitcode}", program=File(uri=f"file:///{bitcode}")
        )

    with tempfile.TemporaryDirectory(dir=cache_path(".")) as d:
        working_dir = Path(d)

        # Run the clang invocations in parallel.
        clang_outs = [
            working_dir / f"out-{i}.bc" for i in range(1, len(clang_jobs) + 1)
        ]
        clang_cmds = [
            (job.command(out), job.timeout) for job, out in zip(clang_jobs, clang_outs)
        ]
        # Inputs that are all pre-compiled bitcodes need no worker processes.
        if clang_cmds:
            with multiprocessing.Pool() as pool:
                list(pool.imap_unordered(_run_command, clang_cmds))

        # Check that the expected files were generated.
        for (command, _), out in zip(clang_cmds, clang_outs):
            if not out.is_file():
                # Each entry is a (command, timeout) pair; only the command
                # belongs in the message. Assumes command is a sequence of
                # strings, as the original join intended.
                raise OSError(
                    f"Clang invocation failed to produce a file: {' '.join(command)}"
                )

        all_bitcodes = bitcodes + clang_outs
        if len(all_bitcodes) > 1:
            # Link all of the bitcodes into a single module.
            llvm_link_cmd = [str(LLVM_LINK), "-o", "-"] + [
                str(path) for path in all_bitcodes
            ]
            llvm_link = subprocess.Popen(
                llvm_link_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            bitcode, stderr = _communicate(llvm_link, timeout=timeout)
            if llvm_link.returncode:
                raise OSError(
                    f"Failed to link LLVM bitcodes with error: {stderr.decode('utf-8')}"
                )
        else:
            # We only have a single bitcode so read it.
            with open(str(all_bitcodes[0]), "rb") as f:
                bitcode = f.read()

    # Date and time (year, month, day, then time of day) plus four random hex
    # digits to tell apart benchmarks created within the same second.
    timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")
    unique_suffix = f"{random.randrange(16**4):04x}"
    return Benchmark(
        uri=f"benchmark://user/{timestamp}-{unique_suffix}",
        program=File(contents=bitcode),
    )
示例#13
0
        )
    finally:
        binary.unlink()

    if process.returncode:
        try:
            output = stdout.decode("utf-8")
            msg = f"Benchmark exited with returncode {process.returncode}. Output: {output}"
        except UnicodeDecodeError:
            msg = f"Benchmark exited with returncode {process.returncode}"
        return BenchmarkExecutionResult(walltime_seconds=timer.time, error=msg)

    return BenchmarkExecutionResult(walltime_seconds=timer.time, output=stdout)


@fasteners.interprocess_locked(cache_path("cBench-v0-runtime-data.LOCK"))
def download_cBench_runtime_data() -> bool:
    """Download and unpack the cBench runtime dataset.

    Does nothing when the ``_CBENCH_DATA`` directory already exists. Otherwise
    the archive is downloaded and extracted into the parent directory of
    ``_CBENCH_DATA``.

    :return: ``True`` if the dataset was downloaded by this call, ``False`` if
        it was already present.
    """
    if _CBENCH_DATA.is_dir():
        return False

    tar_contents = io.BytesIO(
        download(_CBENCH_DATA_URL, sha256=_CBENCH_DATA_SHA256))
    with tarfile.open(fileobj=tar_contents, mode="r:bz2") as tar:
        # The parent directory may already exist (only the data directory
        # itself is known to be missing), so an existing parent is accepted.
        _CBENCH_DATA.parent.mkdir(parents=True, exist_ok=True)
        tar.extractall(_CBENCH_DATA.parent)
    # Sanity check that the archive produced the expected directory.
    assert _CBENCH_DATA.is_dir()
    return True


def _make_cBench_validator(
示例#14
0
 def install(self):
     """Run the parent install step, then fetch the cBench runtime data.

     The download runs while holding both the in-process download lock and
     an inter-process lock file under the cache path.
     """
     super().install()
     with _CBENCH_DOWNLOAD_THREAD_LOCK, fasteners.InterProcessLock(
             cache_path(".cbench-v1-runtime-data.LOCK")):
         download_cBench_runtime_data()
示例#15
0
    def validator_cb(
            env: "LlvmEnv") -> Optional[ValidationError]:  # noqa: F821
        """Validate the current state of ``env`` by running its benchmark.

        The benchmark is written to bitcode, compiled, and executed. Where
        requested, its console output and output files are compared against a
        reference ("gold standard") run of the unmodified benchmark.

        Names such as ``cmd``, ``input_files``, ``output_files``,
        ``linkopts``, ``os_env``, ``num_runs``, ``sanitizer``,
        ``compare_output``, ``validate_result`` and ``pre_execution_callback``
        are not defined in this function; presumably they are bound by the
        enclosing scope, which is not visible here.

        :param env: The environment whose benchmark is validated.
        :return: A :code:`ValidationError` describing the first failure found.
        :raises FileNotFoundError: If a required input file is missing, or if
            the reference run does not produce an expected output file.
        """
        # Make sure the runtime data is present. Both the in-process lock and
        # the inter-process lock are held around the download.
        with _CBENCH_DOWNLOAD_THREAD_LOCK:
            with fasteners.InterProcessLock(
                    cache_path(".cbench-v1-runtime-data.LOCK")):
                download_cBench_runtime_data()

        cbench_data = site_data_path(
            "llvm-v0/cbench-v1-runtime-data/runtime_data")
        # Every input file the benchmark reads must exist in the runtime data.
        for input_file_name in input_files:
            path = cbench_data / input_file_name
            if not path.is_file():
                raise FileNotFoundError(
                    f"Required benchmark input not found: {path}")

        # Create a temporary working directory to execute the benchmark in.
        with tempfile.TemporaryDirectory(
                dir=env.service.connection.working_dir) as d:
            cwd = Path(d)

            # Expand shell variable substitutions in the benchmark command.
            expanded_command = cmd.replace("$D", str(cbench_data))

            # Translate the output file names into paths inside the working
            # directory.
            output_paths = [cwd / o for o in output_files]

            if pre_execution_callback:
                pre_execution_callback(cwd)

            # Produce a gold-standard output using a reference version of
            # the benchmark.
            if compare_output or output_files:
                # Work on a fork so that the caller's environment is untouched.
                gs_env = env.fork()
                try:
                    # Reset to the original benchmark state and compile it.
                    gs_env.reset(benchmark=env.benchmark)
                    gs_env.write_bitcode(cwd / "benchmark.bc")
                    gold_standard = _compile_and_run_bitcode_file(
                        bitcode_file=cwd / "benchmark.bc",
                        cmd=expanded_command,
                        cwd=cwd,
                        num_runs=1,
                        # Use default optimizations for gold standard.
                        linkopts=linkopts + ["-O2"],
                        # Always assume safe.
                        sanitizer=None,
                        env=os_env,
                    )
                    # A failing reference run is reported with its error type
                    # prefixed by "Gold standard".
                    if gold_standard.error:
                        return ValidationError(
                            type=f"Gold standard: {gold_standard.error.type}",
                            data=gold_standard.error.data,
                        )
                finally:
                    gs_env.close()

                # Check that the reference run produced the expected output
                # files.
                for path in output_paths:
                    if not path.is_file():
                        # NOTE(review): a plain attribute read is not expected
                        # to raise UnicodeDecodeError; confirm whether a
                        # decode step was intended here.
                        try:
                            output = gold_standard.output
                        except UnicodeDecodeError:
                            output = "<binary>"
                        raise FileNotFoundError(
                            f"Expected file '{path.name}' not generated\n"
                            f"Benchmark: {env.benchmark}\n"
                            f"Command: {cmd}\n"
                            f"Output: {output}")
                    # Move the reference file aside so that the run under test
                    # can write the same name.
                    path.rename(f"{path}.gold_standard")

            # Serialize the benchmark to a bitcode file that will then be
            # compiled to a binary.
            env.write_bitcode(cwd / "benchmark.bc")
            outcome = _compile_and_run_bitcode_file(
                bitcode_file=cwd / "benchmark.bc",
                cmd=expanded_command,
                cwd=cwd,
                num_runs=num_runs,
                linkopts=linkopts,
                sanitizer=sanitizer,
                env=os_env,
            )

            if outcome.error:
                return outcome.error

            # Run a user-specified validation hook.
            # NOTE(review): the hook's return value is discarded; presumably
            # it signals a failure by raising -- confirm.
            if validate_result:
                validate_result(outcome)

            # Difftest the console output.
            if compare_output and gold_standard.output != outcome.output:
                return ValidationError(
                    type="Wrong output",
                    data={
                        "expected": gold_standard.output,
                        "actual": outcome.output
                    },
                )

            # Difftest the output files.
            for path in output_paths:
                if not path.is_file():
                    return ValidationError(
                        type="Output not generated",
                        data={
                            "path": path.name,
                            "command": cmd
                        },
                    )
                # Compare against the reference copy with the external `diff`
                # tool, merging its stderr into stdout.
                diff = subprocess.Popen(
                    ["diff", str(path), f"{path}.gold_standard"],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                )
                stdout, _ = diff.communicate()
                # A nonzero exit status from diff is reported as a mismatch.
                if diff.returncode:
                    try:
                        stdout = stdout.decode("utf-8")
                        return ValidationError(
                            type="Wrong output (file)",
                            data={
                                "path": path.name,
                                "diff": stdout
                            },
                        )
                    except UnicodeDecodeError:
                        # The diff output is not valid UTF-8; report a
                        # placeholder instead of the raw bytes.
                        return ValidationError(
                            type="Wrong output (file)",
                            data={
                                "path": path.name,
                                "diff": "<binary>"
                            },
                        )