Exemplo n.º 1
0
def get_link_flags():
  """Return linker flags needed to build custom TensorFlow operators.

  Returns:
    A list of linker flag strings; empty when TensorFlow was built
    monolithically (no separate framework shared library to link against).
  """
  if _MONOLITHIC_BUILD:
    return []
  major = _VERSION.split('.')[0]
  # Point the linker at the TF library dir, then at the exact versioned
  # framework shared object (GNU ld `-l:` exact-name syntax).
  return ['-L%s' % get_lib(), '-l:libtensorflow_framework.so.%s' % major]
Exemplo n.º 2
0
def get_link_flags():
    """Return linker flags needed to build custom TensorFlow operators.

    Returns:
        A list of linker flag strings; empty when TensorFlow was built
        monolithically.
    """
    on_macos = _platform.system() == 'Darwin'
    major_version = _VERSION.split('.')[0]
    link_flags = []
    if not _MONOLITHIC_BUILD:
        link_flags.append('-L%s' % get_lib())
        if on_macos:
            # Apple's ld resolves `-lfoo.N` to libfoo.N.dylib.
            link_flags.append('-ltensorflow_framework.%s' % major_version)
        else:
            # GNU ld `-l:` links against the exact versioned .so name.
            link_flags.append('-l:libtensorflow_framework.so.%s' % major_version)
    return link_flags
Exemplo n.º 3
0
def get_link_flags():
  """Get the link flags for custom operators.

  Returns:
    The link flags.
  """
  is_mac = _platform.system() == 'Darwin'
  ver = _VERSION.split('.')[0]
  flags = []
  if not _MONOLITHIC_BUILD:
    flags.append('-L%s' % get_lib())
    if is_mac:
      # Apple's ld has no GNU-style `-l:exact_name` syntax; `-lfoo.N`
      # resolves to libfoo.N.dylib on the library search path.
      flags.append('-ltensorflow_framework.%s' % ver)
    else:
      # GNU ld: link against the exact versioned shared-object name.
      flags.append('-l:libtensorflow_framework.so.%s' % ver)
  return flags
Exemplo n.º 4
0
# limitations under the License.

# coding: utf-8
# pylint: disable=protected-access

import time

from tensorflow.python.client import session
from tensorflow.python.framework import meta_graph, ops
from tensorflow.python.framework.versions import VERSION
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import checkpoint_management, session_manager
from tensorflow.python.training.basic_session_run_hooks \
    import CheckpointSaverHook

# Fail fast at import time: the monkey patches defined below poke at
# TF-internal APIs and are only known to match the 1.15.x release line.
assert VERSION.startswith("1.15."), "Monkey patch is only valid for TF 1.15."


def new_restore_checkpoint(self,
                           master,
                           saver=None,
                           checkpoint_dir=None,
                           checkpoint_filename_with_path=None,
                           wait_for_checkpoint=False,
                           max_wait_secs=7200,
                           config=None):
    """Creates a `Session`, and tries to restore a checkpoint if needed.

    Args:
        master: `String` representation of the TensorFlow master to use.
        saver: A `Saver` object used to restore a model.
Exemplo n.º 5
0
def _build_machine_config(image_uri, config):
    """Builds the CAIP machine-config dict (image + accelerator) for one pool."""
    return {
        "imageUri": image_uri,
        "acceleratorConfig": {
            # The CAIP API expects the accelerator count as a string.
            "count": str(config.accelerator_count),
            "type": gcp.get_accelerator_type(config.accelerator_type.value),
        },
    }


def _create_request_dict(
    job_id,
    region,
    image_uri,
    chief_config,
    worker_count,
    worker_config,
    entry_point_args,
    job_labels=None,
):
    """Creates request dictionary for the CAIP training service.

    Args:
        job_id: String, unique job id.
        region: GCP region name.
        image_uri: The docker image uri.
        chief_config: `MachineConfig` that represents the configuration for
            the chief worker in a distribution cluster.
        worker_count: Integer that represents the number of general workers
            in a distribution cluster. This count does not include the
            chief worker.
        worker_config: `MachineConfig` that represents the configuration for
            the general workers in a distribution cluster.
        entry_point_args: Command line arguments to pass to the
            `entry_point` program.
        job_labels: Optional dict of str: str. Labels to organize jobs. See
            https://cloud.google.com/ai-platform/training/docs/resource-labels.
            Defaults to no labels. (A `None` default replaces the previous
            mutable `{}` default; behavior is unchanged since the value is
            only truthiness-tested, never mutated.)

    Returns:
        The job request dictionary.
    """
    training_input = {
        "region": region,
        "scaleTier": "custom",
        "masterType": gcp.get_machine_type(
            chief_config.cpu_cores, chief_config.memory,
            chief_config.accelerator_type),
        "masterConfig": _build_machine_config(image_uri, chief_config),
        "workerCount": str(worker_count),
    }

    if worker_count > 0:
        training_input["workerType"] = gcp.get_machine_type(
            worker_config.cpu_cores,
            worker_config.memory,
            worker_config.accelerator_type,
        )
        worker_machine_config = _build_machine_config(image_uri, worker_config)
        if machine_config.is_tpu_config(worker_config):
            # AI Platform runtime version spec is required for training
            # on cloud TPUs.
            v = VERSION.split(".")
            worker_machine_config["tpuTfVersion"] = v[0] + "." + v[1]
        training_input["workerConfig"] = worker_machine_config

    if entry_point_args is not None:
        training_input["args"] = entry_point_args

    # This is temporarily required so that the `TF_CONFIG` generated by
    # CAIP uses the keyword 'chief' instead of 'master'.
    training_input["use_chief_in_tf_config"] = True

    request_dict = {"jobId": job_id, "trainingInput": training_input}
    if job_labels:
        request_dict["labels"] = job_labels
    return request_dict
Exemplo n.º 6
0
    def _create_docker_file(self):
        """Creates a Dockerfile for building the training container image.

        Picks a `tensorflow/tensorflow` base image matching the installed TF
        version (falling back to `nightly`/`latest` tags when no matching
        image is published), assembles the Dockerfile lines, and writes them
        to a temp file whose path is stored in `self.docker_file_path`.
        """
        if self.docker_base_image is None:
            # Updating the name for RC's to match with the TF generated RC
            # docker image names.
            tf_version = VERSION.replace("-rc", "rc")
            # Get the TF docker base image to use based on the current
            # TF version.
            self.docker_base_image = "tensorflow/tensorflow:{}".format(tf_version)
            if (
                self.chief_config.accelerator_type
                != machine_config.AcceleratorType.NO_ACCELERATOR
            ):
                self.docker_base_image += "-gpu"

            # Add python 3 tag for TF version <= 2.1.0
            # https://hub.docker.com/r/tensorflow/tensorflow
            if VERSION != "latest":
                v = VERSION.split(".")
                # Compare (major, minor) numerically: the previous
                # `float(v[0] + "." + v[1]) <= 2.1` treated e.g. "2.10"
                # as 2.1 and mis-tagged newer minor versions.
                if (int(v[0]), int(v[1])) <= (2, 1):
                    self.docker_base_image += "-py3"

        if not self._base_image_exist():
            warnings.warn(
                "Docker base image {} does not exist.".format(self.docker_base_image)
            )
            if "dev" in self.docker_base_image:
                # Except for the latest TF nightly, other nightlies
                # do not have corresponding docker images.
                newtag = "nightly"
                if self.docker_base_image.endswith("-gpu"):
                    newtag += "-gpu"
                self.docker_base_image = (
                    self.docker_base_image.split(":")[0] + ":" + newtag
                )
                warnings.warn("Using the latest TF nightly build.")
            else:
                warnings.warn(
                    "Using the latest stable TF docker image available: "
                    "`tensorflow/tensorflow:latest`"
                    "Please see https://hub.docker.com/r/tensorflow/tensorflow/ "
                    "for details on available docker images."
                )
                newtag = "tensorflow/tensorflow:latest"
                if self.docker_base_image.endswith("-gpu"):
                    newtag += "-gpu"
                self.docker_base_image = newtag

        lines = [
            "FROM {}".format(self.docker_base_image),
            "WORKDIR {}".format(self.destination_dir),
        ]

        if self.requirements_txt is not None:
            _, requirements_txt_name = os.path.split(self.requirements_txt)
            dst_requirements_txt = os.path.join(requirements_txt_name)
            requirements_txt_path = os.path.join(
                self.destination_dir, requirements_txt_name
            )
            lines.append(
                "COPY {} {}".format(requirements_txt_path, requirements_txt_path)
            )
            # install pip requirements from requirements_txt if it exists.
            lines.append(
                "RUN if [ -e {} ]; "
                "then pip install --no-cache -r {}; "
                "fi".format(dst_requirements_txt, dst_requirements_txt)
            )
        if self.entry_point is None:
            lines.append("RUN pip install tensorflow-cloud")

        if self.worker_config is not None and machine_config.is_tpu_config(
            self.worker_config
        ):
            lines.append("RUN pip install cloud-tpu-client")

        # Copies the files from the `destination_dir` in docker daemon location
        # to the `destination_dir` in docker container filesystem.
        lines.append("COPY {} {}".format(self.destination_dir, self.destination_dir))

        docker_entry_point = self.preprocessed_entry_point or self.entry_point
        _, docker_entry_point_file_name = os.path.split(docker_entry_point)

        # Using `ENTRYPOINT` here instead of `CMD` specifically because
        # we want to support passing user code flags.
        lines.extend(
            ['ENTRYPOINT ["python", "{}"]'.format(docker_entry_point_file_name)]
        )

        content = "\n".join(lines)
        # `mkstemp` returns an already-open OS-level file descriptor; wrap it
        # with `os.fdopen` so it is written through and closed (discarding the
        # fd and reopening by path leaks a descriptor per call).
        fd, self.docker_file_path = tempfile.mkstemp()
        with os.fdopen(fd, "w") as f:
            f.write(content)