def get_link_flags():
  """Get the link flags for custom operators.

  Returns:
    The link flags.
  """
  # Local import keeps this fix self-contained; the module may not already
  # import `platform`.
  import platform as _plat

  is_mac = _plat.system() == 'Darwin'
  ver = _VERSION.split('.')[0]
  flags = []
  if not _MONOLITHIC_BUILD:
    flags.append('-L%s' % get_lib())
    if is_mac:
      # BUG FIX: the original unconditionally emitted the GNU-ld-only
      # '-l:libtensorflow_framework.so.N' form, which fails on macOS where
      # the library is libtensorflow_framework.N.dylib and Apple's ld does
      # not support '-l:'. '-ltensorflow_framework.N' resolves correctly.
      flags.append('-ltensorflow_framework.%s' % ver)
    else:
      # GNU ld: '-l:name' links the exact file name libtensorflow_framework.so.N.
      flags.append('-l:libtensorflow_framework.so.%s' % ver)
  return flags
def get_link_flags():
  """Get the link flags for custom operators.

  Returns:
    The link flags.
  """
  flags = []
  # Monolithic builds bundle the framework, so no extra linker input is needed.
  if _MONOLITHIC_BUILD:
    return flags
  major = _VERSION.split('.')[0]
  flags.append('-L%s' % get_lib())
  if _platform.system() == 'Darwin':
    # macOS: link against libtensorflow_framework.<major>.dylib.
    flags.append('-ltensorflow_framework.%s' % major)
  else:
    # Linux: GNU ld exact-name syntax for libtensorflow_framework.so.<major>.
    flags.append('-l:libtensorflow_framework.so.%s' % major)
  return flags
def get_link_flags():
  """Get the link flags for custom operators.

  Returns:
    The link flags.
  """
  is_mac = _platform.system() == 'Darwin'
  ver = _VERSION.split('.')[0]
  flags = []
  if not _MONOLITHIC_BUILD:
    flags.append('-L%s' % get_lib())
    if is_mac:
      # BUG FIX: '-l:' is a GNU ld extension that Apple's linker does not
      # understand, so '-l:libtensorflow_framework.%s.dylib' breaks custom-op
      # builds on macOS. The standard '-lname' form makes ld search for
      # libtensorflow_framework.<ver>.dylib.
      flags.append('-ltensorflow_framework.%s' % ver)
    else:
      # GNU ld: link the exact file libtensorflow_framework.so.<ver>.
      flags.append('-l:libtensorflow_framework.so.%s' % ver)
  return flags
# limitations under the License. # coding: utf-8 # pylint: disable=protected-access import time from tensorflow.python.client import session from tensorflow.python.framework import meta_graph, ops from tensorflow.python.framework.versions import VERSION from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import checkpoint_management, session_manager from tensorflow.python.training.basic_session_run_hooks \ import CheckpointSaverHook assert VERSION.startswith("1.15."), "Monkey patch is only valid for TF 1.15." def new_restore_checkpoint(self, master, saver=None, checkpoint_dir=None, checkpoint_filename_with_path=None, wait_for_checkpoint=False, max_wait_secs=7200, config=None): """Creates a `Session`, and tries to restore a checkpoint if needed. Args: master: `String` representation of the TensorFlow master to use. saver: A `Saver` object used to restore a model.
def _create_request_dict(
    job_id,
    region,
    image_uri,
    chief_config,
    worker_count,
    worker_config,
    entry_point_args,
    job_labels=None,
):
    """Creates request dictionary for the CAIP training service.

    Args:
        job_id: String, unique job id.
        region: GCP region name.
        image_uri: The docker image uri.
        chief_config: `MachineConfig` that represents the configuration
            for the chief worker in a distribution cluster.
        worker_count: Integer that represents the number of general workers
            in a distribution cluster. This count does not include the
            chief worker.
        worker_config: `MachineConfig` that represents the configuration
            for the general workers in a distribution cluster.
        entry_point_args: Command line arguments to pass to the
            `entry_point` program.
        job_labels: Optional Dict of str: str. Labels to organize jobs. See
            https://cloud.google.com/ai-platform/training/docs/resource-labels.
            Defaults to None (no labels attached to the request).

    Returns:
        The job request dictionary.
    """
    # FIX: the default was previously a mutable `{}` (shared across calls —
    # a classic Python pitfall). `None` is behaviorally identical here since
    # only truthiness is checked at the bottom.
    training_input = {}
    training_input["region"] = region
    training_input["scaleTier"] = "custom"
    training_input["masterType"] = gcp.get_machine_type(
        chief_config.cpu_cores,
        chief_config.memory,
        chief_config.accelerator_type)

    # Set master (chief) replica config. CAIP expects accelerator counts as
    # strings.
    chief_machine_config = {}
    chief_machine_config["imageUri"] = image_uri
    chief_machine_config["acceleratorConfig"] = {}
    chief_machine_config["acceleratorConfig"]["count"] = str(
        chief_config.accelerator_count)
    chief_machine_config["acceleratorConfig"][
        "type"] = gcp.get_accelerator_type(chief_config.accelerator_type.value)

    training_input["masterConfig"] = chief_machine_config
    training_input["workerCount"] = str(worker_count)

    if worker_count > 0:
        training_input["workerType"] = gcp.get_machine_type(
            worker_config.cpu_cores,
            worker_config.memory,
            worker_config.accelerator_type,
        )

        worker_machine_config = {}
        worker_machine_config["imageUri"] = image_uri
        worker_machine_config["acceleratorConfig"] = {}
        worker_machine_config["acceleratorConfig"]["count"] = str(
            worker_config.accelerator_count)
        worker_machine_config["acceleratorConfig"][
            "type"] = gcp.get_accelerator_type(
                worker_config.accelerator_type.value)

        if machine_config.is_tpu_config(worker_config):
            # AI Platform runtime version spec is required for training
            # on cloud TPUs; derived as "<major>.<minor>" of the TF version.
            v = VERSION.split(".")
            worker_machine_config["tpuTfVersion"] = v[0] + "." + v[1]
        training_input["workerConfig"] = worker_machine_config

    if entry_point_args is not None:
        training_input["args"] = entry_point_args
    # This is temporarily required so that the `TF_CONFIG` generated by
    # CAIP uses the keyword 'chief' instead of 'master'.
    training_input["use_chief_in_tf_config"] = True

    request_dict = {}
    request_dict["jobId"] = job_id
    request_dict["trainingInput"] = training_input
    if job_labels:
        request_dict["labels"] = job_labels
    return request_dict
def _create_docker_file(self):
    """Creates a Dockerfile.

    Picks a `tensorflow/tensorflow` base image matching the installed TF
    version (falling back to a `nightly` or `latest` tag when the exact
    image does not exist), appends COPY/RUN/ENTRYPOINT instructions for the
    user's code and optional requirements file, and writes the result to a
    temp file whose path is stored on `self.docker_file_path`.
    """
    if self.docker_base_image is None:
        # Updating the name for RC's to match with the TF generated RC
        # docker image names (e.g. "2.1.0-rc0" -> "2.1.0rc0").
        tf_version = VERSION.replace("-rc", "rc")
        # Get the TF docker base image to use based on the current
        # TF version.
        self.docker_base_image = "tensorflow/tensorflow:{}".format(tf_version)
        if (
            self.chief_config.accelerator_type
            != machine_config.AcceleratorType.NO_ACCELERATOR
        ):
            self.docker_base_image += "-gpu"
        # Add python 3 tag for TF version <= 2.1.0
        # https://hub.docker.com/r/tensorflow/tensorflow
        if VERSION != "latest":
            v = VERSION.split(".")
            # NOTE(review): float("<major>.<minor>") comparison — works for
            # current version strings but would misorder e.g. "2.10".
            if float(v[0] + "." + v[1]) <= 2.1:
                self.docker_base_image += "-py3"

    # If the chosen image is not available on Docker Hub, fall back to a
    # tag that is expected to exist.
    if not self._base_image_exist():
        warnings.warn(
            "Docker base image {} does not exist.".format(self.docker_base_image)
        )
        if "dev" in self.docker_base_image:
            # Except for the latest TF nightly, other nightlies
            # do not have corresponding docker images.
            newtag = "nightly"
            if self.docker_base_image.endswith("-gpu"):
                newtag += "-gpu"
            self.docker_base_image = (
                self.docker_base_image.split(":")[0] + ":" + newtag
            )
            warnings.warn("Using the latest TF nightly build.")
        else:
            warnings.warn(
                "Using the latest stable TF docker image available: "
                "`tensorflow/tensorflow:latest`"
                "Please see https://hub.docker.com/r/tensorflow/tensorflow/ "
                "for details on available docker images."
            )
            newtag = "tensorflow/tensorflow:latest"
            if self.docker_base_image.endswith("-gpu"):
                newtag += "-gpu"
            self.docker_base_image = newtag

    lines = [
        "FROM {}".format(self.docker_base_image),
        "WORKDIR {}".format(self.destination_dir),
    ]

    if self.requirements_txt is not None:
        _, requirements_txt_name = os.path.split(self.requirements_txt)
        # NOTE(review): single-argument os.path.join is a no-op, so this is
        # just the bare file name — resolved relative to WORKDIR in the RUN
        # existence check below.
        dst_requirements_txt = os.path.join(requirements_txt_name)
        requirements_txt_path = os.path.join(
            self.destination_dir, requirements_txt_name
        )
        # NOTE(review): source and destination are the same destination-dir
        # path — presumably intentional because the docker build context
        # mirrors `destination_dir`; confirm against the build-context setup.
        lines.append(
            "COPY {} {}".format(requirements_txt_path, requirements_txt_path)
        )
        # install pip requirements from requirements_txt if it exists.
        lines.append(
            "RUN if [ -e {} ]; "
            "then pip install --no-cache -r {}; "
            "fi".format(dst_requirements_txt, dst_requirements_txt)
        )

    # No user entry point means tensorflow-cloud itself supplies the run
    # wrapper, so make sure it is installed in the image.
    if self.entry_point is None:
        lines.append("RUN pip install tensorflow-cloud")
    if self.worker_config is not None and machine_config.is_tpu_config(
        self.worker_config
    ):
        lines.append("RUN pip install cloud-tpu-client")

    # Copies the files from the `destination_dir` in docker daemon location
    # to the `destination_dir` in docker container filesystem.
    lines.append("COPY {} {}".format(self.destination_dir, self.destination_dir))

    docker_entry_point = self.preprocessed_entry_point or self.entry_point
    _, docker_entry_point_file_name = os.path.split(docker_entry_point)

    # Using `ENTRYPOINT` here instead of `CMD` specifically because
    # we want to support passing user code flags.
    lines.extend(
        ['ENTRYPOINT ["python", "{}"]'.format(docker_entry_point_file_name)]
    )

    content = "\n".join(lines)
    # mkstemp creates the file; we only keep the path and rewrite it below.
    _, self.docker_file_path = tempfile.mkstemp()
    with open(self.docker_file_path, "w") as f:
        f.write(content)