예제 #1
0
    def start(self, project_id, resume=False, cluster=None):
        """
        Starts the Experiment in the cloud (using kubectl).
        The respective cluster is started (if it's not already running).

        Args:
            project_id (str): The remote gcloud project-ID.
            resume (bool): Whether we are resuming an already started (and paused) experiment.
            cluster (str): The name of the cluster to use (will be started if not already running). None for
                using the Experiment's own cluster or - if not given either - a default cluster.
        """

        # Update our cluster spec; only create/start the cluster when this is
        # a fresh run (a resumed experiment's cluster is already up).
        cluster = self.setup_cluster(cluster, project_id, start=not resume)

        # Rewrite our json file to reflect the new status.
        self.status = "running"
        self.write_json_file(file=self.path + self.running_json_file)

        # Render the k8s yaml config file for the experiment.
        print("+ Generating experiment's k8s config file.")
        if self.run_mode == "distributed":
            # Spread the cluster's GPUs evenly over all worker and
            # parameter-server containers.
            gpus_per_container = int(
                cluster.num_gpus /
                (self.num_workers + self.num_parameter_servers))
        else:
            # Single-container run: grant it one node's worth of GPUs.
            gpus_per_container = cluster.gpus_per_node
        util.write_kubernetes_yaml_file(self, self.k8s_config,
                                        gpus_per_container)

        # Remove any stale workloads left over from a previous run.
        print("+ Deleting old Kubernetes Workloads.")
        _ = util.syscall("kubectl delete -f {}".format(self.k8s_config),
                         return_outputs="as_str")

        # TODO: wipe out previous experiments' results

        # Copy all required files to all nodes' disks.
        print("+ Copying all necessary config files to all nodes ...")

        # - create /experiment directory on primary disk
        # - change permissions on the experiment's folder
        # - copy experiment-running config file into /experiment directory
        cluster.ssh_parallel(
            "sudo mount --make-shared /mnt/stateful_partition/",  # make partition shared
            "sudo mkdir /mnt/stateful_partition/experiment/ ; "  # create experiment dir
            "sudo chmod -R 0777 /mnt/stateful_partition/experiment/",  # make writable
            # copy experiment's json file into new dir
            [
                self.path + self.running_json_file,
                "_NODE_:/mnt/stateful_partition/experiment/."
            ],
            silent=False)

        # Create kubernetes services (which will start the experiment).
        print("+ Creating new Kubernetes Services and ReplicaSets.")
        util.syscall("kubectl create -f {}".format(self.k8s_config))
예제 #2
0
 def delete(self):
     """
     Deletes (shuts down) this cluster in the cloud.
     """
     print("+ Deleting cluster {} (async).".format(self.name_hyphenated))
     # Issue an asynchronous delete: gcloud returns immediately instead of
     # waiting for the teardown operation to finish.
     delete_cmd = "gcloud container clusters delete {} --quiet --async".format(
         self.name)
     util.syscall(delete_cmd)
     # Record the new local state of this cluster object.
     self.started = False
     self.deleted = True
예제 #3
0
    def stop(self, no_download=False):
        """
        Stops an already running Experiment by deleting the Kubernetes workload. If no_download is set to False
        (default), will download all results before stopping. If the cluster that the experiment runs on
        is dedicated to this experiment, will also delete the cluster.

        Args:
            no_download (bool): Whether to not(!) download the experiment's results so far (default: False).
        """

        # download data before stopping
        if not no_download:
            self.download()
        if self.status == "stopped":
            warn(
                "WARNING: Experiment seems to be stopped already. Trying anyway. ..."
            )
        # figure out whether cluster was created along with experiment
        # if yes: shut down cluster
        if self.has_dedicated_cluster:
            cluster = get_cluster_from_string(self.cluster.get("name"))
            print("+ Shutting down experiment's cluster {}.".format(
                cluster.name_hyphenated))
            cluster.delete()
        # if not: simply stop k8s jobs
        else:
            print("+ Deleting Kubernetes Workloads.")
            # return_outputs="as_str" for consistency with the other methods
            # in this file (output is discarded either way).
            _ = util.syscall("kubectl delete -f {}".format(self.k8s_config),
                             return_outputs="as_str")

        # persist the new status
        self.status = "stopped"
        self.write_json_file(file=self.path + self.running_json_file)
예제 #4
0
    def pause(self, project_id):
        """
        Pauses the already running Experiment.

        Args:
            project_id (str): The remote gcloud project-ID.
        """
        # Make sure our local kubectl points at the experiment's cluster.
        _ = self.setup_cluster(cluster=None, project_id=project_id)

        # Tear down only the k8s workloads; the cluster itself keeps running.
        print("+ Deleting Kubernetes Workloads.")
        util.syscall("kubectl delete -f {}".format(self.k8s_config))

        # Persist the paused state so a later `start --resume` can pick it up.
        self.status = "paused"
        self.write_json_file(file=self.path + self.running_json_file)

        print(
            "+ Experiment is paused. Resume with `experiment start --resume -e {}`."
            .format(self.name_hyphenated))
예제 #5
0
 def _ssh_parallel_target(self, node, silent, items):
     """
     Runs a sequence of ssh/scp commands against a single compute node.

     Args:
         node (str): The name of the compute instance to talk to.
         silent (bool): Passed through as `return_outputs` to util.syscall
             (controls whether command output is captured/suppressed).
         items: Each item is either a str (a shell command executed on the
             node via `gcloud compute ssh`) or a 2-item list/tuple
             (source, destination) for `gcloud compute scp`; the placeholder
             `_NODE_` in source/destination is replaced with the node's name.

     Raises:
         util.TFCliError: If an item is neither a str nor a 2-item list/tuple.
     """
     for item in items:
         # an ssh command to execute on the node
         if isinstance(item, str):
             _ = util.syscall(
                 "gcloud compute ssh {} {} --command \"{}\"".format(
                     node,
                     "--zone=" + self.location if self.location else "",
                     item),
                 return_outputs=silent)
         # an scp command (copy from ... to ...)
         elif isinstance(item, (list, tuple)) and len(item) == 2:
             # str.replace instead of re.sub: the substitution is literal and
             # re.sub would misinterpret any backslashes in `node` as regex
             # escape sequences in the replacement template.
             item = [i.replace("_NODE_", node) for i in item]
             _ = util.syscall("gcloud compute scp {} {} {}".format(
                 "--zone=" + self.location if self.location else "",
                 item[0], item[1]),
                              return_outputs=silent)
         else:
             raise util.TFCliError(
                 "ERROR: unknown ssh command structure. Needs to be str (ssh-command) "
                 "or list/tuple of exactly 2 str (scp).")
예제 #6
0
    def create(self):
        """
        Create the Kubernetes cluster with the options given in self.
        This also sets up the local kubectl app to point to the new cluster automatically.
        """
        print("+ Creating cluster: {}. This may take a few minutes ...".format(
            self.name_hyphenated))
        # Build the gcloud create command first, then run it once below.
        zone_opt = "--zone " + self.location if self.location else ""
        if self.num_gpus == 0:
            # Plain CPU-only cluster.
            create_cmd = (
                "gcloud container clusters create {} -m {} --disk-size {} --num-nodes {} {}"
                .format(self.name_hyphenated, self.machine_type,
                        self.disk_size, self.num_nodes, zone_opt))
        else:
            # GPU cluster: needs accelerators, the UBUNTU image and alpha k8s features.
            create_cmd = (
                "gcloud container clusters create {} --enable-cloud-logging --enable-cloud-monitoring "
                "--accelerator type={},count={} {} -m {} --disk-size {} --enable-kubernetes-alpha "
                "--image-type UBUNTU --num-nodes {} --cluster-version 1.9.2-gke.1 --quiet"
                .format(self.name_hyphenated, self.gpu_type,
                        self.gpus_per_node, zone_opt, self.machine_type,
                        self.disk_size, self.num_nodes))
        out = util.syscall(create_cmd, return_outputs="as_str")

        # Abort if gcloud reported any error in its output.
        if re.search(r'error', out, re.IGNORECASE):
            raise util.TFCliError(out)
        print("+ Successfully created cluster.")

        self.instances, self.primary_name = util.get_compute_instance_specs(
            self.name_hyphenated)
        self.started = True

        # For GPU clusters: install NVIDIA drivers + k8s device plugins via kubectl.
        if self.num_gpus > 0:
            print("+ Installing NVIDIA GPU drivers and k8s device plugins ...")
            util.syscall(
                "kubectl create -f https://raw.githubusercontent.com/GoogleCloudPlatform/"
                "container-engine-accelerators/k8s-1.9/daemonset.yaml")
            util.syscall(
                "kubectl delete -f https://raw.githubusercontent.com/kubernetes/kubernetes/"
                "release-1.9/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
            )
            util.syscall(
                "kubectl create -f https://raw.githubusercontent.com/kubernetes/kubernetes/"
                "release-1.9/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
            )

        print("+ Done. Cluster: {} created.".format(self.name_hyphenated))
예제 #7
0
def cmd_init(args):
    """
    Initializes a new tensorforce project in the current working directory:
    Creates the local folder structure, copies sample spec/config files,
    checks the gcloud and kubectl installations, logs the user into their
    google cloud service account and links to (or creates) the remote
    gcloud project.

    Args:
        args: The parsed command line arguments (fields used: `force`,
            `remote_project_id`, `name`).

    Raises:
        util.TFCliError: If creating the remote gcloud project fails.
    """
    # check if there is already a .tensorforce file in this folder
    if os.path.isfile(".tensorforce.json"):
        # TODO: read .tensorforce.json file to display project's name and other data
        if not args.force:
            print("WARNING: This directory already contains a tensorforce project. Would you like to overwrite it?")
            response = input(">")
            if response.upper() != "Y":
                quit()
        # erase the existing project file and create a new one
        # (os.remove, not shutil.rmtree: .tensorforce.json is a file, not a
        # directory -> rmtree would raise NotADirectoryError here)
        os.remove(".tensorforce.json")

    print("+ Creating project paths and copying sample spec files.")
    # add sub-dirs to it and write the main project file
    if not os.path.isdir("clusters"):
        os.makedirs("clusters")
    if not os.path.isdir("experiments"):
        os.makedirs("experiments")

    # copy all json example spec files from cloned github repo
    import tensorforce_client
    p = tensorforce_client.__path__[0] + "/configs"
    shutil.rmtree("configs/", ignore_errors=True)
    shutil.copytree("{}".format(p), "configs/")
    # add the experiment jinja file (k8s yaml template) into project's config dir
    shutil.copy("{}/experiment.yaml.jinja".format(p), "configs/")

    print("+ Checking requirements (gcloud and kubectl installations).")
    # check for installations of gcloud, then kubectl
    try:
        out = syscall("gcloud --version", return_outputs="as_str", merge_err=True)
    # Linux: fake Win command not found error
    except OSError:
        out = "not recognized as an internal"
    # re.search (not re.match): the real Windows message ("'gcloud' is not
    # recognized as an internal ...") does not start at position 0, so an
    # anchored match would never detect a missing gcloud installation.
    if re.search(r'not recognized as an internal', out):
        print("INIT ERROR: Installation of gcloud command line tool required.\nPlease install first:"
              " https://cloud.google.com/sdk/docs/quickstarts")
        quit()

    # we can install kubectl via gcloud: `gcloud components install kubectl`
    try:
        out = syscall("kubectl version", return_outputs="as_str", merge_err=True)
    # Linux: fake Win command not found error
    except OSError:
        out = "not recognized as an internal"

    # same reasoning as above: search anywhere in the output
    if re.search(r'not recognized as an internal', out):
        print("++ Installing missing kubectl command line tool (this is necessary to manage your clusters via the"
              " Kubernetes tool):")
        syscall("gcloud components install kubectl")

    # login to google cloud
    print("+ Logging you into google cloud account.")
    while True:
        print("Please enter your remote project's google cloud service account (full email address) here:")
        service_account = input(">")
        if not re.match(r'^[\w\-\.]+\@[\w\-\.]+\.[a-z]+', service_account):
            print("ERROR: The service account needs to be an email address.")
        else:
            break
    while True:
        print("Please enter the location of your private key file associated with this service account:")
        key_file = input(">")
        if not os.path.isfile(key_file):
            print("ERROR: The key_file you entered does not exist or is not a file.")
        else:
            break
    syscall("gcloud auth activate-service-account {} --key-file={}".format(service_account, key_file))

    remote_projects_by_name, remote_projects_by_id = util.get_remote_projects()
    # if remote given -> only check for that one and exit if doesn't exist
    if args.remote_project_id:
        print("+ Checking for existing remote-project ID ({}).".format(args.remote_project_id))
        if args.remote_project_id not in remote_projects_by_id:
            print("ERROR: No remote project ID {} found in cloud!".format(args.remote_project_id))
            quit()
        print("+ Found remote project ID {}.".format(args.remote_project_id))
        remote_project_id = args.remote_project_id
        remote_project_name = remote_projects_by_id[args.remote_project_id]["project-name"]
        # if no name -> take remote's name
        if not args.name:
            args.name = remote_project_name
    # look for existing project in cloud with same name and ask whether to use that one. If not, user can specify
    # a remote project name that may differ from the local project folder's name
    else:
        # if no name -> take folder name
        if not args.name:
            cwd = os.getcwd()
            args.name = re.sub(r'.+[/\\](\w+)$', '\\1', cwd)
            print("+ Name not given. Assuming folder name ({}) is project's name.".format(args.name))

        print("+ Checking name ({}) against existing projects in the cloud.".format(args.name))
        if args.name in remote_projects_by_name:
            remote_project_name = args.name
            remote_project_id = remote_projects_by_name[remote_project_name]["project-id"]
            print("++ Given project name ({}) already exists as a project in google cloud. "
                  "Will use remote project {} with ID {}.".format(args.name, args.name, remote_project_id))
        # TODO: service accounts cannot create projects without a parent
        else:
            remote_project_id = re.sub(r'_', '-', args.name)  # replace all underscores with hyphens
            remote_project_name = remote_project_id
            # what if id already exists -> use a modified one
            if remote_project_id in remote_projects_by_id:
                remote_project_id = remote_project_id + str(time.time())
            print("+ Project '{}' does not exist in cloud yet. Will create new project (with ID {}).".
                  format(args.name, remote_project_id))
            out, err = syscall("gcloud projects create {} --name {} --set-as-default".
                               format(remote_project_id, remote_project_name), return_outputs=True, merge_err=False)
            err_msg = err.raw.readall().decode("latin-1")
            # compare against a str, not b"": err_msg was decoded above, so the
            # original `err_msg != b""` was always True (str never equals
            # bytes) and raised even when gcloud succeeded
            if err_msg != "":
                raise util.TFCliError(err_msg)

    # write project settings into .tensorforce dir
    util.write_project_file(args.name, remote_project_name, remote_project_id)
예제 #8
0
    def setup_cluster(self, cluster, project_id, start=False):
        """
        Given a cluster name (or None) and a remote project-ID,
        sets up the cluster settings for this Experiment locally.
        Also starts the cluster if start is set to True.

        Args:
            cluster (str): The name of the cluster. If None, will get cluster-spec from the Experiment, or create a
                default Cluster object.
            project_id (str): The remote gcloud project ID.
            start (bool): Whether to already create (start) the cluster in the cloud.

        Returns: The Cluster object.

        Raises:
            util.TFCliError: If the cluster is unknown to the cloud or not in status RUNNING.
        """

        clusters = util.get_cluster_specs()

        # cluster is given (separate from experiment's own cluster)
        if cluster:
            cluster = get_cluster_from_string(cluster,
                                              running_clusters=clusters)
            self.has_dedicated_cluster = False
        # use experiment's own cluster
        elif self.cluster:
            cluster = Cluster(running_clusters=clusters, **self.cluster)
            self.has_dedicated_cluster = True
        # use a default cluster
        else:
            cluster = Cluster(name=self.name_hyphenated)
            self.has_dedicated_cluster = True

        # start cluster if not up yet
        if start and not cluster.started:
            cluster.create()
            # refresh the specs: the freshly created cluster is not part of
            # the snapshot taken above, but its master_ip is looked up below
            # (without this, the set-cluster call raised a KeyError)
            clusters = util.get_cluster_specs()
        # cluster should already be up -> verify it's in a good state
        else:
            # .get instead of [] so an unknown cluster yields a clear
            # TFCliError rather than a bare KeyError
            spec = clusters.get(cluster.name_hyphenated)
            if spec is None or spec["status"] != "RUNNING":
                raise util.TFCliError(
                    "ERROR: Given cluster {} is not in status RUNNING (but in status {})!"
                    .format(cluster.name_hyphenated,
                            spec["status"] if spec else "UNKNOWN"))

        # check cluster vs experiment setup and warn or abort if something doesn't match
        if self.run_mode != "distributed" and cluster.num_nodes > 1:
            warn(
                "WARNING: Running non-distributed experiment on cluster with more than 1 node. Make sure you are "
                "not wasting costly resources!")
        num_gpus = cluster.num_nodes * cluster.gpus_per_node
        if self.run_mode == "distributed" and self.num_workers + self.num_parameter_servers > num_gpus:
            warn(
                "WARNING: Running distributed experiment with {} processes total on cluster with only {} GPUs! "
                "This could lead to K8s scheduling problems.".format(
                    self.num_workers + self.num_parameter_servers, num_gpus))

        print("+ Setting up credentials to connect to cluster {}.".format(
            cluster.name_hyphenated))
        util.syscall(
            "gcloud container clusters get-credentials {} --zone {} --project {}"
            .format(cluster.name_hyphenated, cluster.location, project_id))

        print("+ Setting kubectl to point to cluster {}.".format(
            cluster.name_hyphenated))
        util.syscall("kubectl config set-cluster {} --server={}".format(
            cluster.name_hyphenated,
            clusters[cluster.name_hyphenated]["master_ip"]))

        # remember the (possibly newly resolved) cluster spec on the experiment
        self.cluster = cluster.get_spec()
        return cluster