def get_experiment_from_string(experiment, running=False):
    """
    Returns an Experiment object given a string of either a json file or the name of an already
    existing experiment.

    Args:
        experiment (str): The string to look for (either local json file or local experiment's name).
        running (bool): Whether this experiment is already running.

    Returns:
        The found Experiment object.
    """
    file = "experiments/{}/{}.json".\
        format(experiment, "experiment" if not running else "experiment_running")
    if not os.path.exists(file):
        if running:
            raise util.TFCliError(
                "ERROR: Experiment {} does not seem to be running right now! You have to create, then "
                "start it with 'experiment new/start'.".format(experiment))
        else:
            raise util.TFCliError(
                "ERROR: Experiment {} not found! You have to create it first with 'experiment new'."
                .format(experiment))
    # get the experiment object from its json file
    with open(file) as f:
        spec = json.load(f)
    exp_obj = Experiment(**spec)
    return exp_obj
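# Usage sketch (illustrative only; the experiment name is hypothetical and must have been
# created before with 'experiment new'):
# >>> exp = get_experiment_from_string("my_experiment")
# ...     loads experiments/my_experiment/experiment.json
# >>> exp = get_experiment_from_string("my_experiment", running=True)
# ...     loads experiments/my_experiment/experiment_running.json (raises TFCliError if the
# ...     experiment is not currently running)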
def cmd_cluster_delete(args):
    if args.all:
        clusters = util.get_cluster_specs()
        for c in clusters.values():
            cluster = Cluster(**c)
            cluster.delete()
        print("+ All clusters deleted.")
    else:
        if not args.cluster:
            raise util.TFCliError("ERROR: Cluster name (-c option) not given!")
        print("+ Looking for clusters ...")
        cluster = get_cluster_from_string(args.cluster)
        if not isinstance(cluster, Cluster):
            raise util.TFCliError("ERROR: No cluster with name {} found!".format(args.cluster))
        cluster.delete()
def get_cluster_from_string(cluster, running_clusters=None):
    """
    Returns a Cluster object given a string of either a json file or an already running remote
    cluster's name.

    Args:
        cluster (str): The string to look for (either local json file or remote cluster's name).
        running_clusters (dict): Specs for already running cloud clusters by cluster name.

    Returns:
        The found Cluster object.
    """
    # no running clusters given -> get them now
    if not running_clusters:
        running_clusters = util.get_cluster_specs()
    # json file (get spec)
    if re.search(r'\.json$', cluster):
        cluster = Cluster(running_clusters=running_clusters, file=cluster,
                          **util.read_json_spec(cluster, "clusters"))
    # cluster name (cluster must already exist in cloud)
    else:
        cluster_name = re.sub(r'_', '-', cluster)
        if cluster_name in running_clusters:
            cluster = Cluster(running_clusters=running_clusters, **running_clusters[cluster_name])
        else:
            raise util.TFCliError("ERROR: Given cluster {} not found in cloud!".format(cluster_name))
    return cluster
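# Usage sketch (illustrative only; file name and cluster name are hypothetical):
# >>> c = get_cluster_from_string("clusters/small_cluster.json")  # build Cluster from a local json spec
# >>> c = get_cluster_from_string("my_cluster")                   # look up running cluster "my-cluster"
# ...     underscores in the name are converted to hyphens; raises TFCliError if no such
# ...     cluster is currently running in the cloud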
def create(self):
    """
    Create the Kubernetes cluster with the options given in self.
    This also sets up the local kubectl app to point to the new cluster automatically.
    """
    print("+ Creating cluster: {}. This may take a few minutes ...".format(self.name_hyphenated))
    if self.num_gpus == 0:
        out = util.syscall(
            "gcloud container clusters create {} -m {} --disk-size {} --num-nodes {} {}".format(
                self.name_hyphenated, self.machine_type, self.disk_size, self.num_nodes,
                "--zone " + self.location if self.location else ""),
            return_outputs="as_str")
    else:
        out = util.syscall(
            "gcloud container clusters create {} --enable-cloud-logging --enable-cloud-monitoring "
            "--accelerator type={},count={} {} -m {} --disk-size {} --enable-kubernetes-alpha "
            "--image-type UBUNTU --num-nodes {} --cluster-version 1.9.2-gke.1 --quiet".format(
                self.name_hyphenated, self.gpu_type, self.gpus_per_node,
                "--zone " + self.location if self.location else "",
                self.machine_type, self.disk_size, self.num_nodes),
            return_outputs="as_str")
    # check output of cluster generating code
    if re.search(r'error', out, re.IGNORECASE):
        raise util.TFCliError(out)
    else:
        print("+ Successfully created cluster.")
    self.instances, self.primary_name = util.get_compute_instance_specs(self.name_hyphenated)
    self.started = True

    # install NVIDIA drivers on machines per local kubectl
    if self.num_gpus > 0:
        print("+ Installing NVIDIA GPU drivers and k8s device plugins ...")
        util.syscall(
            "kubectl create -f https://raw.githubusercontent.com/GoogleCloudPlatform/"
            "container-engine-accelerators/k8s-1.9/daemonset.yaml")
        util.syscall(
            "kubectl delete -f https://raw.githubusercontent.com/kubernetes/kubernetes/"
            "release-1.9/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml")
        util.syscall(
            "kubectl create -f https://raw.githubusercontent.com/kubernetes/kubernetes/"
            "release-1.9/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml")

    print("+ Done. Cluster: {} created.".format(self.name_hyphenated))
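# For reference, a CPU-only Cluster(name="test_cluster", machine_type="n1-standard-4",
# disk_size=100, num_nodes=3, location="us-central1-a") would, per the format string above,
# issue roughly (values are illustrative, not taken from a real spec):
#   gcloud container clusters create test-cluster -m n1-standard-4 --disk-size 100 \
#       --num-nodes 3 --zone us-central1-a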
def cmd_experiment_new(args, project_id=None):
    # check for experiment already existing
    experiments = get_local_experiments()
    # setup the Experiment object
    experiment = Experiment(**args.__dict__)
    if experiment.name in experiments:
        print("ERROR: An experiment with the name {} already exists in this project! "
              "Use `experiment start` to start it.".format(experiment.name))
        return
    # write experiment files to local disk
    experiment.generate_locally()
    # and start the experiment?
    if args.start:
        if not project_id:
            raise util.TFCliError("ERROR: Cannot start experiment without remote project ID!")
        print("+ New experiment created. Starting ...")
        experiment.start(project_id)
    else:
        print("+ New experiment created. Use 'experiment start' to run.")
def _ssh_parallel_target(self, node, silent, items):
    for item in items:
        # an ssh command to execute on the node
        if isinstance(item, str):
            _ = util.syscall("gcloud compute ssh {} {} --command \"{}\"".format(
                node, "--zone=" + self.location if self.location else "", item),
                return_outputs=silent)
        # an scp command (copy from ... to ...)
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            item = list(map(lambda i: re.sub(r'_NODE_', node, i), item))
            _ = util.syscall("gcloud compute scp {} {} {}".format(
                "--zone=" + self.location if self.location else "", item[0], item[1]),
                return_outputs=silent)
        else:
            raise util.TFCliError(
                "ERROR: unknown ssh command structure. Needs to be str (ssh-command) "
                "or list/tuple of exactly 2 str (scp).")
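# The `items` argument mixes plain ssh commands (str) and scp pairs (2-tuples); the literal
# token _NODE_ in scp paths is replaced by the node's name. Illustrative example (node name
# and paths are hypothetical):
# >>> self._ssh_parallel_target(
# ...     "my-cluster-node-0", silent=True,
# ...     items=["sudo apt-get update",                                  # ssh command
# ...            ("experiments/my_exp/experiment.json", "_NODE_:~/")])   # scp (from, to)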
def cmd_init(args):
    # check if there is already a .tensorforce.json file in this folder
    if os.path.isfile(".tensorforce.json"):
        # TODO: read .tensorforce.json file to display project's name and other data
        if not args.force:
            print("WARNING: This directory already contains a tensorforce project. "
                  "Would you like to overwrite it?")
            response = input(">")
            if response.upper() != "Y":
                quit()
        # erase the existing project file and create a new one (it is a single file, not a dir)
        os.remove(".tensorforce.json")

    print("+ Creating project paths and copying sample spec files.")
    # add sub-dirs to it and write the main project file
    if not os.path.isdir("clusters"):
        os.makedirs("clusters")
    if not os.path.isdir("experiments"):
        os.makedirs("experiments")

    # copy all json example spec files that ship with the tensorforce_client package
    import tensorforce_client
    p = tensorforce_client.__path__[0] + "/configs"
    shutil.rmtree("configs/", ignore_errors=True)
    shutil.copytree("{}".format(p), "configs/")
    # add the experiment jinja file (k8s yaml template) into the project's config dir
    shutil.copy("{}/experiment.yaml.jinja".format(p), "configs/")

    print("+ Checking requirements (gcloud and kubectl installations).")
    # check for installations of gcloud, then kubectl
    try:
        out = syscall("gcloud --version", return_outputs="as_str", merge_err=True)
    # Linux: fake the Windows 'command not found' error
    except OSError:
        out = "not recognized as an internal"
    if re.search(r'not recognized as an internal', out):
        print("INIT ERROR: Installation of gcloud command line tool required.\nPlease install first:"
              " https://cloud.google.com/sdk/docs/quickstarts")
        quit()
    # we can install kubectl via gcloud: `gcloud components install kubectl`
    try:
        out = syscall("kubectl version", return_outputs="as_str", merge_err=True)
    # Linux: fake the Windows 'command not found' error
    except OSError:
        out = "not recognized as an internal"
    if re.search(r'not recognized as an internal', out):
        print("++ Installing missing kubectl command line tool (this is necessary to manage your clusters via the"
              " Kubernetes tool):")
        syscall("gcloud components install kubectl")

    # log in to google cloud
    print("+ Logging you into google cloud account.")
    while True:
        print("Please enter your remote project's google cloud service account (full email address) here:")
        service_account = input(">")
        if not re.match(r'^[\w\-\.]+\@[\w\-\.]+\.[a-z]+', service_account):
            print("ERROR: The service account needs to be an email address.")
        else:
            break
    while True:
        print("Please enter the location of your private key file associated with this service account:")
        key_file = input(">")
        if not os.path.isfile(key_file):
            print("ERROR: The key_file you entered does not exist or is not a file.")
        else:
            break
    # e.g. kubernetes-account@introkubernetes-191608.iam.gserviceaccount.com
    # e.g. l:/programming/privatekeys/MaRLEnE-bbad55cddab1.json
    syscall("gcloud auth activate-service-account {} --key-file={}".format(service_account, key_file))

    remote_projects_by_name, remote_projects_by_id = util.get_remote_projects()
    # if a remote project ID is given -> only check for that one and exit if it doesn't exist
    if args.remote_project_id:
        print("+ Checking for existing remote-project ID ({}).".format(args.remote_project_id))
        if args.remote_project_id not in remote_projects_by_id:
            print("ERROR: No remote project ID {} found in cloud!".format(args.remote_project_id))
            quit()
        print("+ Found remote project ID {}.".format(args.remote_project_id))
        remote_project_id = args.remote_project_id
        remote_project_name = remote_projects_by_id[args.remote_project_id]["project-name"]
        # if no name given -> take the remote project's name
        if not args.name:
            args.name = remote_project_name
    # look for existing project in cloud with same name and ask whether to use that one. If not, user can specify
    # a remote project name that may differ from the local project folder's name
    else:
        # if no name -> take folder name
        if not args.name:
            cwd = os.getcwd()
            args.name = re.sub(r'.+[/\\](\w+)$', '\\1', cwd)
            print("+ Name not given. Assuming folder name ({}) is project's name.".format(args.name))
        print("+ Checking name ({}) against existing projects in the cloud.".format(args.name))
        if args.name in remote_projects_by_name:
            remote_project_name = args.name
            remote_project_id = remote_projects_by_name[remote_project_name]["project-id"]
            print("++ Given project name ({}) already exists as a project in google cloud. "
                  "Will use remote project {} with ID {}.".format(args.name, args.name, remote_project_id))
        # TODO: service accounts cannot create projects without a parent
        else:
            remote_project_id = re.sub(r'_', '-', args.name)  # replace all underscores with hyphens
            remote_project_name = remote_project_id
            # if the id already exists -> use a modified one
            if remote_project_id in remote_projects_by_id:
                remote_project_id = remote_project_id + str(int(time.time()))
            print("+ Project '{}' does not exist in cloud yet. Will create new project (with ID {}).".
                  format(args.name, remote_project_id))
            out, err = syscall("gcloud projects create {} --name {} --set-as-default".
                               format(remote_project_id, remote_project_name),
                               return_outputs=True, merge_err=False)
            err_msg = err.raw.readall().decode("latin-1")
            if err_msg != "":
                raise util.TFCliError(err_msg)

    # write project settings into the local .tensorforce.json file
    util.write_project_file(args.name, remote_project_name, remote_project_id)
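# Example of the project-ID derivation above (illustrative): a local project folder named
# "my_rl_project" yields remote_project_name/remote_project_id "my-rl-project"; if that ID
# already exists in the cloud, the current unix time is appended, e.g. "my-rl-project1518076800".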
def __init__(self, **kwargs):
    """
    Keyword Args:
        file (str): The Experiment's json spec file (can contain all other args).
        name (str): The name of the Experiment. This is also the name of the folder where it is stored.
        environment (str): The filename of the json env-spec file to use (see TensorForce documentation).
        agent (str): The filename of the json agent-spec file to use (see TensorForce documentation).
        network (str): The filename of the json network-spec file to use (see TensorForce documentation).
        cluster (str): The filename of the json cluster-spec file to use (see class `Cluster`).
        episodes (int): The total number of episodes to run (all parallel agents).
        total_timesteps (int): The max. total number of timesteps to run (all parallel agents).
        max_timesteps_per_episode (int): The max. number of timesteps to run in each episode.
        deterministic (bool): Whether to not(!) use stochastic exploration on top of plain action outputs.
        repeat_actions (int): The number of actions to repeat for each action selection (by calling agent.act()).
        debug_logging (bool): Whether to switch on debug logging (default: False).
        run_mode (str): Which runner mode to use. Valid values are only 'single', 'multi-threaded' and
            'distributed'.
        num_workers (int): The number of worker processes to use (see 'distributed' and 'multi-threaded'
            run_modes).
        num_parameter_servers (int): The number of parameter servers to use (see distributed tensorflow).
        saver_frequency (str): The frequency with which to save the model. This is a combination of an int
            and a unit (e.g. "600s"), where unit can be "s" (seconds), "e" (episodes), or "t" (timesteps).
        summary_frequency (str): The frequency with which to save a tensorboard summary. This is a combination
            of an int and a unit (e.g. "600s"), where unit can be "s" (seconds) or "t" (timesteps). The episode
            unit (e) is not allowed here.
    """
    # see whether we have a json (yaml?) file for the experiment
    # TODO: yaml support
    self.file = kwargs.get("file")
    if self.file:
        from_json = util.read_json_spec(self.file, "experiments")
    # get all attributes from kwargs
    else:
        from_json = {}

    # From here on, give kwargs priority over the spec (from file), so that single settings in the json file
    # can be overwritten via the command line.

    # sanity check name
    self.name = kwargs.get("name") or from_json.get("name", "")
    if not re.match(r'^\w+$', self.name):
        raise util.TFCliError("ERROR: Name of experiment needs to be all alphanumeric characters!")
    self.name_hyphenated = re.sub(r'_', '-', self.name)
    self.path = "experiments/{}/".format(self.name)
    self.k8s_config = "{}experiment.yaml".format(self.path)

    # read in sub-spec files (to JSON)
    self.environment = kwargs.get("environment") or from_json.get("environment")
    if isinstance(self.environment, str):
        self.environment = util.read_json_spec(self.environment, "environments")
    if self.environment.get("remote") and not self.environment.get("image"):
        raise util.TFCliError(
            "WARNING: Defining a remote environment without a docker image in experiment spec! "
            "Use field `image` to define a docker image for the remote env.")

    self.network = kwargs.get("network") or from_json.get("network")
    if isinstance(self.network, str):
        self.network = util.read_json_spec(self.network, "networks")

    self.agent = kwargs.get("agent") or from_json.get("agent")
    if isinstance(self.agent, str):
        self.agent = util.read_json_spec(self.agent, "agents")

    self.cluster = kwargs.get("cluster") or from_json.get("cluster")
    if isinstance(self.cluster, str):
        cluster = get_cluster_from_string(self.cluster)
        self.cluster = cluster.get_spec()
    elif not isinstance(self.cluster, dict):
        raise util.TFCliError("ERROR: Cluster (-c option) has to be given as json filename.")

    self.episodes = kwargs.get("episodes") or from_json.get("episodes", 10000)
    self.total_timesteps = kwargs.get("total_timesteps") or from_json.get("total_timesteps", 1000000)
    self.max_timesteps_per_episode = kwargs.get("max_timesteps_per_episode") or \
        from_json.get("max_timesteps_per_episode", 1000)
    self.deterministic = kwargs.get("deterministic")
    if self.deterministic is None:
        self.deterministic = from_json.get("deterministic", False)
    self.repeat_actions = kwargs.get("repeat_actions") or from_json.get("repeat_actions", 1)
    self.num_workers = kwargs.get("num_workers") or from_json.get("num_workers", 3)
    self.num_parameter_servers = kwargs.get("num_parameter_servers") or from_json.get("num_parameter_servers", 1)

    # update our json file pointer and write us into the experiment's dir
    self.file = "{}experiment.json".format(self.path)

    self.debug_logging = kwargs.get("debug_logging") or from_json.get("debug_logging", False)

    # the experiment's run mode
    self.run_mode = kwargs.get("run_mode") or from_json.get("run_mode", "distributed")
    assert self.run_mode in ["distributed", "multi-threaded", "single"], \
        "ERROR: run-mode needs to be one of distributed|multi-threaded|single!"
    if self.run_mode == "distributed" and self.num_parameter_servers <= 0:
        raise util.TFCliError("ERROR: Cannot create experiment of run-mode=distributed and zero parameter servers!")

    self.saver_frequency = kwargs.get("saver_frequency") \
        or from_json.get("saver_frequency", "600s" if self.run_mode == "distributed" else "100e")
    self.summary_frequency = kwargs.get("summary_frequency") \
        or from_json.get("summary_frequency", "120s" if self.run_mode == "distributed" else "10e")

    # whether this experiment runs on a dedicated cluster
    self.has_dedicated_cluster = kwargs.get("has_dedicated_cluster") or from_json.get("has_dedicated_cluster", True)

    # status (running, paused, stopped, etc..)
    self.status = kwargs.get("status") or from_json.get("status", None)

    # json file specific to a certain experiment 'run' (e.g. cluster may differ from experiment's base config)
    self.running_json_file = "experiment_running.json"
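# Minimal example of an experiments/<name>/experiment.json spec that this c'tor accepts
# (illustrative only; the referenced sub-spec file names are hypothetical, the numeric values
# shown are the defaults used above):
# {
#     "name": "my_experiment",
#     "environment": "configs/environments/my_env.json",
#     "agent": "configs/agents/my_agent.json",
#     "network": "configs/networks/my_network.json",
#     "cluster": "clusters/small_cluster.json",
#     "episodes": 10000,
#     "total_timesteps": 1000000,
#     "max_timesteps_per_episode": 1000,
#     "run_mode": "distributed",
#     "num_workers": 3,
#     "num_parameter_servers": 1,
#     "saver_frequency": "600s",
#     "summary_frequency": "120s"
# }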
def setup_cluster(self, cluster, project_id, start=False):
    """
    Given a cluster name (or None) and a remote project-ID, sets up the cluster settings for this
    Experiment locally. Also starts the cluster if `start` is set to True.

    Args:
        cluster (str): The name of the cluster. If None, will get the cluster-spec from the Experiment,
            or create a default Cluster object.
        project_id (str): The remote gcloud project ID.
        start (bool): Whether to already create (start) the cluster in the cloud.

    Returns:
        The Cluster object.
    """
    clusters = util.get_cluster_specs()

    # cluster is given (separate from experiment's own cluster)
    if cluster:
        cluster = get_cluster_from_string(cluster, running_clusters=clusters)
        self.has_dedicated_cluster = False
    # use experiment's own cluster
    elif self.cluster:
        cluster = Cluster(running_clusters=clusters, **self.cluster)
        self.has_dedicated_cluster = True
    # use a default cluster
    else:
        cluster = Cluster(name=self.name_hyphenated)
        self.has_dedicated_cluster = True

    # start cluster if not up yet
    if start and not cluster.started:
        cluster.create()
    # cluster up but not in good state
    elif clusters[cluster.name_hyphenated]["status"] != "RUNNING":
        raise util.TFCliError(
            "ERROR: Given cluster {} is not in status RUNNING (but in status {})!".format(
                cluster.name_hyphenated, clusters[cluster.name_hyphenated]["status"]))

    # check cluster vs experiment setup and warn or abort if something doesn't match
    if self.run_mode != "distributed" and cluster.num_nodes > 1:
        warn("WARNING: Running non-distributed experiment on cluster with more than 1 node. "
             "Make sure you are not wasting costly resources!")
    num_gpus = cluster.num_nodes * cluster.gpus_per_node
    if self.run_mode == "distributed" and self.num_workers + self.num_parameter_servers > num_gpus:
        warn("WARNING: Running distributed experiment with {} processes total on cluster with only {} GPUs! "
             "This could lead to K8s scheduling problems.".format(
                 self.num_workers + self.num_parameter_servers, num_gpus))

    print("+ Setting up credentials to connect to cluster {}.".format(cluster.name_hyphenated))
    util.syscall("gcloud container clusters get-credentials {} --zone {} --project {}".format(
        cluster.name_hyphenated, cluster.location, project_id))
    print("+ Setting kubectl to point to cluster {}.".format(cluster.name_hyphenated))
    util.syscall("kubectl config set-cluster {} --server={}".format(
        cluster.name_hyphenated, clusters[cluster.name_hyphenated]["master_ip"]))

    self.cluster = cluster.get_spec()
    return cluster
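# Usage sketch (illustrative; experiment, cluster and project names are hypothetical):
# >>> exp = get_experiment_from_string("my_experiment")
# >>> cluster = exp.setup_cluster("my_cluster", project_id="my-gcp-project", start=True)
# ...     creates the cluster if it is not up yet, points gcloud/kubectl credentials at it,
# ...     and stores its spec in exp.cluster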
def __init__(self, **kwargs):
    r"""
    A cloud cluster object specifying things like: number of nodes, GPUs per node and GPU type,
    memory per node, disk size, zone, etc..

    Args:
        kwargs (any): See below.

    Keyword Args:
        file (str): The filename of a cluster spec json file to use. Single settings in this file can be
            overwritten by specifying these in further kwargs to this c'tor.
        name (str): The name of the cluster.
        machine_type (str): The machine type to use for all nodes in the cluster. Machine types can either be
            gcloud-accepted strings such as everything listed in `gcloud compute machine-types list` or custom
            strings that conform to these rules:
            https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type.
            When the kwargs `cpus_per_node` and `memory_per_node` are given, tensorforce-client will
            automatically create the correct machine-type.
        cpus_per_node (int): The number of vCPUs per node.
        gpus_per_node (int): The number of (physical) GPUs per node.
        gpu_type (str): The GPU type to use. Supported are only 'nvidia-tesla-k80' and 'nvidia-tesla-p100'.
        memory_per_node (int): The memory (in Gb) per node.
        num_nodes (int): The number of nodes for the cluster.
        disk_size (int): The amount of disk space per node in Gb.
        location (str): The location of the cluster. Defaults to the gcloud project's default zone.
    """
    self.file = kwargs.get("file")
    if self.file:
        from_json = util.read_json_spec(self.file, "clusters")
    # get all attributes from kwargs
    else:
        from_json = {}

    self.name = kwargs.get("name") or from_json.get("name")
    if not self.name:
        raise util.TFCliError("ERROR: Cluster requires a name!")
    self.name_hyphenated = re.sub(r'_', '-', self.name)

    self.machine_type = kwargs.get("machine_type") or from_json.get("machine_type")
    # alternative to machine_type -> provide `cpus_per_node` and `memory_per_node`
    if not self.machine_type:
        cpus = kwargs.get("cpus_per_node") or from_json.get("cpus_per_node")
        mem = kwargs.get("memory_per_node") or from_json.get("memory_per_node")
        if not cpus or not mem:
            raise util.TFCliError(
                "ERROR: no cpus_per_node OR no memory_per_node given for cluster {}".format(self.name))
        self.machine_type = "custom-{}-{}".format(cpus, mem * 1024)

    self.num_nodes = kwargs.get("num_nodes") or from_json.get("num_nodes", 3)

    self.num_gpus = 0
    self.gpu_type = None
    self.gpus_per_node = kwargs.get("gpus_per_node") or from_json.get("gpus_per_node", 0)
    if self.gpus_per_node > 0:
        self.gpu_type = kwargs.get("gpu_type") or from_json.get("gpu_type", "nvidia-tesla-k80")
        self.num_gpus = self.gpus_per_node * self.num_nodes

    # size of single disks (one per node)
    self.disk_size = kwargs.get("disk_size") or from_json.get("disk_size", 100)
    self.location = kwargs.get("location") or from_json.get("location")

    # add information from running clusters
    if "running_clusters" in kwargs:
        self.instances, self.primary_name = util.get_compute_instance_specs(self.name_hyphenated)
        self.started = True if self.instances else False
    # cluster is not running yet
    else:
        self.instances = None
        self.primary_name = None
        self.started = False

    self.deleted = False  # is terminated or being shut down right now?
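# Example clusters/<name>.json spec (illustrative values only): omitting "machine_type" and
# giving "cpus_per_node"/"memory_per_node" instead makes the c'tor derive a custom machine
# type, here "custom-4-16384" (memory_per_node is given in Gb and multiplied by 1024).
# {
#     "name": "small_cluster",
#     "cpus_per_node": 4,
#     "memory_per_node": 16,
#     "num_nodes": 3,
#     "gpus_per_node": 0,
#     "disk_size": 100,
#     "location": "us-central1-a"
# }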