Example #1
def get_experiment_from_string(experiment, running=False):
    """
    Returns an Experiment object given a string of either a json file or the name of an already existing experiment.

    Args:
        experiment (str): The string to look for (either local json file or local experiment's name)
        running (bool): Whether this experiment is already running.

    Returns:
        The found Experiment object.
    """
    file = "experiments/{}/{}.json". \
        format(experiment, "experiment" if not running else "experiment_running")
    if not os.path.exists(file):
        if running:
            raise util.TFCliError(
                "ERROR: Experiment {} does not seem to be running right now! You have to create, then"
                "start it with 'experiment new/start'.".format(experiment))
        else:
            raise util.TFCliError(
                "ERROR: Experiment {} not found! You have to create it first with 'experiment new'."
                .format(experiment))
    # get the experiment object from its json file
    with open(file) as f:
        spec = json.load(f)
        exp_obj = Experiment(**spec)

    return exp_obj
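A minimal usage sketch, assuming the function above is in scope and a local experiment folder exists ("my_experiment" is a hypothetical name):

# hypothetical experiment name -> looks up experiments/my_experiment/experiment.json
exp = get_experiment_from_string("my_experiment")
# for an experiment that is already running, experiment_running.json is looked up instead
running_exp = get_experiment_from_string("my_experiment", running=True)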
Example #2
def cmd_cluster_delete(args):
    if args.all:
        clusters = util.get_cluster_specs()
        for c in clusters.values():
            cluster = Cluster(**c)
            cluster.delete()
        print("+ All clusters deleted.")
    else:
        if not args.cluster:
            raise util.TFCliError("ERROR: Cluster name (-c option) not given!")
        print("+ Looking for clusters ...")
        cluster = get_cluster_from_string(args.cluster)
        if not isinstance(cluster, Cluster):
            raise util.TFCliError("ERROR: No cluster with name {} found!".format(args.cluster))
        cluster.delete()
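A hedged sketch of how the argparse namespace for this command could be wired; the flag names are assumptions derived from the attributes used above (args.all, args.cluster, and the "-c option" mentioned in the error message):

import argparse

parser = argparse.ArgumentParser(description="delete one or all clusters (sketch)")
parser.add_argument("-c", "--cluster", help="name of the cluster to delete")
parser.add_argument("--all", action="store_true", help="delete all running clusters")

# delete a single (hypothetical) cluster by name
cmd_cluster_delete(parser.parse_args(["-c", "my-cluster"]))
# or delete everything
cmd_cluster_delete(parser.parse_args(["--all"]))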
Example #3
def get_cluster_from_string(cluster, running_clusters=None):
    """
    Returns a Cluster object given a string of either a json file or an already running remote cluster's name.

    Args:
        cluster (str): The string to look for (either local json file or remote cluster's name)
        running_clusters (dict): Specs for already running cloud clusters by cluster name.

    Returns:
        The found Cluster object.
    """
    # no running clusters given -> get them now
    if not running_clusters:
        running_clusters = util.get_cluster_specs()

    # json file (get spec)
    if re.search(r'\.json$', cluster):
        cluster = Cluster(running_clusters=running_clusters,
                          file=cluster,
                          **util.read_json_spec(cluster, "clusters"))
    # cluster name (cluster must already exist in the cloud)
    else:
        cluster_name = re.sub(r'_', '-', cluster)
        if cluster_name in running_clusters:
            cluster = Cluster(running_clusters=running_clusters,
                              **running_clusters[cluster_name])
        else:
            raise util.TFCliError(
                "ERROR: Given cluster {} not found in cloud!".format(
                    cluster_name))
    return cluster
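A short usage sketch; both inputs are hypothetical and must exist either as a local json spec or as a running cloud cluster:

# resolve via a local json spec file (matched by the trailing ".json")
cluster = get_cluster_from_string("my_cluster.json")
# resolve via the name of a cluster that is already running in the cloud
cluster = get_cluster_from_string("my_cluster")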
Example #4
    def create(self):
        """
        Create the Kubernetes cluster with the options given in self.
        This also sets up the local kubectl app to point to the new cluster automatically.
        """
        print("+ Creating cluster: {}. This may take a few minutes ...".format(
            self.name_hyphenated))
        if self.num_gpus == 0:
            out = util.syscall(
                "gcloud container clusters create {} -m {} --disk-size {} --num-nodes {} {}"
                .format(self.name_hyphenated, self.machine_type,
                        self.disk_size, self.num_nodes,
                        "--zone " + self.location if self.location else ""),
                return_outputs="as_str")
        else:
            out = util.syscall(
                "gcloud container clusters create {} --enable-cloud-logging --enable-cloud-monitoring "
                "--accelerator type={},count={} {} -m {} --disk-size {} --enable-kubernetes-alpha "
                "--image-type UBUNTU --num-nodes {} --cluster-version 1.9.2-gke.1 --quiet"
                .format(self.name_hyphenated, self.gpu_type,
                        self.gpus_per_node,
                        "--zone " + self.location if self.location else "",
                        self.machine_type, self.disk_size, self.num_nodes),
                return_outputs="as_str")
        # check the output of the cluster creation command for errors
        if re.search(r'error', out, re.IGNORECASE):
            raise util.TFCliError(out)
        else:
            print("+ Successfully created cluster.")
        self.instances, self.primary_name = util.get_compute_instance_specs(
            self.name_hyphenated)
        self.started = True

        # install NVIDIA drivers on machines per local kubectl
        if self.num_gpus > 0:
            print("+ Installing NVIDIA GPU drivers and k8s device plugins ...")
            util.syscall(
                "kubectl create -f https://raw.githubusercontent.com/GoogleCloudPlatform/"
                "container-engine-accelerators/k8s-1.9/daemonset.yaml")
            util.syscall(
                "kubectl delete -f https://raw.githubusercontent.com/kubernetes/kubernetes/"
                "release-1.9/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
            )
            util.syscall(
                "kubectl create -f https://raw.githubusercontent.com/kubernetes/kubernetes/"
                "release-1.9/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
            )

        print("+ Done. Cluster: {} created.".format(self.name_hyphenated))
Example #5
def cmd_experiment_new(args, project_id=None):
    # check whether an experiment with this name already exists
    experiments = get_local_experiments()
    # setup the Experiment object
    experiment = Experiment(**args.__dict__)
    if experiment.name in experiments:
        print("ERROR: An experiment with the name {} already exists in this project! "
              "Use `experiment start` to start it.".format(experiment.name))
        return
    # write experiment files to local disk
    experiment.generate_locally()
    # and start the experiment?
    if args.start:
        if not project_id:
            raise util.TFCliError("ERROR: Cannot start experiment without remote project ID!")
        print("+ New experiment created. Starting ...")
        experiment.start(project_id)
    else:
        print("+ New experiment created. Use 'experiment start' to run.")
Example #6
    def _ssh_parallel_target(self, node, silent, items):
        for item in items:
            # an ssh command to execute on the node
            if isinstance(item, str):
                _ = util.syscall(
                    "gcloud compute ssh {} {} --command \"{}\"".format(
                        node,
                        "--zone=" + self.location if self.location else "",
                        item),
                    return_outputs=silent)
            # an scp command (copy from ... to ...)
            elif isinstance(item, (list, tuple)) and len(item) == 2:
                item = list(map(lambda i: re.sub(r'_NODE_', node, i), item))
                _ = util.syscall("gcloud compute scp {} {} {}".format(
                    "--zone=" + self.location if self.location else "",
                    item[0], item[1]),
                                 return_outputs=silent)
            else:
                raise util.TFCliError(
                    "ERROR: unknown ssh command structure. Needs to be str (ssh-command) "
                    "or list/tuple of exactly 2 str (scp).")
Example #7
def cmd_init(args):
    # check if there is already a .tensorforce file in this folder
    if os.path.isfile(".tensorforce.json"):
        # TODO: read .tensorforce.json file to display project's name and other data
        if not args.force:
            print("WARNING: This directory already contains a tensorforce project. Would you like to overwrite it?")
            response = input(">")
            if response.upper() != "Y":
                quit()
        # remove the existing project file; a new one is written below
        os.remove(".tensorforce.json")

    print("+ Creating project paths and copying sample spec files.")
    # add sub-dirs to it and write the main project file
    if not os.path.isdir("clusters"):
        os.makedirs("clusters")
    if not os.path.isdir("experiments"):
        os.makedirs("experiments")

    # copy all json example spec files from cloned github repo
    import tensorforce_client
    p = tensorforce_client.__path__[0] + "/configs"
    shutil.rmtree("configs/", ignore_errors=True)
    shutil.copytree("{}".format(p), "configs/")
    # add the experiment jinja file (k8s yaml template) into project's config dir
    shutil.copy("{}/experiment.yaml.jinja".format(p), "configs/")

    print("+ Checking requirements (gcloud and kubectl installations).")
    # check for installations of gcloud, then kubectl
    try:
        out = syscall("gcloud --version", return_outputs="as_str", merge_err=True)
    # on Linux, a missing gcloud binary raises OSError -> emulate the Windows "not recognized" message
    except OSError:
        out = "not recognized as an internal"
    if re.search(r'not recognized as an internal', out):
        print("INIT ERROR: Installation of gcloud command line tool required.\nPlease install first:"
              " https://cloud.google.com/sdk/docs/quickstarts")
        quit()

    # we can install kubectl via gcloud: `gcloud components install kubectl`
    try:
        out = syscall("kubectl version", return_outputs="as_str", merge_err=True)
    # on Linux, a missing kubectl binary raises OSError -> emulate the Windows "not recognized" message
    except OSError:
        out = "not recognized as an internal"

    if re.search(r'not recognized as an internal', out):
        print("++ Installing missing kubectl command line tool (this is necessary to manage your clusters via the"
              " Kubernetes tool):")
        syscall("gcloud components install kubectl")

    # login to google cloud
    print("+ Logging you into google cloud account.")
    while True:
        print("Please enter your remote project's google cloud service account (full email address) here:")
        service_account = input(">")
        if not re.match(r'^[\w\-\.]+\@[\w\-\.]+\.[a-z]+', service_account):
            print("ERROR: The service account needs to be an email address.")
        else:
            break
    while True:
        print("Please enter the location of your private key file associated with this service account:")
        key_file = input(">")
        if not os.path.isfile(key_file):
            print("ERROR: The key_file you entered does not exist or is not a file.")
        else:
            break
    # kubernetes-account@introkubernetes-191608.iam.gserviceaccount.com
    # l:/programming/privatekeys/MaRLEnE-bbad55cddab1.json
    syscall("gcloud auth activate-service-account {} --key-file={}".format(service_account, key_file))

    remote_projects_by_name, remote_projects_by_id = util.get_remote_projects()
    # if remote given -> only check for that one and exit if doesn't exist
    if args.remote_project_id:
        print("+ Checking for existing remote-project ID ({}).".format(args.remote_project_id))
        if args.remote_project_id not in remote_projects_by_id:
            print("ERROR: No remote project ID {} found in cloud!".format(args.remote_project_id))
            quit()
        print("+ Found remote project ID {}.".format(args.remote_project_id))
        remote_project_id = args.remote_project_id
        remote_project_name = remote_projects_by_id[args.remote_project_id]["project-name"]
        # if no name -> take remote's name
        if not args.name:
            args.name = remote_project_name
    # look for existing project in cloud with same name and ask whether to use that one. If not, user can specify
    # a remote project name that may differ from the local project folder's name
    else:
        # if no name -> take folder name
        if not args.name:
            cwd = os.getcwd()
            args.name = re.sub(r'.+[/\\](\w+)$', '\\1', cwd)
            print("+ Name not given. Assuming folder name ({}) is project's name.".format(args.name))

        print("+ Checking name ({}) against existing projects in the cloud.".format(args.name))
        if args.name in remote_projects_by_name:
            remote_project_name = args.name
            remote_project_id = remote_projects_by_name[remote_project_name]["project-id"]
            print("++ Given project name ({}) already exists as a project in google cloud. "
                  "Will use remote project {} with ID {}.".format(args.name, args.name, remote_project_id))
        # TODO: service accounts cannot create projects without a parent
        else:
            remote_project_id = re.sub(r'_', '-', args.name)  # replace all underscores with hyphens
            remote_project_name = remote_project_id
            # what if id already exists -> use a modified one
            if remote_project_id in remote_projects_by_id:
                remote_project_id = remote_project_id + str(time.time())
            print("+ Project '{}' does not exist in cloud yet. Will create new project (with ID {}).".
                  format(args.name, remote_project_id))
            out, err = syscall("gcloud projects create {} --name {} --set-as-default".
                               format(remote_project_id, remote_project_name), return_outputs=True, merge_err=False)
            err_msg = err.raw.readall().decode("latin-1")
            if err_msg:
                raise util.TFCliError(err_msg)

    # write project settings into .tensorforce dir
    util.write_project_file(args.name, remote_project_name, remote_project_id)
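A hedged sketch of the namespace cmd_init consumes; the field names are taken from the attributes accessed above (force, name, remote_project_id) and all values are hypothetical:

import argparse

args = argparse.Namespace(force=False, name="my_project", remote_project_id=None)
cmd_init(args)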
Example #8
    def __init__(self, **kwargs):
        """
        Keyword Args:
            file (str): The Experiment's json spec file (can contain all other args).
            name (str): The name of the Experiment. This is also the name of the folder where it is stored.
            environment (str): The filename of the json env-spec file to use (see TensorForce documentation).
            agent (str): The filename of the json agent-spec file to use (see TensorForce documentation).
            network (str):  The filename of the json network-spec file to use (see TensorForce documentation).
            cluster (str): The filename of the json cluster-spec file to use (see class `Cluster`).
            episodes (int): The total number of episodes to run (all parallel agents).
            total_timesteps (int): The max. total number of timesteps to run (all parallel agents).
            max_timesteps_per_episode (int): The max. number of timesteps to run in each episode.
            deterministic (bool): If True, do not apply stochastic exploration on top of the plain action outputs.
            repeat_actions (int): The number of actions to repeat for each action selection (by calling agent.act()).
            debug_logging (bool): Whether to switch on debug logging (default: False).
            run_mode (str): Which runner mode to use. Valid values are only 'single', 'multi-threaded' and
                'distributed'.
            num_workers (int): The number of worker processes to use (see `distributed` and `multi-threaded`
                run_modes).
            num_parameter_servers (int): The number of parameter servers to use (see distributed tensorflow).
            saver_frequency (str): The frequency with which to save the model. This is a combination of an int
                and a unit (e.g. "600s"), where unit can be "s" (seconds), "e" (episodes), or "t" (timesteps).
            summary_frequency (str): The frequency with which to save a tensorboard summary.
                This is a combination of an int and a unit (e.g. "600s"), where unit can be "s" (seconds)
                or "t" (timesteps). The episode unit (e) is not allowed here.
        """
        # see whether we have a json (yaml?) file for the experiment
        # TODO: yaml support
        self.file = kwargs.get("file")
        if self.file:
            from_json = util.read_json_spec(self.file, "experiments")
        # get all attributes from kwargs
        else:
            from_json = {}
        # From here on, give kwargs priority over spec (from file), so that single settings in the json file can be
        # overwritten by command line.

        # sanity check name
        self.name = kwargs.get("name") or from_json.get("name", "")
        if not re.match(r'^\w+$', self.name):
            raise util.TFCliError(
                "ERROR: Name of experiment needs to be all alphanumeric characters"
            )
        self.name_hyphenated = re.sub(r'_', '-', self.name)

        self.path = "experiments/{}/".format(self.name)
        self.k8s_config = "{}experiment.yaml".format(self.path)

        # read in sub-spec files (to JSON)
        self.environment = kwargs.get("environment") or from_json.get(
            "environment")
        if isinstance(self.environment, str):
            self.environment = util.read_json_spec(self.environment,
                                                   "environments")
        if self.environment.get(
                "remote") and not self.environment.get("image"):
            raise util.TFCliError(
                "WARNING: Defining a remote environment without a docker image in experiment spec! "
                "Use field `image` to define a docker image for the remote env."
            )

        self.network = kwargs.get("network") or from_json.get("network")
        if isinstance(self.network, str):
            self.network = util.read_json_spec(self.network, "networks")

        self.agent = kwargs.get("agent") or from_json.get("agent")
        if isinstance(self.agent, str):
            self.agent = util.read_json_spec(self.agent, "agents")

        self.cluster = kwargs.get("cluster") or from_json.get("cluster")
        if isinstance(self.cluster, str):
            cluster = get_cluster_from_string(self.cluster)
            self.cluster = cluster.get_spec()
        elif not isinstance(self.cluster, dict):
            raise util.TFCliError(
                "ERROR: Cluster (-c option) has to be given as json filename.")

        self.episodes = kwargs.get("episodes") or from_json.get(
            "episodes", 10000)
        self.total_timesteps = kwargs.get("total_timesteps") or from_json.get(
            "total_timesteps", 1000000)
        self.max_timesteps_per_episode = kwargs.get("max_timesteps_per_episode") or \
                                         from_json.get("max_timesteps_per_episode", 1000)
        self.deterministic = kwargs.get("deterministic")
        if self.deterministic is None:
            self.deterministic = from_json.get("deterministic", False)
        self.repeat_actions = kwargs.get("repeat_actions") or from_json.get(
            "repeat_actions", 1)

        self.num_workers = kwargs.get("num_workers") or from_json.get(
            "num_workers", 3)
        self.num_parameter_servers = kwargs.get(
            "num_parameter_servers") or from_json.get("num_parameter_servers",
                                                      1)

        # update our json file pointer and write us into the experiment's dir
        self.file = "{}experiment.json".format(self.path)
        self.debug_logging = kwargs.get("debug_logging") or from_json.get(
            "debug_logging", False)

        # the experiment's run type
        self.run_mode = kwargs.get("run_mode") or from_json.get(
            "run_mode", "distributed")
        assert self.run_mode in ["distributed", "multi-threaded", "single"],\
            "ERROR: run-type needs to be one of distributed|multi-threaded|single!"
        if self.run_mode == "distributed" and self.num_parameter_servers <= 0:
            raise util.TFCliError(
                "ERROR: Cannot create experiment of run-mode=distributed and zero parameter servers!"
            )

        self.saver_frequency = kwargs.get("saver_frequency")\
            or from_json.get("saver_frequency", "600s" if self.run_mode == "distributed" else "100e")
        self.summary_frequency = kwargs.get("summary_frequency")\
            or from_json.get("summary_frequency", "120s" if self.run_mode == "distributed" else "10e")

        # whether this experiment runs on a dedicated cluster
        self.has_dedicated_cluster = kwargs.get(
            "has_dedicated_cluster") or from_json.get("has_dedicated_cluster",
                                                      True)

        # status (running, paused, stopped, etc..)
        self.status = kwargs.get("status") or from_json.get("status", None)

        # json file specific to a certain experiment 'run' (e.g. cluster may differ from experiment's base config)
        self.running_json_file = "experiment_running.json"
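A minimal usage sketch constructing an Experiment from a (hypothetical) json spec file and overriding single settings via kwargs, as the comment above describes:

# "my_experiment.json" is a hypothetical spec under experiments/ that defines the documented fields
experiment = Experiment(
    file="my_experiment.json",
    run_mode="multi-threaded",  # command-line style override of the value in the spec file
    num_workers=4,
)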
Example #9
    def setup_cluster(self, cluster, project_id, start=False):
        """
        Given a cluster name (or None) and a remote project-ID,
        sets up the cluster settings for this Experiment locally.
        Also starts the cluster if start is set to True.

        Args:
            cluster (str): The name of the cluster. If None, will get cluster-spec from the Experiment, or create a
                default Cluster object.
            project_id (str): The remote gcloud project ID.
            start (bool): Whether to already create (start) the cluster in the cloud.

        Returns:
            The Cluster object.
        """

        clusters = util.get_cluster_specs()

        # cluster is given (separate from experiment's own cluster)
        if cluster:
            cluster = get_cluster_from_string(cluster,
                                              running_clusters=clusters)
            self.has_dedicated_cluster = False
        # use experiment's own cluster
        elif self.cluster:
            cluster = Cluster(running_clusters=clusters, **self.cluster)
            self.has_dedicated_cluster = True
        # use a default cluster
        else:
            cluster = Cluster(name=self.name_hyphenated)
            self.has_dedicated_cluster = True

        # start cluster if not up yet
        if start and not cluster.started:
            cluster.create()
        # cluster up but not in good state
        elif clusters[cluster.name_hyphenated]["status"] != "RUNNING":
            raise util.TFCliError(
                "ERROR: Given cluster {} is not in status RUNNING (but in status {})!"
                .format(cluster.name_hyphenated,
                        clusters[cluster.name_hyphenated]["status"]))

        # check cluster vs experiment setup and warn if something doesn't match
        if self.run_mode != "distributed" and cluster.num_nodes > 1:
            warn(
                "WARNING: Running non-distributed experiment on cluster with more than 1 node. Make sure you are "
                "not wasting costly resources!")
        num_gpus = cluster.num_nodes * cluster.gpus_per_node
        if self.run_mode == "distributed" and self.num_workers + self.num_parameter_servers > num_gpus:
            warn(
                "WARNING: Running distributed experiment with {} processes total on cluster with only {} GPUs! "
                "This could lead to K8s scheduling problems.".format(
                    self.num_workers + self.num_parameter_servers, num_gpus))

        print("+ Setting up credentials to connect to cluster {}.".format(
            cluster.name_hyphenated))
        util.syscall(
            "gcloud container clusters get-credentials {} --zone {} --project {}"
            .format(cluster.name_hyphenated, cluster.location, project_id))

        print("+ Setting kubectl to point to cluster {}.".format(
            cluster.name_hyphenated))
        util.syscall("kubectl config set-cluster {} --server={}".format(
            cluster.name_hyphenated,
            clusters[cluster.name_hyphenated]["master_ip"]))

        self.cluster = cluster.get_spec()
        return cluster
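A short usage sketch (the remote project ID is hypothetical); with start=True the cluster is created if it is not up yet:

cluster = experiment.setup_cluster(cluster=None, project_id="my-gcp-project", start=True)
print(cluster.name_hyphenated, cluster.num_nodes)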
Example #10
    def __init__(self, **kwargs):
        r"""
        A cloud cluster object specifying things like: number of nodes, GPUs per node and GPU type, memory per node,
        disk size, zone, etc.

        Args:
            kwargs (any): See below.

        Keyword Args:
            file (str): The filename of a cluster spec json file to use. Single settings in this file
                can be overwritten by specifying these in further kwargs to this c'tor.
            name (str): The name of the cluster.
            machine_type (str): The machine type to use for all nodes in the cluster. Machine types can
                either be gcloud-accepted strings such as everything listed in `gcloud compute machine-types list`
                or custom strings that conform to these rules:
                https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type.
                When the kwargs `cpus_per_node` and `memory_per_node` are given,
                tensorforce-client will automatically create the correct machine-type.
            cpus_per_node (int): The number of vCPUs per node.
            gpus_per_node (int): The number of (physical) GPUs per node.
            gpu_type (str): The GPU type to use. Supported are only 'nvidia-tesla-k80' and 'nvidia-tesla-p100'.
            memory_per_node (int): The memory (in Gb) per node.
            num_nodes (int): The number of nodes for the cluster.
            disk_size (int): The amount of disk space per node in Gb.
            location (str): The location of the cluster. Defaults to the default zone set for gcloud/the project.
        """

        self.file = kwargs.get("file")
        if self.file:
            from_json = util.read_json_spec(self.file, "clusters")
        # get all attributes from kwargs
        else:
            from_json = {}

        self.name = kwargs.get("name") or from_json.get("name")
        if not self.name:
            raise util.TFCliError("ERROR: Cluster requires a name!")
        self.name_hyphenated = re.sub(r'_', '-', self.name)
        self.machine_type = kwargs.get("machine_type") or from_json.get(
            "machine_type")
        # alternative to machine_type -> provide `cpus_per_node` and `memory_per_node`
        if not self.machine_type:
            cpus = kwargs.get("cpus_per_node") or from_json.get(
                "cpus_per_node")
            mem = kwargs.get("memory_per_node") or from_json.get(
                "memory_per_node")
            if not cpus or not mem:
                raise util.TFCliError(
                    "ERROR: no vCPUs_per_node OR no memory_per_node given for cluster {}"
                    .format(self.name))
            self.machine_type = "custom-{}-{}".format(cpus, mem * 1024)
        self.num_nodes = kwargs.get("num_nodes") or from_json.get(
            "num_nodes", 3)
        self.num_gpus = 0
        self.gpu_type = None
        self.gpus_per_node = kwargs.get("gpus_per_node") or from_json.get(
            "gpus_per_node", 0)
        if self.gpus_per_node > 0:
            self.gpu_type = kwargs.get("gpu_type") or from_json.get(
                "gpu_type", "nvidia-tesla-k80")
            self.num_gpus = self.gpus_per_node * self.num_nodes
        # size of single disks (one per node)
        self.disk_size = kwargs.get("disk_size") or from_json.get(
            "disk_size", 100)
        self.location = kwargs.get("location") or from_json.get("location")

        # add information from running clusters
        if "running_clusters" in kwargs:
            self.instances, self.primary_name = util.get_compute_instance_specs(
                self.name_hyphenated)
            self.started = True if self.instances else False
        # cluster is not running yet
        else:
            self.instances = None
            self.primary_name = None
            self.started = False

        self.deleted = False  # is terminated or being shut down right now?
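A usage sketch with the documented keyword args (all values hypothetical); either machine_type or the cpus_per_node/memory_per_node pair can be given:

cluster = Cluster(
    name="my_cluster",
    cpus_per_node=4,
    memory_per_node=16,            # in Gb -> machine_type becomes "custom-4-16384"
    gpus_per_node=1,
    gpu_type="nvidia-tesla-k80",
    num_nodes=2,
    disk_size=100,
    location="us-central1-a",
)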