Example #1
    def init_spark_on_k8s(self,
                          master,
                          container_image,
                          num_executors,
                          executor_cores,
                          executor_memory="2g",
                          driver_memory="1g",
                          driver_cores=4,
                          extra_executor_memory_for_ray=None,
                          extra_python_lib=None,
                          conf=None,
                          jars=None,
                          python_location=None):
        print("Initializing SparkContext for k8s-client mode")
        python_env = "/".join(detect_python_location().split("/")[:-2])
        if "PYSPARK_PYTHON" not in os.environ:
            os.environ["PYSPARK_PYTHON"] = \
                python_location if python_location else detect_python_location()

        submit_args = "--master " + master + " --deploy-mode client"
        submit_args = submit_args + gen_submit_args(
            driver_cores, driver_memory, num_executors, executor_cores,
            executor_memory, extra_python_lib, jars)

        conf = enrich_conf_for_spark(conf, driver_cores, driver_memory,
                                     num_executors, executor_cores,
                                     executor_memory,
                                     extra_executor_memory_for_ray)
        py_version = ".".join(platform.python_version().split(".")[0:2])
        preload_so = python_env + "/lib/libpython" + py_version + "m.so"
        ld_path = python_env + "/lib:" + python_env + "/lib/python" +\
            py_version + "/lib-dynload"
        if "spark.executor.extraLibraryPath" in conf:
            ld_path = "{}:{}".format(ld_path,
                                     conf["spark.executor.extraLibraryPath"])
        conf.update({
            "spark.cores.max": num_executors * executor_cores,
            "spark.executorEnv.PYTHONHOME": python_env,
            "spark.executor.extraLibraryPath": ld_path,
            "spark.executorEnv.LD_PRELOAD": preload_so,
            "spark.kubernetes.container.image": container_image
        })
        # For non-pip installs, BIGDL_CLASSPATH is supposed to be set already.
        if "BIGDL_CLASSPATH" in os.environ:
            zoo_bigdl_jar_path = os.environ["BIGDL_CLASSPATH"]
        else:
            zoo_bigdl_jar_path = ":".join(
                list(get_zoo_bigdl_classpath_on_driver()))
        if "spark.executor.extraClassPath" in conf:
            conf["spark.executor.extraClassPath"] = "{}:{}".format(
                zoo_bigdl_jar_path, conf["spark.executor.extraClassPath"])
        else:
            conf["spark.executor.extraClassPath"] = zoo_bigdl_jar_path

        sc = self.create_sc(submit_args, conf)
        return sc
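
For reference, a hedged usage sketch of the method above. `runner` stands for an instance of the SparkRunner class that Example #5 references, and the master URL, container image and resource sizes are placeholders rather than values from the source:

# Illustrative only: the k8s API server address and image name are placeholders.
sc = runner.init_spark_on_k8s(
    master="k8s://https://127.0.0.1:8443",
    container_image="my-registry/analytics-zoo:latest",
    num_executors=2,
    executor_cores=4,
    executor_memory="2g",
    driver_memory="1g")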
Example #2
 def _get_conda_python_path(self):
     conda_env_path = "/".join(detect_python_location().split("/")[:-2])
     python_interpreters = glob.glob(
         "{}/lib/python*".format(conda_env_path))
     assert len(python_interpreters) == 1, \
         "Conda env should contain a single python, but got: {}".format(python_interpreters)
     return python_interpreters[0]
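
A short sketch of what this helper resolves, assuming the driver runs inside a conda environment; the paths are illustrative only:

# If detect_python_location() returns /opt/conda/envs/zoo/bin/python,
# conda_env_path becomes /opt/conda/envs/zoo and the glob is expected to match
# exactly one directory, e.g. /opt/conda/envs/zoo/lib/python3.6, which is returned.
python_lib_dir = runner._get_conda_python_path()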
Example #3
 def init_spark_on_local(self, cores, conf=None, python_location=None):
     print("Start to getOrCreate SparkContext")
     if "PYSPARK_PYTHON" not in os.environ:
         os.environ["PYSPARK_PYTHON"] = \
             python_location if python_location else detect_python_location()
     master = "local[{}]".format(cores)
     zoo_conf = init_spark_conf(conf).setMaster(master)
     sc = init_nncontext(conf=zoo_conf, spark_log_level=self.spark_log_level,
                         redirect_spark_log=self.redirect_spark_log)
     print("Successfully got a SparkContext")
     return sc
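
A minimal usage sketch, again assuming `runner` is a SparkRunner instance:

sc = runner.init_spark_on_local(cores=4)  # creates a local[4] SparkContext
# Use all available cores and point executors at a specific interpreter:
# sc = runner.init_spark_on_local(cores="*", python_location="/usr/bin/python3")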
Example #4
    def init_spark_on_k8s(self,
                          master,
                          container_image,
                          num_executors,
                          executor_cores,
                          executor_memory="2g",
                          driver_memory="1g",
                          driver_cores=4,
                          extra_executor_memory_for_ray=None,
                          extra_python_lib=None,
                          conf=None,
                          jars=None,
                          python_location=None):
        print("Initializing SparkContext for k8s-client mode")
        if "PYSPARK_PYTHON" not in os.environ:
            os.environ["PYSPARK_PYTHON"] = \
                python_location if python_location else detect_python_location()

        submit_args = "--master " + master + " --deploy-mode client"
        submit_args = submit_args + gen_submit_args(
            driver_cores, driver_memory, num_executors, executor_cores,
            executor_memory, extra_python_lib, jars)

        conf = enrich_conf_for_spark(conf, driver_cores, driver_memory,
                                     num_executors, executor_cores,
                                     executor_memory,
                                     extra_executor_memory_for_ray)
        conf.update({
            "spark.cores.max": num_executors * executor_cores,
            "spark.kubernetes.container.image": container_image
        })
        # For non-pip installs, BIGDL_CLASSPATH is supposed to be set already.
        if "BIGDL_CLASSPATH" in os.environ:
            zoo_bigdl_jar_path = os.environ["BIGDL_CLASSPATH"]
        else:
            zoo_bigdl_jar_path = ":".join(
                list(get_zoo_bigdl_classpath_on_driver()))
        if "spark.executor.extraClassPath" in conf:
            conf["spark.executor.extraClassPath"] = "{}:{}".format(
                zoo_bigdl_jar_path, conf["spark.executor.extraClassPath"])
        else:
            conf["spark.executor.extraClassPath"] = zoo_bigdl_jar_path

        sc = self.create_sc(submit_args, conf)
        return sc
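
As with Example #1, a hedged usage sketch. This variant leaves PYTHONHOME and LD_PRELOAD untouched, so only the image and any extra Spark conf need to be passed; a user-supplied spark.executor.extraClassPath is kept and prefixed with the zoo/BigDL jars. All values below are placeholders:

sc = runner.init_spark_on_k8s(
    master="k8s://https://127.0.0.1:8443",
    container_image="my-registry/analytics-zoo:latest",
    num_executors=4,
    executor_cores=2,
    conf={"spark.executor.extraClassPath": "/opt/extra/jars/*"})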
Example #5
    def init_spark_standalone(self,
                              num_executors,
                              executor_cores,
                              executor_memory="2g",
                              driver_cores=4,
                              driver_memory="1g",
                              master=None,
                              extra_executor_memory_for_ray=None,
                              extra_python_lib=None,
                              conf=None,
                              jars=None,
                              python_location=None,
                              enable_numa_binding=False):
        import subprocess
        import pyspark
        from zoo.util.utils import get_node_ip

        if "PYSPARK_PYTHON" not in os.environ:
            os.environ["PYSPARK_PYTHON"] = \
                python_location if python_location else detect_python_location()
        if not master:
            pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
            zoo_standalone_home = os.path.abspath(
                __file__ + "/../../share/bin/standalone")
            node_ip = get_node_ip()
            SparkRunner.standalone_env = {
                "SPARK_HOME": pyspark_home,
                "ZOO_STANDALONE_HOME": zoo_standalone_home,
                # If this is not set, the master defaults to the hostname instead of the IP.
                "SPARK_MASTER_HOST": node_ip
            }
            if 'JAVA_HOME' in os.environ:
                SparkRunner.standalone_env["JAVA_HOME"] = os.environ[
                    "JAVA_HOME"]
            # The scripts installed from pip don't have execution permission,
            # so grant it first.
            pro = subprocess.Popen(
                ["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
            os.waitpid(pro.pid, 0)
            # Start master
            start_master_pro = subprocess.Popen(
                "{}/sbin/start-master.sh".format(zoo_standalone_home),
                shell=True,
                env=SparkRunner.standalone_env)
            _, status = os.waitpid(start_master_pro.pid, 0)
            if status != 0:
                raise RuntimeError("starting master failed")
            master = "spark://{}:7077".format(
                node_ip)  # 7077 is the default port
            # Start worker
            if enable_numa_binding:
                worker_script = "start-worker-with-numactl.sh"
                SparkRunner.standalone_env["SPARK_WORKER_INSTANCES"] = str(
                    num_executors)
            else:
                worker_script = "start-worker.sh"
            start_worker_pro = subprocess.Popen("{}/sbin/{} {}".format(
                zoo_standalone_home, worker_script, master),
                                                shell=True,
                                                env=SparkRunner.standalone_env)
            _, status = os.waitpid(start_worker_pro.pid, 0)
            if status != 0:
                raise RuntimeError("starting worker failed")
        else:  # A Spark standalone cluster has already been started by the user.
            assert master.startswith("spark://"), \
                "Please input a valid master address for your Spark standalone cluster: " \
                "spark://master:port"

        # Start pyspark-shell
        submit_args = "--master " + master
        submit_args = submit_args + gen_submit_args(
            driver_cores, driver_memory, num_executors, executor_cores,
            executor_memory, extra_python_lib, jars)

        conf = enrich_conf_for_spark(conf, driver_cores, driver_memory,
                                     num_executors, executor_cores,
                                     executor_memory,
                                     extra_executor_memory_for_ray)
        conf.update({
            "spark.cores.max":
            num_executors * executor_cores,
            "spark.executorEnv.PYTHONHOME":
            "/".join(detect_python_location().split("/")[:-2])
        })
        zoo_bigdl_jar_path = ":".join(list(
            get_zoo_bigdl_classpath_on_driver()))
        if "spark.executor.extraClassPath" in conf:
            conf["spark.executor.extraClassPath"] = "{}:{}".format(
                zoo_bigdl_jar_path, conf["spark.executor.extraClassPath"])
        else:
            conf["spark.executor.extraClassPath"] = zoo_bigdl_jar_path

        sc = self.create_sc(submit_args, conf)
        return sc
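
A hedged usage sketch: with master=None the method starts a local standalone master and worker itself (launching num_executors worker instances only when enable_numa_binding is True), while an explicit spark:// master just connects to an existing cluster. The address below is a placeholder:

# Let the method start a local standalone cluster:
sc = runner.init_spark_standalone(num_executors=2, executor_cores=8,
                                  executor_memory="10g")
# Or attach to a cluster that is already running:
# sc = runner.init_spark_standalone(num_executors=2, executor_cores=8,
#                                   master="spark://10.0.0.1:7077")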
Example #6
    def __init__(self,
                 sc,
                 redis_port=None,
                 password="******",
                 object_store_memory=None,
                 verbose=False,
                 env=None,
                 extra_params=None,
                 include_webui=True,
                 num_ray_nodes=None,
                 ray_node_cpu_cores=None):
        """
        The RayContext would initiate a ray cluster on top of the configuration of SparkContext.
        After creating RayContext, call the init method to set up the cluster.

        - For Spark local mode: The total available cores for Ray is equal to the number of
        Spark local cores.
        - For Spark cluster mode: The number of raylets to be created is equal to the number of
        Spark executors. The number of cores allocated for each raylet is equal to the number of
        cores for each Spark executor.
        You are allowed to specify num_ray_nodes and ray_node_cpu_cores for configurations
        to start raylets.

        :param sc: An instance of SparkContext.
        :param redis_port: The redis port for the ray head node. Default is None.
        The value would be randomly picked if not specified.
        :param password: The password for redis. Default to be "123456" if not specified.
        :param object_store_memory: The memory size for ray object_store in string.
        This can be specified in bytes(b), kilobytes(k), megabytes(m) or gigabytes(g).
        For example, "50b", "100k", "250m", "30g".
        :param verbose: True for more logs when starting ray. Default is False.
        :param env: The environment variable dict for running ray processes. Default is None.
        :param extra_params: The key value dict for extra options to launch ray.
        For example, extra_params={"temp-dir": "/tmp/ray/"}
        :param include_webui: True for including the web UI when starting ray. Default is True.
        :param num_ray_nodes: The number of raylets to start across the cluster.
        For Spark local mode, you don't need to specify this value.
        For Spark cluster mode, it is default to be the number of Spark executors. If
        spark.executor.instances can't be detected in your SparkContext, you need to explicitly
        specify this. It is recommended that num_ray_nodes is not larger than the number of
        Spark executors to make sure there are enough resources in your cluster.
        :param ray_node_cpu_cores: The number of available cores for each raylet.
        For Spark local mode, it is default to be the number of Spark local cores.
        For Spark cluster mode, it is default to be the number of cores for each Spark executor. If
        spark.executor.cores or spark.cores.max can't be detected in your SparkContext, you need to
        explicitly specify this. It is recommended that ray_node_cpu_cores is not larger than the
        number of cores for each Spark executor to make sure there are enough resources in your
        cluster.
        """
        assert sc is not None, "sc cannot be None, please create a SparkContext first"
        self.sc = sc
        self.initialized = False
        self.is_local = is_local(sc)
        self.verbose = verbose
        self.redis_password = password
        self.object_store_memory = resource_to_bytes(object_store_memory)
        self.ray_processesMonitor = None
        self.env = env
        self.extra_params = extra_params
        self.include_webui = include_webui
        self._address_info = None
        if self.is_local:
            self.num_ray_nodes = 1
            spark_cores = self._get_spark_local_cores()
            if ray_node_cpu_cores:
                ray_node_cpu_cores = int(ray_node_cpu_cores)
                if ray_node_cpu_cores > spark_cores:
                    warnings.warn(
                        "ray_node_cpu_cores is larger than available Spark cores, "
                        "make sure there are enough resources on your machine")
                self.ray_node_cpu_cores = ray_node_cpu_cores
            else:
                self.ray_node_cpu_cores = spark_cores
        # For Spark local mode, directly call ray.init() and ray.shutdown().
        # ray.shutdown() would clear up all the ray related processes.
        # Ray Manager is only needed for Spark cluster mode to monitor ray processes.
        else:
            if self.sc.getConf().contains("spark.executor.cores"):
                executor_cores = int(
                    self.sc.getConf().get("spark.executor.cores"))
            else:
                executor_cores = None
            if ray_node_cpu_cores:
                ray_node_cpu_cores = int(ray_node_cpu_cores)
                if executor_cores and ray_node_cpu_cores > executor_cores:
                    warnings.warn(
                        "ray_node_cpu_cores is larger than Spark executor cores, "
                        "make sure there are enough resources on your cluster")
                self.ray_node_cpu_cores = ray_node_cpu_cores
            elif executor_cores:
                self.ray_node_cpu_cores = executor_cores
            else:
                raise Exception(
                    "spark.executor.cores not detected in the SparkContext, "
                    "you need to manually specify num_ray_nodes and ray_node_cpu_cores "
                    "for RayContext to start ray services")
            if self.sc.getConf().contains("spark.executor.instances"):
                num_executors = int(
                    self.sc.getConf().get("spark.executor.instances"))
            elif self.sc.getConf().contains("spark.cores.max"):
                import math
                num_executors = math.floor(
                    int(self.sc.getConf().get("spark.cores.max")) /
                    self.ray_node_cpu_cores)
            else:
                num_executors = None
            if num_ray_nodes:
                num_ray_nodes = int(num_ray_nodes)
                if num_executors and num_ray_nodes > num_executors:
                    warnings.warn(
                        "num_ray_nodes is larger than the number of Spark executors, "
                        "make sure there are enough resources on your cluster")
                self.num_ray_nodes = num_ray_nodes
            elif num_executors:
                self.num_ray_nodes = num_executors
            else:
                raise Exception(
                    "spark.executor.cores not detected in the SparkContext, "
                    "you need to manually specify num_ray_nodes and ray_node_cpu_cores "
                    "for RayContext to start ray services")

            from zoo.util.utils import detect_python_location
            self.python_loc = os.environ.get("PYSPARK_PYTHON",
                                             detect_python_location())
            self.redis_port = random.randint(
                10000, 65535) if not redis_port else int(redis_port)
            self.ray_service = RayServiceFuncGenerator(
                python_loc=self.python_loc,
                redis_port=self.redis_port,
                ray_node_cpu_cores=self.ray_node_cpu_cores,
                password=self.redis_password,
                object_store_memory=self.object_store_memory,
                verbose=self.verbose,
                env=self.env,
                include_webui=self.include_webui,
                extra_params=self.extra_params)
        RayContext._active_ray_context = self
        self.total_cores = self.num_ray_nodes * self.ray_node_cpu_cores
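
A hedged sketch of the typical flow: create the RayContext on top of an existing SparkContext, then call the init method the docstring describes. The stop() call is assumed for teardown and is not shown in this excerpt:

ray_ctx = RayContext(sc, object_store_memory="4g", num_ray_nodes=2,
                     ray_node_cpu_cores=8)
ray_ctx.init()   # starts the ray head node and raylets across the cluster
# ... run Ray code ...
ray_ctx.stop()   # assumed teardown method; not part of the code above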
Example #7
def init_orca_context(cluster_mode="local",
                      cores=2,
                      memory="2g",
                      num_nodes=1,
                      init_ray_on_spark=False,
                      **kwargs):
    """
    Creates or gets a SparkContext for different Spark cluster modes (and launch Ray services
    across the cluster if necessary).

    :param cluster_mode: The mode for the Spark cluster. One of "local", "yarn-client",
           "standalone" and "spark-submit". Default to be "local".

           For "spark-submit", you are supposed to use spark-submit to submit the application.
           In this case, please set the Spark configurations through command line options or
           the properties file. You need to use "spark-submit" for yarn-cluster mode.
           To make things easier, you are recommended to use the launching scripts under
           `analytics-zoo/scripts`.

           For other cluster modes, you are recommended to install and run analytics-zoo through
           pip, which is more convenient.
    :param cores: The number of cores to be used on each node. Default to be 2.
    :param memory: The memory allocated for each node. Default to be '2g'.
    :param num_nodes: The number of nodes to be used in the cluster. Default to be 1.
           For Spark local, num_nodes should always be 1 and you don't need to change it.
    :param init_ray_on_spark: Whether to launch Ray services across the cluster.
           Default to be False and in this case the Ray cluster would be launched lazily when
           Ray is involved in Project Orca.
    :param kwargs: The extra keyword arguments used for creating SparkContext and
           launching Ray if any.

    :return: An instance of SparkContext.
    """
    cluster_mode = cluster_mode.lower()
    spark_args = {}
    for key in ["conf", "spark_log_level", "redirect_spark_log"]:
        if key in kwargs:
            spark_args[key] = kwargs[key]
    if cluster_mode == "spark-submit":
        from zoo import init_nncontext
        sc = init_nncontext(**spark_args)
    elif cluster_mode == "local":
        assert num_nodes == 1, "For Spark local mode, num_nodes should be 1"
        os.environ["SPARK_DRIVER_MEMORY"] = memory
        if "python_location" in kwargs:
            spark_args["python_location"] = kwargs["python_location"]
        from zoo import init_spark_on_local
        sc = init_spark_on_local(cores, **spark_args)
    elif cluster_mode.startswith("yarn"):  # yarn or yarn-client
        if cluster_mode == "yarn-cluster":
            raise ValueError(
                'For yarn-cluster mode, please set cluster_mode to "spark-submit" '
                'and submit the application via spark-submit instead')
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        if not hadoop_conf:
            assert "hadoop_conf" in kwargs,\
                "Directory path to hadoop conf not found for yarn-client mode. Please either " \
                "specify argument hadoop_conf or set the environment variable HADOOP_CONF_DIR"
            hadoop_conf = kwargs["hadoop_conf"]
        from zoo.util.utils import detect_python_location
        python_location = detect_python_location(
        )  # /path/to/conda/envs/conda_name/bin/python
        assert "envs" in python_location, "You must use a conda environment for yarn-client mode"
        for key in [
                "driver_cores", "driver_memory",
                "extra_executor_memory_for_ray", "extra_python_lib",
                "penv_archive", "additional_archive", "hadoop_user_name",
                "spark_yarn_archive", "jars"
        ]:
            if key in kwargs:
                spark_args[key] = kwargs[key]
        from zoo import init_spark_on_yarn
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf,
                                conda_name=python_location.split("/")[-3],
                                num_executors=num_nodes,
                                executor_cores=cores,
                                executor_memory=memory,
                                **spark_args)
    elif cluster_mode == "standalone":
        for key in [
                "driver_cores", "driver_memory",
                "extra_executor_memory_for_ray", "extra_python_lib", "jars",
                "master", "enable_numa_binding"
        ]:
            if key in kwargs:
                spark_args[key] = kwargs[key]
        from zoo import init_spark_standalone
        sc = init_spark_standalone(num_executors=num_nodes,
                                   executor_cores=cores,
                                   executor_memory=memory,
                                   **spark_args)
    else:
        raise ValueError(
            "cluster_mode can only be local, yarn-client, standalone or spark-submit, "
            "but got: %s".format(cluster_mode))
    ray_args = {}
    for key in [
            "redis_port", "password", "object_store_memory", "verbose", "env",
            "extra_params", "num_ray_nodes", "ray_node_cpu_cores"
    ]:
        if key in kwargs:
            ray_args[key] = kwargs[key]
    from zoo.ray import RayContext
    ray_ctx = RayContext(sc, **ray_args)
    if init_ray_on_spark:
        driver_cores = 0  # This is the default value.
        if "driver_cores" in kwargs:
            driver_cores = kwargs["driver_cores"]
        ray_ctx.init(driver_cores=driver_cores)
    return sc
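
Two hedged usage sketches, grounded in the docstring and kwargs handling above; yarn-client mode assumes a conda environment on the driver, and the hadoop_conf path is a placeholder:

# Spark local mode with 4 cores:
sc = init_orca_context(cluster_mode="local", cores=4, memory="4g")

# yarn-client mode with 2 executors of 8 cores each, launching Ray eagerly:
sc = init_orca_context(cluster_mode="yarn-client", cores=8, memory="10g",
                       num_nodes=2, init_ray_on_spark=True,
                       hadoop_conf="/etc/hadoop/conf")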
Example #8
def main():
    parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
    parser.add_argument('data', metavar='DIR', help='path to dataset')
    parser.add_argument('-a',
                        '--arch',
                        metavar='ARCH',
                        default='resnet18',
                        choices=model_names,
                        help='model architecture: ' + ' | '.join(model_names) +
                        ' (default: resnet18)')
    parser.add_argument('--epochs',
                        default=90,
                        type=int,
                        metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--start-epoch',
                        default=0,
                        type=int,
                        metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument(
        '-b',
        '--batch-size',
        default=256,
        type=int,
        metavar='N',
        help='mini-batch size (default: 256), this is the total '
        'batch size of all GPUs on the current node when '
        'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('--lr',
                        '--learning-rate',
                        default=0.1,
                        type=float,
                        metavar='LR',
                        help='initial learning rate',
                        dest='lr')
    parser.add_argument('--momentum',
                        default=0.9,
                        type=float,
                        metavar='M',
                        help='momentum')
    parser.add_argument('--wd',
                        '--weight-decay',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('-p',
                        '--print-freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('-e',
                        '--evaluate',
                        dest='evaluate',
                        action='store_true',
                        help='evaluate model on validation set')
    parser.add_argument('--pretrained',
                        dest='pretrained',
                        action='store_true',
                        help='use pre-trained model')
    parser.add_argument('--world-size',
                        default=-1,
                        type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank',
                        default=-1,
                        type=int,
                        help='node rank for distributed training')
    parser.add_argument('--seed',
                        default=None,
                        type=int,
                        help='seed for initializing training. ')
    parser.add_argument('--cores',
                        default=4,
                        type=int,
                        help='num of CPUs to use.')
    parser.add_argument('--nodes',
                        default=1,
                        type=int,
                        help='num of nodes to use.')
    parser.add_argument('--executor_memory',
                        default='20g',
                        type=str,
                        help='size of executor memory.')
    parser.add_argument('--driver_memory',
                        default='20g',
                        type=str,
                        help='size of driver memory.')
    parser.add_argument('--driver_cores',
                        default=1,
                        type=int,
                        help='num of driver cores to use.')
    args = parser.parse_args()
    # sc = init_nncontext()
    if os.environ.get('HADOOP_CONF_DIR') is None:
        sc = init_spark_on_local(cores=args.cores,
                                 conf={"spark.driver.memory": "20g"})
    else:
        hadoop_conf_dir = os.environ.get('HADOOP_CONF_DIR')
        num_executors = args.nodes
        executor_memory = args.executor_memory
        driver_memory = args.driver_memory
        driver_cores = args.driver_cores
        num_cores_per_executor = args.cores
        os.environ['ZOO_MKL_NUMTHREADS'] = str(num_cores_per_executor)
        os.environ['OMP_NUM_THREADS'] = str(num_cores_per_executor)
        sc = init_spark_on_yarn(
            hadoop_conf=hadoop_conf_dir,
            conda_name=detect_python_location().split("/")
            [-3],  # The name of the created conda-env
            num_executors=num_executors,
            executor_cores=num_cores_per_executor,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            driver_cores=driver_cores,
            conf={
                "spark.rpc.message.maxSize": "1024",
                "spark.task.maxFailures": "1",
                "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"
            })

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    model = torchvision.models.resnet50()
    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False)

    iterationPerEpoch = int(math.ceil(float(1281167) / args.batch_size))
    step = Step(iterationPerEpoch * 30, 0.1)
    zooOptimizer = SGD(args.lr,
                       momentum=args.momentum,
                       dampening=0.0,
                       leaningrate_schedule=step,
                       weightdecay=args.weight_decay)
    zooModel = TorchModel.from_pytorch(model)
    criterion = torch.nn.CrossEntropyLoss()
    zooCriterion = TorchLoss.from_pytorch(criterion)
    estimator = Estimator(zooModel, optim_methods=zooOptimizer)
    train_featureSet = FeatureSet.pytorch_dataloader(train_loader)
    test_featureSet = FeatureSet.pytorch_dataloader(val_loader)
    estimator.train_minibatch(train_featureSet,
                              zooCriterion,
                              end_trigger=MaxEpoch(90),
                              checkpoint_trigger=EveryEpoch(),
                              validation_set=test_featureSet,
                              validation_method=[Accuracy(),
                                                 Top5Accuracy()])
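
The Step schedule above drops the learning rate by a factor of 10 every 30 epochs (iterationPerEpoch * 30 iterations). A small sketch of the resulting rate per iteration, under the assumption that BigDL's Step(step_size, gamma) multiplies the base rate by gamma after each completed block of step_size iterations:

import math

def lr_at(iteration, base_lr=0.1, batch_size=256, gamma=0.1, epochs_per_step=30):
    # Mirrors Step(iterationPerEpoch * 30, 0.1) for the 1,281,167-image ImageNet train set.
    iteration_per_epoch = int(math.ceil(1281167.0 / batch_size))
    return base_lr * gamma ** (iteration // (iteration_per_epoch * epochs_per_step))

# lr_at(0) == 0.1; after 30 epochs the rate becomes 0.01, after 60 epochs 0.001, and so on.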
Example #9
    def init_spark_standalone(self,
                              num_executors,
                              executor_cores,
                              executor_memory="10g",
                              driver_cores=4,
                              driver_memory="1g",
                              master=None,
                              extra_executor_memory_for_ray=None,
                              extra_python_lib=None,
                              conf=None,
                              jars=None):
        import subprocess
        import pyspark
        from zoo.util.utils import get_node_ip
        from zoo.util.engine import get_analytics_zoo_classpath
        from bigdl.util.engine import get_bigdl_classpath

        if 'PYSPARK_PYTHON' not in os.environ:
            os.environ["PYSPARK_PYTHON"] = detect_python_location()
        if not master:
            pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
            zoo_standalone_home = os.path.abspath(
                __file__ + "/../../share/bin/standalone")
            node_ip = get_node_ip()
            SparkRunner.standalone_env = {
                "SPARK_HOME": pyspark_home,
                "ZOO_STANDALONE_HOME": zoo_standalone_home,
                # If this is not set, the master defaults to the hostname instead of the IP.
                "SPARK_MASTER_HOST": node_ip
            }
            # The scripts installed from pip don't have execution permission,
            # so grant it first.
            pro = subprocess.Popen(
                ["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
            os.waitpid(pro.pid, 0)
            # Start master
            start_master_pro = subprocess.Popen(
                "{}/sbin/start-master.sh".format(zoo_standalone_home),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_master_pro.pid, 0)
            master = "spark://{}:7077".format(
                node_ip)  # 7077 is the default port
            # Start worker
            start_worker_pro = subprocess.Popen(
                "{}/sbin/start-worker.sh {}".format(zoo_standalone_home,
                                                    master),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_worker_pro.pid, 0)
        else:  # A Spark standalone cluster has already been started by the user.
            assert master.startswith("spark://"), \
                "Please input a valid master address for your Spark standalone cluster: " \
                "spark://master:port"

        # Start pyspark-shell
        submit_args = " --master " + master
        submit_args = submit_args + " --driver-cores {} --driver-memory {} --num-executors {}" \
                                    " --executor-cores {} --executor-memory {}"\
            .format(driver_cores, driver_memory, num_executors, executor_cores, executor_memory)
        if extra_python_lib:
            submit_args = submit_args + " --py-files {}".format(
                extra_python_lib)
        if jars:
            submit_args = submit_args + " --jars {}".format(jars)
        submit_args = submit_args + " pyspark-shell"
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

        zoo_bigdl_jar_path = ":".join(
            [get_analytics_zoo_classpath(),
             get_bigdl_classpath()])
        spark_conf = init_spark_conf(conf) \
            .set("spark.driver.cores", driver_cores) \
            .set("spark.driver.memory", driver_memory) \
            .set("spark.executor.instances", num_executors) \
            .set("spark.executor.cores", executor_cores) \
            .set("spark.cores.max", num_executors * executor_cores) \
            .set("spark.executorEnv.PYTHONHOME",
                 "/".join(detect_python_location().split("/")[:-2]))
        if extra_executor_memory_for_ray:
            spark_conf.set("spark.executor.memoryOverhead",
                           extra_executor_memory_for_ray)
        if spark_conf.contains("spark.executor.extraClassPath"):
            spark_conf.set(
                "spark.executor.extraClassPath",
                "{}:{}".format(zoo_bigdl_jar_path,
                               conf.get("spark.executor.extraClassPath")))
        else:
            spark_conf.set("spark.executor.extraClassPath", zoo_bigdl_jar_path)

        sc = init_nncontext(spark_conf,
                            spark_log_level=self.spark_log_level,
                            redirect_spark_log=self.redirect_spark_log)
        return sc
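
A hedged usage sketch for this older variant, which builds PYSPARK_SUBMIT_ARGS itself and hands the resulting conf to init_nncontext; the master address and the py-files archive are placeholders:

# Attach to an existing standalone cluster and ship extra Python dependencies:
sc = runner.init_spark_standalone(num_executors=2, executor_cores=8,
                                  executor_memory="10g",
                                  master="spark://10.0.0.1:7077",
                                  extra_python_lib="deps.zip")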