Example #1
def _setup_logging(reporter: Reporter, log_dir: str) -> Tuple[str, str]:
    """Sets up logging directories and files.

    :param reporter: Reporter responsible for logging.
    :param log_dir: Log directory path on the file system.

    :returns: Tuple containing the path of the tensorboard directory
        and the trial log file.
    """
    reporter.set_trial_id(0)
    tb_logdir = log_dir + "/" + "training_logs_" + str(reporter.partition_id)
    trial_log_file = tb_logdir + "/output.log"
    # If trial is repeated, delete trial directory, except log file
    if EnvSing.get_instance().exists(tb_logdir):
        util.clean_dir(tb_logdir, [trial_log_file])
    else:
        EnvSing.get_instance().mkdir(tb_logdir)
    reporter.init_logger(trial_log_file)
    return tb_logdir, trial_log_file
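
For reference, the clean-or-create behavior above can be sketched with only the standard library; `prepare_trial_dir` and its `keep` argument are illustrative names, not part of maggy's API:

import os
import shutil


def prepare_trial_dir(tb_logdir: str, keep: list) -> None:
    """Recreate the trial directory, keeping only the files listed in keep."""
    if os.path.isdir(tb_logdir):
        # Trial is repeated: remove everything except the kept files.
        for entry in os.listdir(tb_logdir):
            path = os.path.join(tb_logdir, entry)
            if path in keep:
                continue
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
    else:
        os.makedirs(tb_logdir)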
Example #2
    def wrapper_function(_: Any) -> None:
        """Patched function from dist_executor_fn factory.

        :param _: Necessary catch for the iterator given by Spark to the
        function upon foreach calls. Can safely be disregarded.
        """
        EnvSing.get_instance().set_ml_id(app_id, run_id)
        partition_id, _ = util.get_partition_attempt_id()
        client = Client(server_addr, partition_id, 0, hb_interval, secret)
        log_file = log_dir + "/executor_" + str(partition_id) + ".log"

        builtin_print = __builtin__.print
        reporter = Reporter(log_file, partition_id, 0, builtin_print)

        def maggy_print(*args, **kwargs):
            builtin_print(*args, **kwargs)
            reporter.log(" ".join(str(x) for x in args), True)

        __builtin__.print = maggy_print

        try:
            _register_with_servers(client, reporter, partition_id)
            tb_logdir, trial_log_file = _setup_logging(reporter, log_dir)
            reporter.log("Awaiting worker reservations.", True)
            client.await_reservations()
            reporter.log("Reservations complete, configuring PyTorch.", True)
            master_config = client.get_exec_config()[0]
            if not master_config:
                reporter.log("RuntimeError: PyTorch registration failed.",
                             True)
                raise RuntimeError("PyTorch registration failed.")
            addr, port = master_config["host_port"].split(":")
            torch_config = {
                "MASTER_ADDR": addr,
                "MASTER_PORT": port,
                "WORLD_SIZE": str(master_config["num_executors"]),
                "RANK": str(partition_id),
                "LOCAL_RANK": str(0),  # DeepSpeed requires local rank.
                "NCCL_BLOCKING_WAIT": "1",
                "NCCL_DEBUG": "INFO",
            }
            tensorboard._register(tb_logdir)
            reporter.log(f"Torch config is {torch_config}", True)

            _setup_torch_env(torch_config)
            _sanitize_config(config)
            _init_cluster(timeout=60, random_seed=0)
            module = _wrap_module_dispatcher(config)
            _monkey_patch_pytorch(config.zero_lvl)

            reporter.log("Starting distributed training.", True)
            sig = inspect.signature(train_fn)
            if sig.parameters.get("reporter", None):
                retval = train_fn(
                    module=module,
                    hparams=config.hparams,
                    train_set=config.train_set,
                    test_set=config.test_set,
                    reporter=reporter,
                )
            else:
                retval = train_fn(
                    module=module,
                    hparams=config.hparams,
                    train_set=config.train_set,
                    test_set=config.test_set,
                )

            retval = util.handle_return_val(retval, tb_logdir, "Metric",
                                            trial_log_file)
            # Don't exit until all executors are done (else NCCL crashes).
            dist.barrier()
            reporter.log("Finished distributed training.", True)
            client.finalize_metric(retval, reporter)
        except:  # noqa: E722
            reporter.log(traceback.format_exc())
            raise
        finally:
            reporter.close_logger()
            client.stop()
            client.close()
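
`_setup_torch_env` is not shown in this snippet. A minimal sketch, assuming it simply exports the `torch_config` entries as environment variables for `torch.distributed`'s `env://` initialization, could look like this (`_setup_torch_env_sketch` is a hypothetical name):

import os


def _setup_torch_env_sketch(torch_config: dict) -> None:
    # Export MASTER_ADDR, MASTER_PORT, WORLD_SIZE, RANK, etc. so that
    # torch.distributed.init_process_group(init_method="env://") can read them.
    for name, value in torch_config.items():
        os.environ[name] = str(value)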
Example #3
    def _wrapper_fun(iter):
        """Wraps the user-supplied training function so it can be passed to
        the Spark executors.

        Args:
            iter: Iterator given by Spark to the function upon foreach calls.
                It is not used and can safely be disregarded.

        Returns:
            None
        """
        experiment_utils._set_ml_id(app_id, run_id)

        # get task context information to determine executor identifier
        partition_id, task_attempt = util.get_partition_attempt_id()

        client = rpc.Client(server_addr, partition_id, task_attempt,
                            hb_interval, secret)
        log_file = (log_dir + "/executor_" + str(partition_id) + "_" +
                    str(task_attempt) + ".log")

        # save the builtin print
        original_print = __builtin__.print

        reporter = Reporter(log_file, partition_id, task_attempt,
                            original_print)

        def maggy_print(*args, **kwargs):
            """Maggy custom print() function."""
            original_print(*args, **kwargs)
            reporter.log(" ".join(str(x) for x in args), True)

        # override the builtin print
        __builtin__.print = maggy_print

        try:
            client_addr = client.client_addr

            host_port = client_addr[0] + ":" + str(client_addr[1])

            exec_spec = {}
            exec_spec["partition_id"] = partition_id
            exec_spec["task_attempt"] = task_attempt
            exec_spec["host_port"] = host_port
            exec_spec["trial_id"] = None

            reporter.log("Registering with experiment driver", False)
            client.register(exec_spec)

            client.start_heartbeat(reporter)

            # blocking
            trial_id, parameters = client.get_suggestion(reporter)

            while not client.done:
                if experiment_type == "ablation":
                    ablation_params = {
                        "ablated_feature":
                        parameters.get("ablated_feature", "None"),
                        "ablated_layer":
                        parameters.get("ablated_layer", "None"),
                    }
                    parameters.pop("ablated_feature")
                    parameters.pop("ablated_layer")

                tb_logdir = log_dir + "/" + trial_id
                trial_log_file = tb_logdir + "/output.log"
                reporter.set_trial_id(trial_id)

                # If trial is repeated, delete trial directory, except log file
                if hopshdfs.exists(tb_logdir):
                    util._clean_dir(tb_logdir, [trial_log_file])
                else:
                    hopshdfs.mkdir(tb_logdir)

                reporter.init_logger(trial_log_file)
                tensorboard._register(tb_logdir)
                if experiment_type == "ablation":
                    hopshdfs.dump(
                        json.dumps(ablation_params,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                else:
                    hopshdfs.dump(
                        json.dumps(parameters,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                try:
                    reporter.log("Starting Trial: {}".format(trial_id), False)
                    reporter.log("Trial Configuration: {}".format(parameters),
                                 False)

                    if experiment_type == "optimization":
                        tensorboard._write_hparams(parameters, trial_id)

                    sig = inspect.signature(map_fun)
                    if sig.parameters.get("reporter", None):
                        retval = map_fun(**parameters, reporter=reporter)
                    else:
                        retval = map_fun(**parameters)

                    if experiment_type == "optimization":
                        tensorboard._write_session_end()

                    retval = util._handle_return_val(retval, tb_logdir,
                                                     optimization_key,
                                                     trial_log_file)

                except exceptions.EarlyStopException as e:
                    retval = e.metric
                    reporter.log("Early Stopped Trial.", False)

                reporter.log("Finished Trial: {}".format(trial_id), False)
                reporter.log("Final Metric: {}".format(retval), False)
                client.finalize_metric(retval, reporter)

                # blocking
                trial_id, parameters = client.get_suggestion(reporter)

        except:  # noqa: E722
            reporter.log(traceback.format_exc(), False)
            raise
        finally:
            reporter.close_logger()
            client.stop()
            client.close()
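
The `inspect.signature` check used above is a small reusable pattern: the optional `reporter` argument is forwarded only when the user function declares it. A self-contained sketch of the idea (function names are illustrative):

import inspect


def call_with_optional_reporter(fn, parameters, reporter):
    # Forward `reporter` only if the user function declares such a parameter.
    if "reporter" in inspect.signature(fn).parameters:
        return fn(**parameters, reporter=reporter)
    return fn(**parameters)


def user_fn(lr, reporter=None):
    return lr * 2


print(call_with_optional_reporter(user_fn, {"lr": 0.1}, reporter=None))  # 0.2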
Example #4
    def spark_wrapper_function(_: Any) -> None:
        """Patched function from tf_dist_executor_fn factory.

        :param _: Necessary catch for the iterator given by Spark to the
        function upon foreach calls. Can safely be disregarded.
        """
        EnvSing.get_instance().set_ml_id(app_id, run_id)
        partition_id, _ = util.get_partition_attempt_id()
        client = EnvSing.get_instance().get_client(
            server_addr,
            partition_id,
            hb_interval,
            secret,
            socket.socket(socket.AF_INET, socket.SOCK_STREAM),
        )
        log_file = log_dir + "/executor_" + str(partition_id) + ".log"

        reporter = Reporter(log_file, partition_id, 0, __builtin__.print)
        builtin_print = __builtin__.print
        # Set up trial logging early; the log paths are retrieved again after
        # registration below.
        _setup_logging(reporter, log_dir)

        def maggy_print(*args, **kwargs):
            builtin_print(*args, **kwargs)
            reporter.log(" ".join(str(x) for x in args), True)

        __builtin__.print = maggy_print

        try:
            host = EnvSing.get_instance().get_ip_address()

            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_socket.bind(("", 0))
            port = tmp_socket.getsockname()[1] + 1

            host_port = host + ":" + str(port)

            _register_with_servers(client, reporter, partition_id)
            tb_logdir, trial_log_file = _setup_logging(reporter, log_dir)
            tensorboard._register(tb_logdir)

            reporter.log("Awaiting worker reservations.")
            client.await_reservations()

            reservations = client.get_message("RESERVATIONS")
            reporter.log(reservations)
            reporter.log(host_port)
            reporter.log("Reservations complete, configuring Tensorflow.")

            if not reservations:
                reporter.log("Tensorflow registration failed, exiting from all tasks.")
                return

            workers_host_port = []

            for i in list(reservations["cluster"]):
                if len(reservations["cluster"][i]) > 0:
                    workers_host_port.append(reservations["cluster"][i][0])

            is_chief = False
            task_index = find_index(host_port, reservations)
            tf_config = reservations
            if task_index == -1:
                tf_config["task"] = {"type": "chief", "index": 0}
                is_chief = True
            else:
                tf_config["task"] = {"type": "worker", "index": task_index}

            last_worker_index = len(reservations["cluster"]["worker"]) - 1
            if not last_worker_index < 0:
                evaluator_node = reservations["cluster"]["worker"][last_worker_index]
                reservations["cluster"]["evaluator"] = [evaluator_node]
                del reservations["cluster"]["worker"][last_worker_index]
                if evaluator_node == host_port:
                    tf_config["task"] = {"type": "evaluator", "index": 0}

            reporter.log(f"Tensorflow config is {tf_config}")

            _setup_tf_config(tf_config)

            strategy = tf.distribute.MultiWorkerMirroredStrategy
            model = _wrap_model(config, strategy, is_chief)

            if config.dataset is not None and config.process_data is not None:
                config.dataset = _consume_data(config)

            reporter.log(f"index of slice {partition_id}")
            reporter.log("Starting distributed training.")
            sig = inspect.signature(train_fn)

            kwargs = {}
            if sig.parameters.get("model", None):
                kwargs["model"] = model
            if sig.parameters.get("dataset", None):
                kwargs["dataset"] = config.dataset
            if sig.parameters.get("hparams", None):
                kwargs["hparams"] = config.hparams

            if sig.parameters.get("reporter", None):
                kwargs["reporter"] = reporter
                retval = train_fn(**kwargs)
            else:
                retval = train_fn(**kwargs)

            # Normalize retval to work with util.handle_return_val: if there
            # is more than one metric, retval will be a list and retval[0]
            # will contain the final loss.
            retval_list = []
            if isinstance(retval, dict):
                for item in retval.items():
                    retval_list.append(item[1])
                retval = retval_list
            retval = {"Metric": retval[0] if isinstance(retval, list) else retval}
            retval = util.handle_return_val(retval, tb_logdir, "Metric", trial_log_file)
            reporter.log("Finished distributed training.")
            client.finalize_metric(retval, reporter)
        except:  # noqa: E722
            reporter.log(traceback.format_exc())
            raise
        finally:
            reporter.close_logger()
            client.stop()
            client.close()
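
`_setup_tf_config` is not shown either. Assuming it follows TensorFlow's standard convention, a plausible sketch is to serialize the cluster and task description into the `TF_CONFIG` environment variable, which `MultiWorkerMirroredStrategy` reads when it is constructed (`_setup_tf_config_sketch` is a hypothetical name):

import json
import os


def _setup_tf_config_sketch(tf_config: dict) -> None:
    # Assumption: tf_config holds "cluster" and "task" keys in the layout
    # expected by tf.distribute, e.g.
    # {"cluster": {"worker": ["host:1234"]}, "task": {"type": "chief", "index": 0}}
    os.environ["TF_CONFIG"] = json.dumps(tf_config)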
Example #5
    def python_wrapper_function(_: Any) -> Any:
        """Patched function from tf_dist_executor_fn factory.

        :param _: Necessary catch for the iterator given by Spark to the
        function upon foreach calls. Can safely be disregarded.
        """
        EnvSing.get_instance().set_ml_id(app_id, run_id)
        partition_id, _ = util.get_partition_attempt_id()

        log_file = log_dir + "/executor_" + str(partition_id) + ".log"

        reporter = Reporter(log_file, partition_id, 0, __builtin__.print)
        builtin_print = __builtin__.print

        def maggy_print(*args, **kwargs):
            builtin_print(*args, **kwargs)
            reporter.log(" ".join(str(x) for x in args), True)

        __builtin__.print = maggy_print

        try:
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_socket.bind(("", 0))

            tb_logdir, trial_log_file = _setup_logging(reporter, log_dir)
            tensorboard._register(tb_logdir)
            tf_config = {}

            physical_devices = tf.config.list_physical_devices("GPU")
            if physical_devices:
                strategy = tf.distribute.MultiWorkerMirroredStrategy
                for count, pd in enumerate(physical_devices):
                    # Designate the first GPU as chief, all others as workers.
                    if pd.name.endswith("GPU:0"):
                        tf_config["task"] = {"type": "chief", "index": 0}
                    else:
                        tf_config["task"] = {"type": "worker", "index": count}
            else:  # Use the Default Strategy
                strategy = tf.distribute.get_strategy

            model = _wrap_model(config, strategy, False)

            if config.dataset is not None and config.process_data is not None:
                config.dataset = _consume_data(config)

            reporter.log(f"index of slice {partition_id}")
            reporter.log("Starting distributed training.")
            sig = inspect.signature(train_fn)

            kwargs = {}
            if sig.parameters.get("model", None):
                kwargs["model"] = model
            if sig.parameters.get("dataset", None):
                kwargs["dataset"] = config.dataset
            if sig.parameters.get("hparams", None):
                kwargs["hparams"] = config.hparams

            if sig.parameters.get("reporter", None):
                kwargs["reporter"] = reporter
                retval = train_fn(**kwargs)
            else:
                retval = train_fn(**kwargs)

            # Normalize retval to work with util.handle_return_val: if there
            # is more than one metric, retval will be a list and retval[0]
            # will contain the final loss.
            retval_list = []
            if isinstance(retval, dict):
                for item in retval.items():
                    retval_list.append(item[1])
                retval = retval_list
            retval = {"Metric": retval[0] if isinstance(retval, list) else retval}
            retval = util.handle_return_val(retval, tb_logdir, "Metric", trial_log_file)
            reporter.log("Finished distributed training.")
        except:  # noqa: E722
            reporter.log(traceback.format_exc())
            raise
        finally:
            reporter.close_logger()
            __builtin__.print = builtin_print
        return retval
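
The return-value normalization at the end of both TensorFlow wrappers can be read in isolation; a minimal sketch of the same logic with plain data (`normalize_metric` is an illustrative name):

def normalize_metric(retval):
    # A dict is reduced to the list of its values; the first entry (or the
    # scalar itself) becomes the reported metric.
    if isinstance(retval, dict):
        retval = list(retval.values())
    return {"Metric": retval[0] if isinstance(retval, list) else retval}


print(normalize_metric({"loss": 0.3, "acc": 0.9}))  # {'Metric': 0.3}
print(normalize_metric(0.42))                       # {'Metric': 0.42}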
Example #6
    def _wrapper_fun(_: Any) -> None:
        """Patched function from trial_executor_fn factory.

        :param _: Necessary catch for the iterator given by Spark to the
        function upon foreach calls. Can safely be disregarded.
        """
        env = EnvSing.get_instance()

        env.set_ml_id(app_id, run_id)

        # get task context information to determine executor identifier
        partition_id, task_attempt = util.get_partition_attempt_id()

        client = EnvSing.get_instance().get_client(
            server_addr,
            partition_id,
            hb_interval,
            secret,
            socket.socket(socket.AF_INET, socket.SOCK_STREAM),
        )
        log_file = (log_dir + "/executor_" + str(partition_id) + "_" +
                    str(task_attempt) + ".log")

        # save the builtin print
        original_print = __builtin__.print

        reporter = Reporter(log_file, partition_id, task_attempt,
                            original_print)

        def maggy_print(*args, **kwargs):
            """Maggy custom print() function."""
            original_print(*args, **kwargs)
            reporter.log(" ".join(str(x) for x in args), True)

        # override the builtin print
        __builtin__.print = maggy_print

        try:
            client_addr = client.client_addr

            host_port = client_addr[0] + ":" + str(client_addr[1])

            exec_spec = {}
            exec_spec["partition_id"] = partition_id
            exec_spec["task_attempt"] = task_attempt
            exec_spec["host_port"] = host_port
            exec_spec["trial_id"] = None

            reporter.log("Registering with experiment driver", False)
            client.register(exec_spec)

            client.start_heartbeat(reporter)

            # blocking
            trial_id, parameters = client.get_suggestion(reporter)
            while not client.done:
                if experiment_type == "ablation":
                    ablation_params = {
                        "ablated_feature":
                        parameters.get("ablated_feature", "None"),
                        "ablated_layer":
                        parameters.get("ablated_layer", "None"),
                    }
                    parameters.pop("ablated_feature")
                    parameters.pop("ablated_layer")

                tb_logdir = log_dir + "/" + trial_id
                trial_log_file = tb_logdir + "/output.log"
                reporter.set_trial_id(trial_id)

                # If trial is repeated, delete trial directory, except log file
                if env.exists(tb_logdir):
                    util.clean_dir(tb_logdir, [trial_log_file])
                else:
                    env.mkdir(tb_logdir)

                reporter.init_logger(trial_log_file)
                tensorboard._register(tb_logdir)
                if experiment_type == "ablation":
                    env.dump(
                        json.dumps(ablation_params,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                else:
                    env.dump(
                        json.dumps(parameters,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                model = config.model
                dataset = config.dataset

                try:
                    reporter.log("Starting Trial: {}".format(trial_id), False)
                    reporter.log("Trial Configuration: {}".format(parameters),
                                 False)

                    if experiment_type == "optimization":
                        tensorboard._write_hparams(parameters, trial_id)

                    sig = inspect.signature(train_fn)
                    kwargs = {}
                    if sig.parameters.get("model", None):
                        kwargs["model"] = model
                    if sig.parameters.get("dataset", None):
                        kwargs["dataset"] = dataset
                    if sig.parameters.get("hparams", None):
                        kwargs["hparams"] = parameters

                    if sig.parameters.get("reporter", None):
                        kwargs["reporter"] = reporter
                        retval = train_fn(**kwargs)
                    else:
                        retval = train_fn(**kwargs)

                    # Normalize the return value so it works with
                    # util.handle_return_val: a dict is reduced to the list of
                    # its values and the first entry is used as the metric.
                    retval_list = []
                    if isinstance(retval, dict):
                        for item in retval.items():
                            retval_list.append(item[1])
                        retval = retval_list
                    retval = {
                        "Metric":
                        retval[0] if isinstance(retval, list) else retval
                    }
                    retval = util.handle_return_val(retval, tb_logdir,
                                                    optimization_key,
                                                    trial_log_file)

                except exceptions.EarlyStopException as e:
                    retval = e.metric
                    reporter.log("Early Stopped Trial.", False)

                reporter.log("Finished Trial: {}".format(trial_id), False)
                reporter.log("Final Metric: {}".format(retval), False)
                client.finalize_metric(retval, reporter)

                # blocking
                trial_id, parameters = client.get_suggestion(reporter)

        except:  # noqa: E722
            reporter.log(traceback.format_exc(), False)
            raise
        finally:
            reporter.close_logger()
            client.stop()
            client.close()
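
All six examples rely on the same print monkey-patching trick. A stand-alone sketch of the idea, using the Python 3 `builtins` module in place of the `__builtin__` alias above and a hypothetical `log` callback instead of the Reporter:

import builtins


def patch_print(log):
    original_print = builtins.print

    def maggy_style_print(*args, **kwargs):
        original_print(*args, **kwargs)
        log(" ".join(str(x) for x in args))

    builtins.print = maggy_style_print
    return original_print


captured = []
original = patch_print(captured.append)
try:
    print("hello", 42)  # goes to stdout and is appended to `captured`
finally:
    builtins.print = original  # always restore the builtin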