示例#1
0
def _restart_debugging(interactive=True):
    """Restart TensorBoard with the TensorFlow debugger enabled.

    Kills the currently running TensorBoard (tracked in the module-global
    ``tb_pid``), picks a fresh free port for the debugger, and relaunches
    TensorBoard against the same logdir and HTTP port.

    Args:
        interactive: If True, enable the interactive debugger plugin
            (``--debugger_port``); otherwise expose the non-interactive
            gRPC debug data server (``--debugger_data_server_grpc_port``).

    Returns:
        The debugger endpoint as a ``'localhost:<port>'`` string.
    """
    global tb_pid

    # Kill existing TB before rebinding its debugger port.
    proc = subprocess.Popen(["kill", str(tb_pid)])
    proc.wait()

    # Ask the OS for a free port, then release it for TensorBoard to bind.
    # NOTE(review): small race window between close() and TB re-binding it.
    debugger_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    debugger_socket.bind(('', 0))
    debugger_addr, debugger_port = debugger_socket.getsockname()
    debugger_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''  # keep TensorBoard off the GPUs
    tb_env['LC_ALL'] = 'C'
    tb_env['TMPDIR'] = os.getcwd()

    global pypath
    global tb_path
    global tb_port

    # The two modes differ only in which debugger flag is passed, so build
    # that flag up front instead of duplicating the whole Popen call.
    if interactive:
        debugger_flag = "--debugger_port=%d" % debugger_port
    else:
        debugger_flag = "--debugger_data_server_grpc_port=%d" % debugger_port

    tb_proc = subprocess.Popen(
        [
            pypath, tb_path,
            "--logdir=%s" % logdir(),
            "--port=%d" % tb_port,
            debugger_flag,
            "--host=%s" % "0.0.0.0"
        ],
        env=tb_env,
        preexec_fn=util._on_executor_exit('SIGTERM'))
    tb_pid = tb_proc.pid

    # Give TensorBoard a moment to come up before callers connect.
    time.sleep(2)

    return 'localhost:' + str(debugger_port)
示例#2
0
def visualize(hdfs_root_logdir):
    """Visualize all TensorBoard events for a given path in HopsFS.

    Intended for use after running TensorFlow jobs, to inspect them all in
    the same TensorBoard. ``tflauncher.launch`` returns the path in HopsFS
    which should be handed as argument to this method.

    Blocks until the launched TensorBoard process exits.

    Args:
      :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """

    sc = util._find_spark().sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    # Recreate a clean local events directory.
    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    # Ask the OS for a free port, then release it for TensorBoard to bind.
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    tb_addr, tb_port = tb_socket.getsockname()

    tb_path = util._find_tensorboard()

    tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''  # keep TensorBoard off the GPUs
    tb_env['LC_ALL'] = 'C'

    tb_proc = subprocess.Popen([
        pypath, tb_path,
        "--logdir=%s" % logdir,
        "--port=%d" % tb_port,
        "--host=%s" % "0.0.0.0"
    ],
                               env=tb_env,
                               preexec_fn=util._on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs._get_experiments_dir(
    ) + "/" + app_id + "/TensorBoard.visualize"
    #dump tb host:port to hdfs so clients can discover this TensorBoard
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    # Pull every non-.log entry from the HDFS root logdir into the local one.
    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if not extension == '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    # BUG FIX: the process was started without stdout/stderr pipes, so the
    # old wait() + communicate() pair always printed "None" twice; a plain
    # wait() is sufficient (TensorBoard's output goes to our own streams).
    tb_proc.wait()
示例#3
0
def start_beam_jobserver(flink_session_name,
                         artifacts_dir="Resources",
                         jobserver_jar=None,
                         sdk_worker_parallelism=1):
    """
    Start the Java Beam job server that connects to the flink session cluster. User needs to provide the
    job name that started the Flink session and optionally the worker parallelism.

    Args:
      :flink_session_name: Job name that runs the Flink session.
      :artifacts_dir: Default dataset to store artifacts.
      :jobserver_jar: Portability framework jar filename; defaults to the
       Beam Flink runner jar shipped in the Flink conf dir.
      :sdk_worker_parallelism: Default parallelism for SDK worker processes. This option is only applied when the
      pipeline option sdkWorkerParallelism is set to 0.Default is 1, If 0, worker parallelism will be dynamically
      decided by runner.See also: sdkWorkerParallelism Pipeline Option (default: 1). For further documentation,
      please refer to Apache Beam docs.
    Returns:
        artifact_port, expansion_port, job_host, job_port, jobserver.pid
    """
    if jobserver_jar is None:
        jobserver_jar = os.path.join(
            util.get_flink_conf_dir(),
            "beam-runners-flink-1.8-job-server-2.15.0.jar")
    # Get Flink master URL (flink session cluster) from an ExecutionDTO
    method = constants.HTTP_CONFIG.HTTP_GET
    resource_url = constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_REST_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_PROJECT_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   hopsfs.project_id() + constants.DELIMITERS.SLASH_DELIMITER + \
                   "jobs" + constants.DELIMITERS.SLASH_DELIMITER + \
                   flink_session_name + constants.DELIMITERS.SLASH_DELIMITER + \
                   "executions" + \
                   "?limit=1&offset=0&sort_by=submissionTime:desc"
    response = util.send_request(method, resource_url)
    response_object = response.json()
    flink_master_url = response_object['items'][0]['flinkMasterURL']
    # NOTE(review): random ports are not reserved, so collisions are possible.
    artifact_port = randint(10000, 65000)
    expansion_port = randint(10000, 65000)
    job_port = randint(10000, 65000)
    job_host = socket.getfqdn()
    log_base_path = ""
    if 'LOG_DIRS' in os.environ:
        log_base_path += os.environ['LOG_DIRS'] + "/"

    beam_jobserver_log = log_base_path + "beamjobserver-" + hopsfs.project_name().lower() + "-" + flink_session_name + \
                          "-" + str(job_port) + ".log"
    # BUG FIX: the log file used to be opened twice in "wb" mode (once for
    # stdout, once for stderr); the two handles tracked independent offsets
    # and clobbered each other's writes. Open it once and merge stderr in.
    with open(beam_jobserver_log, "wb") as log_file:
        jobserver = subprocess.Popen(
            [
                "java", "-jar", jobserver_jar,
                "--artifacts-dir=%s" % hopsfs.project_path() + artifacts_dir,
                "--flink-master-url=%s" % flink_master_url,
                "--artifact-port=%d" % artifact_port,
                "--expansion-port=%d" % expansion_port,
                "--job-host=%s" % job_host,
                "--job-port=%d" % job_port,
                "--sdk-worker-parallelism=%d" % sdk_worker_parallelism
            ],
            stdout=log_file,
            stderr=subprocess.STDOUT,
            preexec_fn=util._on_executor_exit('SIGTERM'))
    global clusters
    clusters.append(flink_session_name)
    global jobserver_host
    jobserver_host = job_host
    global jobserver_port
    jobserver_port = job_port
    return {
        "jobserver_log": beam_jobserver_log,
        "artifact_port": artifact_port,
        "expansion_port": expansion_port,
        "job_host": job_host,
        "job_port": job_port,
        "jobserver.pid": jobserver.pid
    }
示例#4
0
def _register(hdfs_exec_dir, endpoint_dir, exec_num, local_logdir=False):
    """Start (or restart) a per-executor TensorBoard and register its endpoint.

    Kills any TensorBoard previously tracked in the module-global ``tb_pid``,
    resets module state, launches a new TensorBoard serving either a local
    scratch directory or the HDFS execution directory, and dumps its
    ``http://host:port`` URL to a per-task file under *endpoint_dir*.

    Args:
        hdfs_exec_dir: HDFS directory holding this execution's event files;
            used as the TensorBoard logdir unless ``local_logdir`` is True.
        endpoint_dir: directory in which the ``TensorBoard.task<N>`` endpoint
            file is written.
        exec_num: executor/task number, used to name the endpoint file.
        local_logdir: if True, serve a freshly created ``./local_logdir/``
            directory instead of ``hdfs_exec_dir``.

    Returns:
        Tuple ``(endpoint, tb_pid)``: the HDFS path of the dumped endpoint
        file and the PID of the TensorBoard process.
    """
    global tb_pid

    # A nonzero tb_pid means a previous TensorBoard is (or was) running.
    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])

    # NOTE(review): _reset_global() is defined elsewhere in this module;
    # the branch below assumes it resets tb_pid to 0 — confirm.
    _reset_global()

    global events_logdir
    events_logdir = hdfs_exec_dir

    global local_logdir_bool
    local_logdir_bool = local_logdir

    if tb_pid == 0:
        global pypath
        pypath = os.getenv("PYSPARK_PYTHON")

        # Ask the OS for a free port, then release it for TensorBoard to
        # bind. Small race window between close() and the TB launch below.
        tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tb_socket.bind(('', 0))
        global tb_port
        tb_addr, tb_port = tb_socket.getsockname()

        global tb_path
        tb_path = experiment_utils._find_tensorboard()

        tb_socket.close()

        tb_env = _init_tb_env()

        global local_logdir_path
        if local_logdir:
            # Recreate a clean local scratch logdir for this run.
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
                os.makedirs(local_logdir_path)
            else:
                os.makedirs(local_logdir_path)

            local_logdir_path = local_logdir_path + '/'
            tb_proc = subprocess.Popen(
                [
                    pypath, tb_path,
                    "--logdir=%s" % local_logdir_path,
                    "--port=%d" % tb_port,
                    "--host=%s" % "0.0.0.0"
                ],
                env=tb_env,
                preexec_fn=util._on_executor_exit('SIGTERM'))
        else:
            # Serve the HDFS execution directory directly.
            tb_proc = subprocess.Popen(
                [
                    pypath, tb_path,
                    "--logdir=%s" % events_logdir,
                    "--port=%d" % tb_port,
                    "--host=%s" % "0.0.0.0"
                ],
                env=tb_env,
                preexec_fn=util._on_executor_exit('SIGTERM'))

        tb_pid = tb_proc.pid

        host = socket.gethostname()
        global tb_url
        tb_url = "http://{0}:{1}".format(host, tb_port)
        global endpoint
        endpoint = endpoint_dir + "/TensorBoard.task" + str(exec_num)

    # Dump tb host:port to hdfs. NOTE(review): if the branch above did not
    # run, this relies on tb_url/endpoint left over in module globals.
    pydoop.hdfs.dump(tb_url, endpoint, user=hopshdfs.project_user())

    return endpoint, tb_pid
示例#5
0
def start_beam_jobserver(flink_session_name,
                         artifacts_dir="Resources",
                         jobserver_jar=None):
    """
    Start the Java Beam job server that connects to the flink session cluster. User needs to provide the
    job name that started the Flink session and optionally the worker parallelism.

    Args:
      :flink_session_name: Job name that runs the Flink session.
      :artifacts_dir: Default dataset to store artifacts.
      :jobserver_jar: Portability framework jar filename.
    Returns:
        artifact_port, expansion_port, job_host, job_port, jobserver.pid
    """
    if jobserver_jar is None:
        jobserver_jar = os.path.join(util.get_flink_conf_dir(), "beam-runners-flink-1.9-job-server-2.19.0.jar")
    # Get Flink master URL (flink session cluster) from an ExecutionDTO
    method = constants.HTTP_CONFIG.HTTP_GET
    resource_url = constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_REST_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_PROJECT_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   hopsfs.project_id() + constants.DELIMITERS.SLASH_DELIMITER + \
                   "jobs" + constants.DELIMITERS.SLASH_DELIMITER + \
                   flink_session_name + constants.DELIMITERS.SLASH_DELIMITER + \
                   "executions" + \
                   "?limit=1&offset=0&sort_by=submissionTime:desc"
    response = util.send_request(method, resource_url)
    response_object = response.json()
    flink_master_url = response_object['items'][0]['flinkMasterURL']
    # NOTE(review): random ports are not reserved, so collisions are possible.
    artifact_port = randint(10000, 65000)
    expansion_port = randint(10000, 65000)
    job_port = randint(10000, 65000)
    job_host = socket.getfqdn()
    log_base_path = ""
    if 'LOG_DIRS' in os.environ:
        log_base_path += os.environ['LOG_DIRS'] + "/"

    beam_jobserver_log = log_base_path + "beamjobserver-" + hopsfs.project_name().lower() + "-" + flink_session_name + \
                          "-" + str(job_port) + ".log"
    # BUG FIX: the log file used to be opened twice in "wb" mode (once for
    # stdout, once for stderr); the two handles tracked independent offsets
    # and clobbered each other's writes. Open it once and merge stderr in.
    with open(beam_jobserver_log, "wb") as log_file:
        jobserver = subprocess.Popen(["java",
                                       "-jar", jobserver_jar,
                                       "--artifacts-dir=%s" % hopsfs.project_path() + artifacts_dir,
                                       "--flink-master-url=%s" % flink_master_url,
                                       "--artifact-port=%d" % artifact_port,
                                       "--expansion-port=%d" % expansion_port,
                                       "--job-host=%s" % job_host,
                                       "--job-port=%d" % job_port],
                                      stdout=log_file,
                                      stderr=subprocess.STDOUT,
                                      preexec_fn=util._on_executor_exit('SIGTERM'))
    global clusters
    clusters.append(flink_session_name)
    global jobserver_host
    jobserver_host = job_host
    global jobserver_port
    jobserver_port = job_port
    return {"jobserver_log": beam_jobserver_log,
            "artifact_port": artifact_port,
            "expansion_port": expansion_port,
            "job_host": job_host,
            "job_port": job_port,
            "jobserver.pid": jobserver.pid}
示例#6
0
def start_beam_jobserver(
    flink_session_name,
    artifacts_dir="Resources",
    jobserver_jar=None,
    jobserver_main_class="org.apache.beam.runners.flink.FlinkJobServerDriver",
    service_discover_jar=None):
    """
    Start the Java Beam job server that connects to the flink session cluster. User needs to provide the
    job name that started the Flink session and optionally the worker parallelism.

    Args:
      :flink_session_name: Job name that runs the Flink session.
      :artifacts_dir: Default dataset to store artifacts.
      :jobserver_jar: Portability framework jar filename; defaults to the
       Beam Flink runner jar in the Flink lib dir.
      :jobserver_main_class: Main class of the Beam job server driver.
      :service_discover_jar: Service-discovery client jar; defaults to the
       copy shipped in the Flink lib dir.
    Returns:
        artifact_port, expansion_port, job_host, job_port, jobserver.pid
    """
    # BUG FIX: the jar paths used to be computed directly in the def line,
    # i.e. once at module import time — importing this module failed outright
    # when the Flink lib dir was unavailable, and later environment changes
    # were ignored. Resolve them lazily at call time instead.
    if jobserver_jar is None:
        jobserver_jar = os.path.join(
            util.get_flink_lib_dir(),
            "beam-runners-flink-1.9-job-server-2.24.0.jar")
    if service_discover_jar is None:
        service_discover_jar = os.path.join(
            util.get_flink_lib_dir(),
            "service-discovery-client-0.5-SNAPSHOT.jar")
    # Get Flink master URL (flink session cluster) from an ExecutionDTO
    method = constants.HTTP_CONFIG.HTTP_GET
    resource_url = constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_REST_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_PROJECT_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   hopsfs.project_id() + constants.DELIMITERS.SLASH_DELIMITER + \
                   "jobs" + constants.DELIMITERS.SLASH_DELIMITER + \
                   flink_session_name + constants.DELIMITERS.SLASH_DELIMITER + \
                   "executions" + \
                   "?limit=1&offset=0&sort_by=submissionTime:desc"
    response = util.send_request(method, resource_url)
    response_object = response.json()
    flink_master_url = response_object['items'][0]['flinkMasterURL']
    # NOTE(review): random ports are not reserved, so collisions are possible.
    artifact_port = randint(10000, 65000)
    expansion_port = randint(10000, 65000)
    job_port = randint(10000, 65000)
    job_host = socket.getfqdn()
    log_base_path = ""
    if 'LOG_DIRS' in os.environ:
        log_base_path += os.environ['LOG_DIRS'] + "/"

    beam_jobserver_log = log_base_path + "beamjobserver-" + hopsfs.project_name().lower() + "-" + flink_session_name + \
                          "-" + str(job_port) + ".log"
    # BUG FIX: the log file used to be opened twice in "wb" mode (once for
    # stdout, once for stderr); the two handles tracked independent offsets
    # and clobbered each other's writes. Open it once and merge stderr in.
    with open(beam_jobserver_log, "wb") as log_file:
        # Get the hadoop glob classpath and filter out service-discover-client as there is a shading issue with
        # jackson dependency
        jobserver_cp_list = list(
            filter(
                lambda x: "service-discovery" not in x and x.endswith(".jar"),
                util.get_hadoop_classpath_glob().split(":")))
        jobserver_cp_list.extend((service_discover_jar, jobserver_jar))
        jobserver_cp_path = ":".join(jobserver_cp_list).replace("\n", "")

        jobserver = subprocess.Popen(
            [
                "java", "-cp",
                "%s" % jobserver_cp_path, jobserver_main_class,
                "--artifacts-dir=%s" % hopsfs.project_path() + artifacts_dir,
                "--flink-master-url=%s" % flink_master_url,
                "--artifact-port=%d" % artifact_port,
                "--expansion-port=%d" % expansion_port,
                "--job-host=%s" % job_host,
                "--job-port=%d" % job_port
            ],
            stdout=log_file,
            stderr=subprocess.STDOUT,
            preexec_fn=util._on_executor_exit('SIGTERM'))
    global clusters
    clusters.append(flink_session_name)
    global jobserver_host
    jobserver_host = job_host
    global jobserver_port
    jobserver_port = job_port
    return {
        "jobserver_log": beam_jobserver_log,
        "artifact_port": artifact_port,
        "expansion_port": expansion_port,
        "job_host": job_host,
        "job_port": job_port,
        "jobserver.pid": jobserver.pid
    }