# Example #1
# 0
def cleanup(tb_hdfs_path):
    """Delete the given HDFS path, if it exists, and shut down the logger.

    Args:
        tb_hdfs_path: HDFS path to remove. ``None`` or an empty string means
            there is nothing to delete.
    """
    hopshdfs.log('Performing cleanup')
    try:
        handle = hopshdfs.get()
        # Skip the HDFS round-trip entirely when no path was supplied.
        if tb_hdfs_path and handle.exists(tb_hdfs_path):
            handle.delete(tb_hdfs_path)
    finally:
        # Release the logger even when the HDFS lookup/delete fails.
        hopshdfs.kill_logger()
# Example #2
# 0
def end(metric=None):
    """Finalize the running experiment and release its resources.

    The experiment record is finalized with ``util.finalize_experiment`` and
    published with ``util.put_elastic``. Whether or not that succeeds, the
    function then bumps ``elastic_id``, marks the experiment as not running,
    kills the TensorBoard process when ``tensorboard.tb_pid`` is non-zero,
    stores the local TensorBoard logdir when ``tensorboard.local_logdir_bool``
    is set, deletes ``tensorboard.endpoint`` from HDFS when it exists, and
    shuts down the logger.

    Args:
        metric: Optional final metric of the experiment. It is recorded as
            ``str(metric)``; ``None`` records no metric.

    Raises:
        RuntimeError: If no experiment is currently running.
        Exception: Anything raised while finalizing is handed to
            ``exception_handler()`` and then re-raised.
    """
    global running
    global experiment_json
    global elastic_id

    if not running:
        raise RuntimeError(
            "An experiment is not running. Did you forget to call experiment.begin()?"
        )
    try:
        # Compare against None so that a legitimate metric of 0 / 0.0 is kept.
        metric_str = str(metric) if metric is not None else None
        experiment_json = util.finalize_experiment(experiment_json, None,
                                                   metric_str)
        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                         experiment_json)
    except BaseException:
        # Report every failure (including interrupts) and propagate it.
        exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False
        try:
            handle = hopshdfs.get()

            if tensorboard.tb_pid != 0:
                subprocess.Popen(["kill", str(tensorboard.tb_pid)])

            if tensorboard.local_logdir_bool:
                local_tb = tensorboard.local_logdir_path
                util.store_local_tensorboard(local_tb,
                                             tensorboard.events_logdir)

            endpoint = tensorboard.endpoint
            if endpoint and handle.exists(endpoint):
                handle.delete(endpoint)
        finally:
            # The logger must be released even if the teardown above fails.
            hopshdfs.kill_logger()
def _cleanup(tb_hdfs_path):
    """Delete the given HDFS path, if it exists, and shut down the logger.

    Args:
        tb_hdfs_path: HDFS path to remove. ``None`` or an empty string means
            there is nothing to delete.
    """
    try:
        handle = hopshdfs.get()
        # Skip the HDFS round-trip entirely when no path was supplied.
        if tb_hdfs_path and handle.exists(tb_hdfs_path):
            handle.delete(tb_hdfs_path)
    finally:
        # Release the logger even when the HDFS lookup/delete fails.
        hopshdfs.kill_logger()
# Example #4
# 0
    def _wrapper_fun(iter):
        """Fetch a notebook from HDFS, convert it to a script and run it with mpirun.

        Relies on ``app_id``, ``run_id``, ``nb_path``, ``local_logdir`` and
        ``print_log`` being defined outside this function (not shown here).

        Steps, in order:
          * create the HDFS log directories and initialise the logger;
          * register TensorBoard when the executor number is 0;
          * log GPU information;
          * download the notebook at ``nb_path`` into the working directory;
          * convert it with ``jupyter nbconvert --to python``;
          * make the resulting ``.py`` file executable;
          * launch it through ``mpirun`` with ``MPI_NP`` processes, sending
            stdout/stderr to ``mpirun.log`` in the working directory;
          * upload the local TensorBoard logdir when ``local_logdir`` is set;
          * clean up and stop the helper threads.

        Args:
            iter: Iterable whose last element is used as the executor number.

        Raises:
            Exception: If ``mpirun`` exits with a non-zero return code.
        """
        # The last value yielded by the partition iterator wins.
        for i in iter:
            executor_num = i

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'horovod')

        tb_hdfs_path = ''

        pydoop.hdfs.dump('',
                         os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())
        hopshdfs.init_logger()
        hopshdfs.log('Starting Spark executor with arguments')
        if executor_num == 0:
            tb_hdfs_path, _ = tensorboard.register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                0,
                local_logdir=local_logdir)

        gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info(
        )
        hopshdfs.log(gpu_str)
        print(gpu_str)

        # 1. Download notebook file
        fs_handle = hopshdfs.get_fs()

        # NOTE(review): the keyword differs between the two attempts
        # (``flags`` vs ``mode``), presumably to support different versions
        # of the HDFS client -- confirm.
        try:
            fd = fs_handle.open_file(nb_path, flags='r')
        except Exception:
            fd = fs_handle.open_file(nb_path, mode='r')

        # Join in one pass instead of quadratic ``+=`` and always release the
        # HDFS file handle.
        try:
            notebook = ''.join(fd)
        finally:
            fd.close()

        filename = os.path.basename(nb_path)
        with open(filename, "w") as f_nb:
            f_nb.write(notebook)

        # 2. Convert notebook to py file
        jupyter_runnable = os.path.abspath(
            os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
        # An argument list (no shell) keeps filenames with spaces or shell
        # metacharacters intact.
        conversion = subprocess.Popen(
            [jupyter_runnable, 'nbconvert', '--to', 'python', filename],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting; calling wait() first
        # can deadlock once the child fills a pipe buffer.
        stdout, stderr = conversion.communicate()
        print(stdout)
        print(stderr)

        # 3. Make py file runnable
        # splitext() only strips the final extension, so notebook names that
        # contain additional dots still map to the file nbconvert produced.
        py_runnable = os.getcwd() + '/' + os.path.splitext(filename)[0] + '.py'
        st = os.stat(py_runnable)
        os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        # Decide once, so the thread is joined only if it was started.
        monitor_gpus = devices.get_num_gpus() > 0
        if monitor_gpus:
            t_gpus.start()

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        # 4. Run allreduce
        mpi_np = os.environ['MPI_NP']
        tb_logdir = tensorboard.logdir()
        # NOTE(review): this command is run through the shell and the paths
        # are interpolated unquoted; paths containing whitespace or shell
        # metacharacters will break it.
        mpi_cmd = 'HOROVOD_TIMELINE=' + tb_logdir + '/timeline.json' + \
                  ' TENSORBOARD_LOGDIR=' + tb_logdir + \
                  ' mpirun -np ' + str(mpi_np) + \
                  ' -bind-to none -map-by slot ' + \
                  ' -x HOROVOD_TIMELINE ' + \
                  ' -x TENSORBOARD_LOGDIR ' + \
                  ' -x NCCL_DEBUG=INFO ' + \
                  os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable

        # The log file handle is closed as soon as mpirun has exited.
        with open(mpi_logfile_path, 'w') as mpi_logfile:
            mpi = subprocess.Popen(mpi_cmd,
                                   shell=True,
                                   stdout=mpi_logfile,
                                   stderr=mpi_logfile,
                                   preexec_fn=util.on_executor_exit('SIGTERM'))

            t_log = threading.Thread(target=print_log)
            t_log.start()

            mpi.wait()

        if monitor_gpus:
            # Presumably the worker polls ``do_run`` to know when to stop --
            # TODO confirm against devices.print_periodic_gpu_utilization.
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if local_logdir:
            local_tb = tensorboard.local_logdir_path
            pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

        # Same teardown on success and failure; only the final step differs.
        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()

        if return_code != 0:
            raise Exception('mpirun FAILED, look in the logs for the error')

        hopshdfs.kill_logger()