def __init__(self, tmpdir, log, quiet, append, branching_factor,
             skip_logging_configuration, optimizer_iterations):
    super(LocalBackend, self).__init__()

    # Locate the Hail jar: an explicit HAIL_JAR overrides the jar packaged
    # inside the pip wheel.
    spark_home = find_spark_home()
    hail_jar_path = os.environ.get('HAIL_JAR')
    if hail_jar_path is None:
        if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
            hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
        else:
            raise RuntimeError('local backend requires a packaged jar or HAIL_JAR to be set')

    # Launch a py4j gateway JVM with the Hail jar on its classpath, then
    # connect this process to it.
    port = launch_gateway(
        redirect_stdout=sys.stdout,
        redirect_stderr=sys.stderr,
        jarpath=f'{spark_home}/jars/py4j-0.10.9.jar',
        classpath=f'{spark_home}/jars/*:{hail_jar_path}',
        die_on_exit=True)
    self._gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=port, auto_convert=True))
    self._jvm = self._gateway.jvm

    # getattr because `is` (of the `is.hail` package) is a Python keyword.
    hail_package = getattr(self._jvm, 'is').hail

    self._hail_package = hail_package
    self._utils_package_object = scala_package_object(hail_package.utils)

    self._jbackend = hail_package.backend.local.LocalBackend.apply(tmpdir)
    self._jhc = hail_package.HailContext.apply(
        self._jbackend, log, True, append, branching_factor,
        skip_logging_configuration, optimizer_iterations)

    # This has to go after creating the SparkSession. Unclear why.
    # Maybe it does its own patch?
    install_exception_handler()

    from hail.context import version

    py_version = version()
    jar_version = self._jhc.version()
    if jar_version != py_version:
        raise RuntimeError(
            f"Hail version mismatch between JAR and Python library\n"
            f"  JAR:    {jar_version}\n"
            f"  Python: {py_version}")

    self._fs = LocalFS()
    self._logger = None

    if not quiet:
        connect_logger(self._utils_package_object, 'localhost', 12888)
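
# A minimal, self-contained sketch (not part of the backend) of the py4j
# pattern the constructor above relies on: launch a gateway JVM, connect to
# it, and call across the bridge. The py4j calls are real API; the function
# name and the System.getProperty round-trip are ours, purely illustrative.
def _py4j_gateway_demo():
    import sys
    from py4j.java_gateway import GatewayParameters, JavaGateway, launch_gateway

    # launch_gateway starts a JVM running py4j's GatewayServer and returns
    # the ephemeral port it listens on; die_on_exit ties the JVM's lifetime
    # to this Python process.
    port = launch_gateway(redirect_stdout=sys.stdout,
                          redirect_stderr=sys.stderr,
                          die_on_exit=True)
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=port, auto_convert=True))
    # Round-trip a call into the JVM to show the bridge is live.
    print(gateway.jvm.java.lang.System.getProperty('java.version'))
    gateway.shutdown()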
def __init__(self, tmpdir, log, quiet, append, branching_factor,
             skip_logging_configuration, optimizer_iterations):
    SPARK_HOME = os.environ['SPARK_HOME']
    HAIL_HOME = os.environ['HAIL_HOME']

    port = launch_gateway(
        redirect_stdout=sys.stdout,
        redirect_stderr=sys.stderr,
        jarpath=f'{SPARK_HOME}/jars/py4j-0.10.7.jar',
        classpath=f'{SPARK_HOME}/jars/*:{HAIL_HOME}/hail/build/libs/hail-all-spark.jar',
        die_on_exit=True)
    self._gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=port, auto_convert=True))
    self._jvm = self._gateway.jvm

    hail_package = getattr(self._jvm, 'is').hail

    self._hail_package = hail_package
    self._utils_package_object = scala_package_object(hail_package.utils)

    self._jbackend = hail_package.backend.local.LocalBackend.apply(tmpdir)
    self._jhc = hail_package.HailContext.apply(
        self._jbackend, log, True, append, branching_factor,
        skip_logging_configuration, optimizer_iterations)

    # This has to go after creating the SparkSession. Unclear why.
    # Maybe it does its own patch?
    install_exception_handler()

    from hail.context import version

    py_version = version()
    jar_version = self._jhc.version()
    if jar_version != py_version:
        raise RuntimeError(
            f"Hail version mismatch between JAR and Python library\n"
            f"  JAR:    {jar_version}\n"
            f"  Python: {py_version}")

    self._fs = LocalFS()
    self._logger = None

    if not quiet:
        connect_logger(self._utils_package_object, 'localhost', 12888)
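
# The two local-backend constructors above differ only in where the Hail jar
# comes from: HAIL_JAR or the wheel's packaged jar (first), versus a developer
# checkout's Gradle output under HAIL_HOME (second). A hedged sketch of that
# resolution order as a single helper; _resolve_hail_jar is hypothetical and
# not part of Hail.
def _resolve_hail_jar():
    import os
    import pkg_resources

    jar = os.environ.get('HAIL_JAR')
    if jar is not None:  # explicit override, as in the first constructor
        return jar
    hail_home = os.environ.get('HAIL_HOME')
    if hail_home is not None:  # developer build, as in the second constructor
        return f'{hail_home}/hail/build/libs/hail-all-spark.jar'
    if pkg_resources.resource_exists('hail', 'hail-all-spark.jar'):
        return pkg_resources.resource_filename('hail', 'hail-all-spark.jar')
    raise RuntimeError('set HAIL_JAR or HAIL_HOME, or install a wheel that '
                       'packages hail-all-spark.jar')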
def __init__(self, idempotent, sc, spark_conf, app_name, master,
             local, log, quiet, append, min_block_size,
             branching_factor, tmpdir, local_tmpdir,
             skip_logging_configuration, optimizer_iterations):
    if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
        # pip-installed Hail: configure Spark ourselves so the packaged jar
        # reaches the driver and every executor.
        hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
        assert os.path.exists(hail_jar_path), f'{hail_jar_path} does not exist'
        conf = pyspark.SparkConf()

        base_conf = spark_conf or {}
        for k, v in base_conf.items():
            conf.set(k, v)

        jars = [hail_jar_path]

        if os.environ.get('HAIL_SPARK_MONITOR'):
            import sparkmonitor
            jars.append(os.path.join(os.path.dirname(sparkmonitor.__file__), 'listener.jar'))
            conf.set("spark.extraListeners", "sparkmonitor.listener.JupyterSparkMonitorListener")

        conf.set('spark.jars', ','.join(jars))
        conf.set('spark.driver.extraClassPath', ','.join(jars))
        # spark.jars places the jar in each executor's working directory.
        conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')

        if sc is None:
            pyspark.SparkContext._ensure_initialized(conf=conf)
        elif not quiet:
            # A user-supplied SparkContext must already carry this
            # configuration; at this point we can only warn.
            sys.stderr.write(
                'pip-installed Hail requires additional configuration options in Spark referring\n'
                '  to the path to the Hail Python module directory HAIL_DIR,\n'
                '  e.g. /path/to/python/site-packages/hail:\n'
                '    spark.jars=HAIL_DIR/hail-all-spark.jar\n'
                '    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar\n'
                '    spark.executor.extraClassPath=./hail-all-spark.jar')
    else:
        pyspark.SparkContext._ensure_initialized()

    self._gateway = pyspark.SparkContext._gateway
    self._jvm = pyspark.SparkContext._jvm

    # getattr because `is` (of the `is.hail` package) is a Python keyword.
    hail_package = getattr(self._jvm, 'is').hail

    self._hail_package = hail_package
    self._utils_package_object = scala_package_object(hail_package.utils)

    # Unwrap the JVM SparkContext if the user supplied a Python one.
    jsc = sc._jsc.sc() if sc else None

    if idempotent:
        self._jbackend = hail_package.backend.spark.SparkBackend.getOrCreate(
            jsc, app_name, master, local, True, min_block_size, tmpdir, local_tmpdir)
        self._jhc = hail_package.HailContext.getOrCreate(
            self._jbackend, log, True, append, branching_factor,
            skip_logging_configuration, optimizer_iterations)
    else:
        self._jbackend = hail_package.backend.spark.SparkBackend.apply(
            jsc, app_name, master, local, True, min_block_size, tmpdir, local_tmpdir)
        self._jhc = hail_package.HailContext.apply(
            self._jbackend, log, True, append, branching_factor,
            skip_logging_configuration, optimizer_iterations)

    self._jsc = self._jbackend.sc()
    if sc:
        self.sc = sc
    else:
        self.sc = pyspark.SparkContext(
            gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jspark_session = self._jbackend.sparkSession()
    self._spark_session = pyspark.sql.SparkSession(self.sc, self._jspark_session)

    # This has to go after creating the SparkSession. Unclear why.
    # Maybe it does its own patch?
    install_exception_handler()

    from hail.context import version

    py_version = version()
    jar_version = self._jhc.version()
    if jar_version != py_version:
        raise RuntimeError(
            f"Hail version mismatch between JAR and Python library\n"
            f"  JAR:    {jar_version}\n"
            f"  Python: {py_version}")

    self._fs = None
    self._logger = None

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

        connect_logger(self._utils_package_object, 'localhost', 12888)

        self._jbackend.startProgressBar()
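
# Standalone sketch of the Spark wiring the constructor above performs when
# running from a pip-installed jar. The spark.* keys are real Spark
# properties; the jar path and function name are hypothetical. The executor
# classpath deliberately references './hail-all-spark.jar': spark.jars ships
# the jar into each executor's working directory under that name.
def _spark_conf_demo():
    import pyspark

    hail_jar_path = '/path/to/site-packages/hail/hail-all-spark.jar'  # hypothetical
    conf = pyspark.SparkConf()
    conf.set('spark.jars', hail_jar_path)                    # ship the jar with the job
    conf.set('spark.driver.extraClassPath', hail_jar_path)   # driver-side classpath
    conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')
    return pyspark.SparkContext(conf=conf)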