def testPy4jGatewayConnection(integ_spark_conf):
    # Generate SSL material and start a secured Java gateway, then check that both
    # Spark and an H2OContext work over that connection. The helper functions
    # (generateSSLFiles, startJavaGateway, obtainSparkSession) are defined elsewhere
    # in the surrounding test module.
    token = "my_super_secret_token"
    generateSSLFiles(token)
    startJavaGateway(integ_spark_conf, token)
    spark = obtainSparkSession(token)
    spark.sparkContext.parallelize([1, 2, 3, 4, 5]).collect()
    from pysparkling import H2OContext
    hc = H2OContext.getOrCreate()
    print(hc)
    hc.stop()
def __init__(self, sparkSession, useH2O=False, _unit_testing=False):
    """
    "Automagically" find the JDBC URL and establish a connection to the
    current Splice Machine database
    :param sparkSession: the SparkSession object
    :param useH2O: whether or not to start an H2OContext alongside the connection
    :param _unit_testing: whether or not we are unit testing
    """
    PySpliceContext.__init__(self, self.get_jdbc_url(), sparkSession, _unit_testing)
    if useH2O:
        from pysparkling import H2OConf, H2OContext
        h2oConf = H2OConf(sparkSession)
        h2oConf.set_fail_on_unsupported_spark_param_disabled()
        self.hc = H2OContext.getOrCreate(sparkSession, h2oConf)
def __init__(self, sparkSession, useH2O=False, _unit_testing=False):
    """
    Automatically find the JDBC URL and establish a connection to the
    current Splice Machine database
    :param sparkSession: the SparkSession object
    :param useH2O: whether or not to start an H2OContext alongside the connection
    :param _unit_testing: whether or not we are unit testing
    """
    try:
        # requires "import os" at module level
        url = os.environ['JDBC_URL']
        PySpliceContext.__init__(self, url, sparkSession, _unit_testing)
    except Exception as e:
        print(e)
        print(
            'The SpliceMLContext is only for use on the cloud service. '
            'Please import and use the PySpliceContext instead.\n'
            'Usage:\n'
            '\tfrom splicemachine.spark.context import PySpliceContext\n'
            '\tsplice = PySpliceContext(jdbc_url, sparkSession)'
        )
        return
    if useH2O:
        from pysparkling import H2OConf, H2OContext
        h2oConf = H2OConf(sparkSession)
        h2oConf.set_fail_on_unsupported_spark_param_disabled()
        self.hc = H2OContext.getOrCreate(sparkSession, h2oConf)
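# Hedged usage sketch (not part of the original snippets) covering both constructors
# above: it assumes h2o-pysparkling is installed, that a JDBC_URL environment variable
# is set for the cloud-service variant, and that SpliceMLContext is importable from
# the surrounding package (its exact module path is not shown in the snippets, so the
# import is omitted here).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Also starts an H2OContext, available afterwards as splice.hc
splice = SpliceMLContext(spark, useH2O=True)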
def get_or_create_h2o_sparkling(h2o_context_params=None, h2o_log_level="ERROR", spark_session_params=None):
    """
    Gets or initiates an H2O Sparkling Water session.

    :param dict h2o_context_params: The parameters based on which the H2O Sparkling session is to be initialized
    :param string h2o_log_level: The log level of the H2O Sparkling session
    :param dict spark_session_params: The parameters based on which the Spark session is to be initialized
    :return: an H2OContext
    """
    from pysparkling import H2OConf, H2OContext

    # Start SparkSession
    # TODO possibly change this to create the Spark session outside and pass "spark" in as a variable
    from mercury_ml.spark.session import get_or_create_spark_session
    if not spark_session_params:
        spark_session_params = {}
    spark = get_or_create_spark_session(**spark_session_params)

    # Start H2OContext
    h2o_conf = H2OConf(spark)
    h2o_conf.set_h2o_node_log_level(h2o_log_level)
    if not h2o_context_params:
        h2o_context_params = {}
    if h2o_context_params.get("auth"):
        # requires h2o-pysparkling >= 2.2.28
        h2o_context_params["auth"] = tuple(h2o_context_params["auth"])

    h2o_context = H2OContext.getOrCreate(spark, conf=h2o_conf, **h2o_context_params)

    return h2o_context
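# Hedged usage sketch for the function above (illustrative, not from the original
# source): the "auth" credentials are placeholder values, and spark_session_params is
# left empty because the keys accepted by get_or_create_spark_session are not shown
# in the snippet.
hc = get_or_create_h2o_sparkling(
    h2o_context_params={"auth": ["h2o_user", "h2o_password"]},  # coerced to a tuple internally
    h2o_log_level="WARN",
    spark_session_params={},
)
print(hc)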
from h2o.estimators.xgboost import H2OXGBoostEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from pyspark.sql import SparkSession
from pysparkling import H2OContext

from user_definition import *

ss = SparkSession.builder.config('spark.ext.h2o.log.level', 'FATAL').getOrCreate()
ss.sparkContext.setLogLevel('OFF')
hc = H2OContext.getOrCreate()

# step 1
# create spark dataframe
train_df = ss.read.parquet(train_folder).repartition(8).cache()
valid_df = ss.read.parquet(valid_folder).repartition(8).cache()

# convert spark dataframe to h2oFrame
train_h2o = hc.asH2OFrame(train_df, "train")
valid_h2o = hc.asH2OFrame(valid_df, "valid")

# convert label column to categorical datatype
train_h2o['label'] = train_h2o['label'].asfactor()
valid_h2o['label'] = valid_h2o['label'].asfactor()

for i in train_h2o.types:  # dict
    print(f"{i} - {train_h2o.types[i]}")
print('')

# step 2
predictors = train_h2o.names[:]
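# Hedged continuation of step 2 (not in the original snippet): drop the label from the
# predictor list and fit one of the estimators imported above; every hyperparameter
# value below is an assumption chosen for illustration.
predictors.remove('label')

xgb = H2OXGBoostEstimator(ntrees=50, max_depth=6, seed=42)
xgb.train(x=predictors, y='label',
          training_frame=train_h2o, validation_frame=valid_h2o)
print(xgb.model_performance(valid_h2o))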
sc.version

# In[1]:

sc.addPyFile("/Users/dt216661/sparkling-water-2.4.5/py/build/dist/h2o_pysparkling_2.4-2.4.5.zip")

# In[2]:

import h2o
from pysparkling import H2OContext

h2o.__version__

hc = H2OContext.getOrCreate(spark)
print(hc)

# # 1. Start H2O cluster inside the Spark environment

# In[4]:

from pysparkling import *
hc = H2OContext.getOrCreate(spark)

# # 2. Parse the data using H2O and convert it to a Spark DataFrame

# In[6]:
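# Hedged sketch of the cell implied by heading 2 (the original notebook cell is empty
# here): the file path is a placeholder, not a path from the original notebook.
data_h2o = h2o.import_file("path/to/data.csv")   # parse the data with H2O
data_spark = hc.asSparkFrame(data_h2o)           # convert the H2OFrame to a Spark DataFrame
data_spark.printSchema()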